In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import re
import string

In [2]:
fake_news = pd.read_csv('DataSet_Misinfo_FAKE.csv')
not_fake_news = pd.read_csv('DataSet_Misinfo_TRUE.csv')
propaganda = pd.read_csv('EXTRA_RussianPropagandaSubset.csv')

In [3]:
fake_news['class'] = 0
not_fake_news['class'] = 1
propaganda['class'] = 2

In [4]:
data_merge = pd.concat([fake_news, not_fake_news, propaganda], axis = 0)

In [5]:
data_merge.columns

Index(['Unnamed: 0', 'text', 'class'], dtype='object')

In [6]:
data = data_merge.drop('Unnamed: 0', axis=1)

In [7]:
data.columns

Index(['text', 'class'], dtype='object')

In [8]:
data.isnull().sum()

text     32
class     0
dtype: int64

In [9]:
data = data.dropna(how='any',axis=0) 

In [10]:
data.isnull().sum()

text     0
class    0
dtype: int64

In [11]:
data = data.sample(frac=1)

In [12]:
data.reset_index(inplace = True)

In [15]:
data.head()

Unnamed: 0,index,text,class
0,2910,Germany doubts that Russia is responsible for ...,2
1,839,Had there been a vote or a discussion [at WADA...,2
2,1038,An important feature of all-Russian history is...,2
3,7421,This is what Republicans have created by embra...,0
4,3069,U.S. House of Representatives Speaker Paul Rya...,1


In [16]:
data.drop('index', axis=1, inplace=True)

In [17]:
data.columns

Index(['text', 'class'], dtype='object')

In [18]:
def cleanNews(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [19]:
data['text'] = data['text'].apply(cleanNews)

In [20]:
x = data['text']
y = data['class']

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3)

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [23]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(max_iter=1000)
LR.fit(xv_train,y_train)

In [24]:
pred_lr = LR.predict(xv_test)
LR.score(xv_test, y_test)

0.8398888888888889

In [25]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85      4591
           1       0.93      0.91      0.92      3664
           2       0.37      0.26      0.31       745

    accuracy                           0.84      9000
   macro avg       0.71      0.68      0.69      9000
weighted avg       0.83      0.84      0.83      9000



In [26]:
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
DT.fit(xv_train,y_train)

In [27]:
pred_dt = DT.predict(xv_test)
DT.score(xv_test, y_test)

0.771

In [28]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       0.77      0.79      0.78      4591
           1       0.85      0.84      0.85      3664
           2       0.31      0.29      0.30       745

    accuracy                           0.77      9000
   macro avg       0.65      0.64      0.64      9000
weighted avg       0.77      0.77      0.77      9000



In [29]:
from sklearn.ensemble import GradientBoostingClassifier
GB = GradientBoostingClassifier(random_state = 0)
GB.fit(xv_train,y_train)

In [30]:
pred_gb = GB.predict(xv_test)
GB.score(xv_test, y_test)

0.833

In [31]:
print(classification_report(y_test, pred_gb))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84      4591
           1       0.91      0.89      0.90      3664
           2       0.44      0.28      0.34       745

    accuracy                           0.83      9000
   macro avg       0.72      0.68      0.70      9000
weighted avg       0.82      0.83      0.83      9000



In [32]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(random_state = 0, n_jobs=-1)
RF.fit(xv_train,y_train)

In [33]:
pred_rf = RF.predict(xv_test)
RF.score(xv_test, y_test)

0.8098888888888889

In [34]:
print(classification_report(y_test, pred_rf))

              precision    recall  f1-score   support

           0       0.78      0.87      0.82      4591
           1       0.91      0.87      0.89      3664
           2       0.28      0.14      0.19       745

    accuracy                           0.81      9000
   macro avg       0.66      0.63      0.63      9000
weighted avg       0.79      0.81      0.80      9000



In [35]:
from sklearn.svm import SVC
SV = SVC()
SV.fit(xv_train,y_train)

In [None]:
pred_sv = SV.predict(xv_test)
SV.score(xv_test, y_test)

In [None]:
print(classification_report(y_test, pred_sv))

In [None]:
from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()
MNV.fit(xv_test, y_test)

In [None]:
pred_mnb = MNB.predict(xv_test)
MNB.score(xv_test, y_test)

In [None]:
print(classification_report(y_test, pred_mnb))

In [None]:
def output_label(n):
    if n==0:
        return "Fake News"
    elif n==1:
        return "Not a Fake News"
    else:
        return "Propaganda"

In [37]:
def manual_testing(news):
    testing_news = {"text": [news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test['text'] = new_def_test['text'].apply(cleanNews)
    new_x_test = new_def_test['text']
    new_xv_test = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_xv_test)
    pred_DT = DT.predict(new_xv_test)
    pred_GB = GB.predict(new_xv_test)
    pred_RF = RF.predict(new_xv_test)
    pred_SV = SV.predict(new_xv_test)
    pred_MNB = MNB.predict(new_xv_test)
    return print(f"LR Prediction: {output_label(pred_LR[0])}\nDT Prediction: {output_label(pred_DT[0])}\nGB Prediction: {output_label(pred_GB[0])}\nRF Prediction: {output_label(pred_RF[0])}\nSVM Prediction: {output_label(pred_SV[0])}\nMNB Prediction: {output_label(pred_MNB[0])}\n")

In [None]:
news = input()
manual_testing(news)