In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import re
import string

#Data Preprocessing and cleaning

In [20]:
!unzip WELFake_Dataset.csv.zip
data = pd.read_csv('WELFake_Dataset.csv')

Archive:  WELFake_Dataset.csv.zip
replace WELFake_Dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: None


In [21]:
data = data.dropna()

In [22]:
#Function for cleaning the text
def clean(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [25]:
# Combining title and text
data['content'] = data['title'] + ' ' + data['text']

# Drop rows with missing values
data.dropna(inplace=True)

# Clean the data
data['content'] = data['content'].apply(clean)

In [26]:
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
x_train, x_test, y_train, y_test = train_test_split(data['content'], data['label'], test_size=0.2, random_state=42)

In [39]:
# Vectorizing the data with TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
xv_train = vectorizer.fit_transform(x_train)
xv_test = vectorizer.transform(x_test)

In [25]:
## Vectorizing the data with Bag-of-Words
#count_vectorizer = CountVectorizer(stop_words='english')
#count_train = count_vectorizer.fit_transform(x_train)
#count_test = count_vectorizer.transform(x_test)

# Models


###Linear Regression

In [28]:
model_LR = LogisticRegression()
model_LR.fit(xv_train, y_train)

In [49]:
pred_LR = model_LR.predict(xv_test)
#model_LR.score(xv_test, y_test)
print(classification_report(y_test, pred_LR))
print(accuracy_score(y_test, pred_LR))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95      7081
           1       0.94      0.96      0.95      7227

    accuracy                           0.95     14308
   macro avg       0.95      0.95      0.95     14308
weighted avg       0.95      0.95      0.95     14308

0.949538719597428


### Decision Tree

In [30]:
model_DT = DecisionTreeClassifier()
model_DT.fit(xv_train, y_train)

In [46]:
pred_DT = model_DT.predict(xv_test)
model_DT.score(xv_test, y_test)
print(classification_report(y_test, pred_DT))
#print(confusion_matrix(y_test, pred_DT))
print(accuracy_score(y_test, pred_DT))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94      7081
           1       0.93      0.95      0.94      7227

    accuracy                           0.94     14308
   macro avg       0.94      0.94      0.94     14308
weighted avg       0.94      0.94      0.94     14308

0.940592675426335


###Naive Bayes Multinomial Classifier

In [32]:
model_nb_classifier = MultinomialNB()
model_nb_classifier.fit(xv_train, y_train)

In [33]:
pred_nb_classifier = model_nb_classifier.predict(xv_test)

In [34]:
print(classification_report(y_test, pred_nb_classifier))
print(accuracy_score(y_test, pred_nb_classifier))

              precision    recall  f1-score   support

           0       0.88      0.85      0.86      7081
           1       0.86      0.88      0.87      7227

    accuracy                           0.87     14308
   macro avg       0.87      0.87      0.87     14308
weighted avg       0.87      0.87      0.87     14308

0.8667878110148168


### SVM

In [None]:
model_SVM = SVC()
model_SVM.fit(xv_train, y_train)

#Test Models on different dataset


In [35]:
#Read the new daset
new_data_fake = pd.read_csv('Fake.csv')
new_data_true = pd.read_csv('True.csv')

new_data_fake['label'] = 0
new_data_true['label'] = 1

#merge fake and true together
new_data_merge = pd.concat([new_data_fake, new_data_true], axis=0)

#Drop unnecessary columns
new_data = new_data_merge.drop(['subject', 'date'], axis=1)

#Combine title and text
new_data['content'] = data['title'] + ' ' + data['text']

#Drop title
new_data = data.drop(['title'], axis=1)

In [36]:
new_data['content'] = new_data['content'].apply(clean)


In [44]:
def test_models_on_new_dataset(dataset, models):

    x_new = dataset['content']
    y_new = dataset['label']

    #Using TF-IDF Vectorizer (fit it on the training data, and transform the new data)
    xv_new = vectorizer.transform(x_new)

    # Evaluate each model
    results = {}
    for model_name, model in models.items():
        predictions = model.predict(xv_new)
        accuracy = accuracy_score(y_new, predictions)
        results[model_name] = accuracy

    return results

In [45]:
models = {
    "Linear Regression": model_LR,
    "Decision Tree": model_DT,
    "Naive Bayes": model_nb_classifier
}

results = test_models_on_new_dataset(new_data, models)
print(results)

{'Linear Regression': 0.960607797363602, 'Decision Tree': 0.9881180368201071, 'Naive Bayes': 0.8769727553573675}
