Sources:

1. https://www.kaggle.com/code/ciobeni/fake-news-prediction
2. https://excellencetechnologies.in/blog/bag-of-words-count-vectorizer/
3. https://www.analyticsvidhya.com/blog/2021/08/a-friendly-guide-to-nlp-bag-of-words-with-python-example/
4. https://www.tensorflow.org/tutorials/text/word2vec

# Importing relevant libraries

In [1]:
import os.path # Used for finding the path of the running script
import nltk # Used for preprocessing
import string # Used for preprocessing
import numpy as np # Used for saving
import pandas as pd # Used for dataframe

from nltk.corpus import stopwords # Used for preprocessing
from nltk.stem   import WordNetLemmatizer # Used for preprocessing
from nltk.tokenize import word_tokenize # Used for preprocessing
from sklearn.model_selection import train_test_split # Used for splitting the data
from sklearn.feature_extraction.text import CountVectorizer # Used for CountVectorizer feature extraction 
from sklearn.svm import SVC # Used for SVM modeling
from sklearn.model_selection import cross_validate # Used for cross-validation
from sklearn.metrics import classification_report # Used for model evaluation

import joblib # For saving the model

# Loading the data

In [3]:
# Read the WELFake CSV (its first column becomes the index) and discard
# every row that has a missing value in any column.

df = pd.read_csv('WELFake_Dataset.csv', index_col=0).dropna()

# Preprocessing the data

In [4]:
# Preprocessing

# Translation table that deletes every character found in string.punctuation
# or string.digits in a single str.translate pass.
_STRIP_TABLE = str.maketrans('', '', string.punctuation + string.digits)

# Filled on first use, so the NLTK resources are only touched once text is processed.
_NLP_RESOURCES = {}


def _nlp_resources():
    """Return the shared (lemmatiser, stop-word set) pair, building it once."""
    if not _NLP_RESOURCES:
        # Build both objects before storing them so a failure leaves the cache empty.
        loaded = {
            'lemmatiser': WordNetLemmatizer(),
            'stopwords': frozenset(stopwords.words('english')),
        }
        _NLP_RESOURCES.update(loaded)
    return _NLP_RESOURCES['lemmatiser'], _NLP_RESOURCES['stopwords']


def preprocess_text(text):
    """Turn a raw string into a list of lemmatised, stop-word-free tokens.

    Steps, in order:
      1. remove every character in ``string.punctuation`` and ``string.digits``;
      2. tokenise the remainder with NLTK's English ``word_tokenize``;
      3. lemmatise each token with ``WordNetLemmatizer`` (token case is kept);
      4. drop tokens whose lower-cased form is an English stop word.

    The lemmatiser and the stop-word set are created once and reused across
    calls; stop words are held in a frozenset so each membership test is a
    hash lookup rather than a scan of the whole list.

    Parameters
    ----------
    text : str
        The raw document (for example one news title).

    Returns
    -------
    list of str
        The surviving tokens, in their original order and original case.
    """
    cleaned = text.translate(_STRIP_TABLE)
    lemmatiser, stop_words = _nlp_resources()
    lemmas = (lemmatiser.lemmatize(token) for token in word_tokenize(cleaned, 'english'))
    return [lemma for lemma in lemmas if lemma.lower() not in stop_words]

In [5]:
# Take the titles as the input feature and the label column as the target,
# then hold out 20% of the rows as a test set (fixed seed => repeatable split).

X, y = df['title'], df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

# Feature Extraction: CountVectorizer

In [30]:
# Fitting the CountVectorizer and building the training count matrix

X_train = X_train.fillna('')
vectorizer = CountVectorizer(analyzer=preprocess_text)
# fit_transform learns the vocabulary and builds the matrix in one pass, so
# preprocess_text runs once per title instead of twice (fit, then transform).
matrix = vectorizer.fit_transform(X_train) # Matrix is the bag-of-words (token count) feature extraction for X train

# Building the model: SVM

In [32]:
# 5-fold cross-validation on the training matrix, recording F1 and accuracy
# for every fold; n_jobs=-1 lets scikit-learn use all available processors.
svm = SVC()
scores = cross_validate(
    estimator=svm,
    X=matrix,
    y=y_train,
    scoring=['f1', 'accuracy'],
    cv=5,
    n_jobs=-1,
)

In [33]:
# Summary statistics (count, mean, std, quartiles) of the per-fold timings and scores
cv_results = pd.DataFrame(scores)
cv_results.describe()

Unnamed: 0,fit_time,score_time,test_f1,test_accuracy
count,5.0,5.0,5.0,5.0
mean,595.141082,32.972316,0.948074,0.946129
std,57.98988,8.708964,0.001842,0.001923
min,491.602371,17.39941,0.946164,0.944085
25%,615.798861,36.551979,0.946722,0.944697
50%,619.65445,36.700785,0.947831,0.94609
75%,624.274728,37.097106,0.948854,0.946794
max,624.374998,37.112299,0.9508,0.948978


# Training the model

In [34]:
# Fit a fresh SVC on the complete training split (no folds held out this time)
svm = SVC() # New, unfitted estimator
svm.fit(X=matrix, y=y_train) # Learn from the count features and their labels

In [37]:
# Persist the fitted classifier to disk with compression level 3.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
joblib.dump(
    value=svm,
    filename='/Users/pietervanbrakel/Thesis/SVM_title.pkl',
    compress=3,
)

['/Users/pietervanbrakel/Thesis/SVM_title.pkl']

In [39]:
# Restore the classifier from the pickle file on disk.
# NOTE(review): joblib.load unpickles arbitrary objects - only load trusted files.
svm = joblib.load(
    filename='/Users/pietervanbrakel/Thesis/SVM_title.pkl',
)

# Using test data

In [41]:
# Vectorise the held-out titles with the already-fitted vectorizer
# (transform only, no refitting), then predict their labels.
X_test = X_test.fillna('') # Replace any missing title with an empty string
test_matrix = vectorizer.transform(raw_documents=X_test)
y_pred = svm.predict(X=test_matrix)

In [44]:
# Save the predicted labels (not the test feature matrix) so that the file
# content matches its "y_pred" name.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
joblib.dump(y_pred,'/Users/pietervanbrakel/Thesis/SVM_title_y_pred.pkl',compress=3)

['/Users/pietervanbrakel/Thesis/SVM_title_y_pred.pkl']

In [42]:
# Per-class precision / recall / F1 of the predictions against the true test labels
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.96      0.94      0.95      7025
           1       0.94      0.96      0.95      7283

    accuracy                           0.95     14308
   macro avg       0.95      0.95      0.95     14308
weighted avg       0.95      0.95      0.95     14308

