### Support Vector Machine Model

In [1]:
#Importing modules
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, f1_score

In [None]:
# Load the pre-cleaned training split from disk
train_df = pd.read_csv(filepath_or_buffer='full_train_cleaned.csv')

Now we under-sample the majority class (Fake News) so that both categories contain the same number of articles.


In [3]:
# Balance the classes by down-sampling every category to the size of the
# smallest one, so each label ends up with the same number of rows.
min_count = train_df['broad_category'].value_counts().min()

# Sample each group explicitly instead of using groupby(...).apply(...):
# apply() operating on a frame that still contains the grouping column is
# deprecated in recent pandas releases and emits a warning. Each group is
# still sampled with random_state=0 and the groups are concatenated in
# group-key order, with a fresh 0..n-1 index.
train_df = pd.concat(
    [
        group.sample(n=min_count, random_state=0)
        for _, group in train_df.groupby('broad_category')
    ],
    ignore_index=True,
)

  train_df = train_df.groupby('broad_category').apply(lambda x: x.sample(n=min_count, random_state=0)).reset_index(drop=True)


In [4]:
# Sanity check: number of rows per category after balancing
category_counts = train_df['broad_category'].value_counts()
category_counts

broad_category
Fake News        359703
Reliable News    359703
Name: count, dtype: int64

In [None]:
# Load the pre-cleaned validation split from disk
val_df = pd.read_csv(filepath_or_buffer='full_val_cleaned.csv')

In [6]:
# Training features (the 'content' column) and labels (the 'broad_category'
# column) used to fit the SVM classifier.
x_train = train_df['content']
y_train = train_df['broad_category']

In [7]:
# Validation features (the 'content' column) and labels (the 'broad_category' column)
x_val = val_df['content']
y_val = val_df['broad_category']

`x_train` and `x_val` contain the article text from the `content` column.
Using `TfidfVectorizer`, we can convert every text into a row of weighted numbers based on TF-IDF (Term Frequency-Inverse Document Frequency).
- Term Frequency (TF): How often a word appears in a given text.
- Inverse Document Frequency (IDF): How rarely a word appears across the other texts.

In [8]:
# Configure the TF-IDF vectorizer from scikit-learn:
#   ngram_range=(1, 2) -> single words and two-word phrases as features
#   min_df=2           -> minimum document frequency for a term to be kept
#   max_features=10000 -> upper bound on the vocabulary size
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=10000)


In [9]:
# Replace missing articles with an empty string *before* casting to str:
# astype(str) turns NaN into the literal text "nan", after which fillna("")
# has nothing left to fill and "nan" would be vectorized as a real token.
x_train = x_train.fillna("").astype(str)
x_val = x_val.fillna("").astype(str)

# Fit the vectorizer on the training data and transform it in one step
x_train_vec = vectorizer.fit_transform(x_train)

# Transform the validation data with the already-fitted vectorizer (no refit)
x_val_vec = vectorizer.transform(x_val)

In [10]:
# Dimensions of the training matrix: (number of articles, number of TF-IDF features)
train_matrix_shape = x_train_vec.shape
print(train_matrix_shape)


(719406, 10000)


In [11]:
# Inspect the sparse TF-IDF row of the first training article
first_article_vec = x_train_vec[0]
print(first_article_vec)

  (0, 7024)	0.06152917415776399
  (0, 1797)	0.02392755462132701
  (0, 1722)	0.21408060432624557
  (0, 6176)	0.06361457629438895
  (0, 3493)	0.30027179832589
  (0, 109)	0.01980118420460243
  (0, 4370)	0.038963893072997635
  (0, 4244)	0.06587552498624634
  (0, 5950)	0.06096564327138656
  (0, 4431)	0.03821256817184202
  (0, 8247)	0.03630210715203922
  (0, 2629)	0.04080996430578347
  (0, 2878)	0.04027171986403256
  (0, 4273)	0.03977341134461181
  (0, 3257)	0.03678396421993639
  (0, 9387)	0.04354325669276316
  (0, 7736)	0.03821653225679985
  (0, 9606)	0.038336712917555575
  (0, 6763)	0.05815728905701774
  (0, 331)	0.038979671277988515
  (0, 392)	0.16107563889665663
  (0, 2822)	0.03705272465034596
  (0, 8620)	0.031246940892405022
  (0, 9202)	0.3076046843819442
  (0, 9141)	0.4161047718821851
  :	:
  (0, 6689)	0.023030509894982662
  (0, 4809)	0.021099248590601034
  (0, 1340)	0.023424860710438887
  (0, 160)	0.026224897050131507
  (0, 5870)	0.0311901503721837
  (0, 9311)	0.026536226276354123
  (

Explanation of the output:
- Row (0): We are looking at the first text in the dataset.
- Column (e.g. 7024): The index of a unique term (a word or a two-word phrase) in the vocabulary.
- Value: The TF-IDF weight; a high weight means that the term is more important in this text than in the rest of the corpus.

In [12]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent. With
# loss='hinge' this is a linear SVM; penalty='l1' with a small alpha applies
# a light L1 regularisation.
sgdmodel = SGDClassifier(
    loss='hinge',
    alpha=1e-6,
    max_iter=100000,
    n_jobs=-1,
    penalty='l1',
    # SGD training is stochastic; fix the seed (same value as used for the
    # under-sampling) so that re-running the notebook reproduces the results.
    random_state=0,
)

# Fit on the TF-IDF training matrix; the fitted estimator is the cell output
sgdmodel.fit(x_train_vec, y_train)


In [13]:
# Predict the validation labels and print per-class precision / recall / F1
y_val_pred = sgdmodel.predict(x_val_vec)
val_report = classification_report(y_val, y_val_pred)
print(val_report)

               precision    recall  f1-score   support

    Fake News       0.88      0.88      0.88     45501
Reliable News       0.88      0.88      0.88     44867

     accuracy                           0.88     90368
    macro avg       0.88      0.88      0.88     90368
 weighted avg       0.88      0.88      0.88     90368



In [14]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix for the validation set (true labels vs. predictions)
cm = confusion_matrix(y_true=y_val, y_pred=y_val_pred)
print("Confusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[39892  5609]
 [ 5311 39556]]


### Grid search

In [15]:
# Candidate regularisation strengths for the grid search, between 1e-7 and 1e-6
param_grid = dict(
    alpha=[1e-7, 10 ** -6.5, 10 ** -6.25, 10 ** -6.75, 1e-6],
)

In [17]:
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated search over `param_grid`, starting from the
# `sgdmodel` configuration and scoring each candidate by weighted F1.
grid = GridSearchCV(
    estimator=sgdmodel,
    param_grid=param_grid,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=2,
)

In [19]:
# Run the cross-validated search on the training matrix (this is the expensive step)
grid.fit(X=x_train_vec, y=y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [20]:
# Report the winning alpha and its mean cross-validated score
best_alpha = grid.best_params_['alpha']
best_cv_score = grid.best_score_
print("Best alpha:", best_alpha)
print("Best score:", best_cv_score)

Best alpha: 1e-06
Best score: 0.8785098588841181


### Testing the model on the test set

In [None]:
# Load the pre-cleaned test split from disk
test_df = pd.read_csv(filepath_or_buffer='full_test_cleaned.csv')

In [22]:
# Test features (the 'content' column) and labels (the 'broad_category' column)
x_test = test_df['content']
y_test = test_df['broad_category']

In [23]:
# Fill missing text before casting: astype(str) would turn NaN into the
# literal string "nan", leaving nothing for fillna("") to replace.
x_test = x_test.fillna("").astype(str)
# Transform with the vectorizer fitted on the training data
x_test_vectorized = vectorizer.transform(x_test)

In [24]:
# Evaluate on the test set. Predictions come from `sgdmodel` (not from the
# grid-search object `grid`).
y_test_pred = sgdmodel.predict(x_test_vectorized)
print("Test set performance")
test_report = classification_report(y_test, y_test_pred)
print(test_report)

Test set performance
               precision    recall  f1-score   support

    Fake News       0.88      0.88      0.88     45665
Reliable News       0.88      0.88      0.88     44703

     accuracy                           0.88     90368
    macro avg       0.88      0.88      0.88     90368
 weighted avg       0.88      0.88      0.88     90368



In [25]:
# Confusion matrix for the test set (true labels vs. predictions)
cm = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
print("Confusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[40050  5615]
 [ 5372 39331]]


### Testing Liar data

In [None]:
# Load the pre-cleaned LIAR dataset from disk
liar_df = pd.read_csv(filepath_or_buffer='liar_cleaned.csv')

In [27]:
# LIAR features (the 'Statement' column) and labels (the 'broad_category' column)
x_liar = liar_df['Statement']
y_liar = liar_df['broad_category']

In [28]:
# Fill missing statements before casting: astype(str) would turn NaN into
# the literal string "nan", leaving nothing for fillna("") to replace.
x_liar = x_liar.fillna("").astype(str)
# Transform with the vectorizer fitted on the training data
x_liar_vectorized = vectorizer.transform(x_liar)

In [29]:
# Evaluate `sgdmodel` on the LIAR statements. The printed header names the
# LIAR data explicitly so this report cannot be mistaken for the one
# produced for the test split.
y_liar_pred = sgdmodel.predict(x_liar_vectorized)
print("LIAR set performance")
print(classification_report(y_liar, y_liar_pred))

Test set performance
               precision    recall  f1-score   support

    Fake News       0.67      0.45      0.54      6602
Reliable News       0.37      0.60      0.46      3638

     accuracy                           0.50     10240
    macro avg       0.52      0.52      0.50     10240
 weighted avg       0.57      0.50      0.51     10240



In [31]:
# Confusion matrix for the LIAR data (true labels vs. predictions)
cm = confusion_matrix(y_true=y_liar, y_pred=y_liar_pred)
print("Confusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[2989 3613]
 [1473 2165]]
