### *CS579 ONLINE SOCIAL NETWORK ANALYSIS* 
*FAKE NEWS CLASSIFICATION*

By:  
*   Mohan Babu Kunchala (A20524765)
*   Sanjitha Reddy Pathuri (A20524383) 


In [1]:
#Import the necessary libraries to build a classifier: pandas, numpy, sklearn, and nltk
import pandas as pd
import numpy as np
import scipy.sparse as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import string
import nltk 
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the training data
train_data = pd.read_csv('train.csv')

In [3]:
#Remove any duplicate rows, missing values, or irrelevant columns:
train_data.drop_duplicates(inplace=True) 
train_data.dropna(inplace=True) 
train_data.drop(columns=['id', 'tid1', 'tid2'], inplace=True)

In [4]:
# Defining a function to clean the text: Convert the text data into lowercase, remove any punctuations, and apply stemming
stopwords = set(nltk.corpus.stopwords.words('english')) 
def clean_text(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text) 
    tokens = [token for token in tokens if token.isalpha() and token not in stopwords] 
    text = ' '.join(tokens) 
    text = ''.join(c for c in text if c.isalnum() or c.isspace())
    return text

In [5]:
# Preprocess the data by applying the clean_text function n each pair f texts
train_data['title1_en_clean'] = train_data['title1_en'].apply(clean_text)
train_data['title2_en_clean'] = train_data['title2_en'].apply(clean_text)

In [6]:
# Vectorize the text using TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
X1 = tfidf_vectorizer.fit_transform(train_data['title1_en_clean'])
X2 = tfidf_vectorizer.transform(train_data['title2_en_clean'])
X = sp.hstack((X1, X2))
y = train_data['label']

In [8]:
# Split the preprocessed data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Train different classifiers and evaluate accuracy of each classifier on the validation set

In [10]:
### Logical Regression

# Train the classifier
lr_classifier = LogisticRegression(random_state=42, max_iter=1000)
lr_classifier.fit(X_train, y_train)


In [11]:
# Evaluate the model
y_pred_lr = lr_classifier.predict(X_val)
accuracy_lr = accuracy_score(y_val, y_pred_lr)
precision_lr = precision_score(y_val, y_pred_lr, average='weighted')
recall_lr = recall_score(y_val, y_pred_lr, average='weighted')
f1_score_lr = f1_score(y_val, y_pred_lr, average='weighted')
print('## Logical Regression ##')
print('Accuracy:', accuracy_lr)
print('Precision:', precision_lr)
print('Recall:', recall_lr)
print('F-1 Score:', f1_score_lr)
print('Confusion Matrix:')
print(confusion_matrix(y_val,y_pred_lr))
print('Classification Report:')
print(classification_report(y_val,y_pred_lr))


## Logical Regression ##
Accuracy: 0.8051823977851001
Precision: 0.8006534791283743
Recall: 0.8051823977851001
F-1 Score: 0.797999093990401
Confusion Matrix:
[[ 9394     2  5417]
 [   26   367   928]
 [ 3526    93 31536]]
Classification Report:
              precision    recall  f1-score   support

      agreed       0.73      0.63      0.68     14813
   disagreed       0.79      0.28      0.41      1321
   unrelated       0.83      0.90      0.86     35155

    accuracy                           0.81     51289
   macro avg       0.78      0.60      0.65     51289
weighted avg       0.80      0.81      0.80     51289



In [12]:
### Naive Bayes Classifier

# Train the classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)


In [13]:
# Evaluate the model
y_pred_nb = nb_classifier.predict(X_val)
accuracy_nb = accuracy_score(y_val, y_pred_nb)
precision_nb = precision_score(y_val, y_pred_nb, average='weighted')
recall_nb = recall_score(y_val, y_pred_nb, average='weighted')
f1_score_nb = f1_score(y_val, y_pred_nb, average='weighted')
print('## Naive Bayes classifier ##')
print('Accuracy:', accuracy_nb)
print('Precision:', precision_nb)
print('Recall:', recall_nb)
print('F-1 Score:', f1_score_nb)
print('Confusion Matrix:')
print(confusion_matrix(y_val,y_pred_nb))
print('Classification Report:')
print(classification_report(y_val,y_pred_nb))

## Naive Bayes classifier ##
Accuracy: 0.7679034490826493
Precision: 0.7602843815177308
Recall: 0.7679034490826493
F-1 Score: 0.7581086317566167
Confusion Matrix:
[[ 8661    27  6125]
 [   12   174  1135]
 [ 4546    59 30550]]
Classification Report:
              precision    recall  f1-score   support

      agreed       0.66      0.58      0.62     14813
   disagreed       0.67      0.13      0.22      1321
   unrelated       0.81      0.87      0.84     35155

    accuracy                           0.77     51289
   macro avg       0.71      0.53      0.56     51289
weighted avg       0.76      0.77      0.76     51289



In [14]:
### Random forest

# Train the classifier
rf_classifier = RandomForestClassifier(n_estimators=25, random_state=42)
rf_classifier.fit(X_train, y_train)


In [15]:
# Evaluate the model
y_pred_rf = rf_classifier.predict(X_val)
accuracy_rf = accuracy_score(y_val, y_pred_rf)
precision_rf = precision_score(y_val, y_pred_rf, average='weighted')
recall_rf = recall_score(y_val, y_pred_rf, average='weighted')
f1_score_rf = f1_score(y_val, y_pred_rf, average='weighted')
print('## Random Forest ##')
print('Accuracy:', accuracy_rf)
print('Precision:', precision_rf)
print('Recall:', recall_rf)
print('F-1 Score:', f1_score_rf)
print('Confusion Matrix:')
print(confusion_matrix(y_val,y_pred_rf))
print('Classification Report:')
print(classification_report(y_val,y_pred_rf))

## Random Forest ##
Accuracy: 0.8484275380685917
Precision: 0.8459677925732622
Recall: 0.8484275380685917
F-1 Score: 0.8415312044227989
Confusion Matrix:
[[10140    12  4661]
 [   10   431   880]
 [ 2107   104 32944]]
Classification Report:
              precision    recall  f1-score   support

      agreed       0.83      0.68      0.75     14813
   disagreed       0.79      0.33      0.46      1321
   unrelated       0.86      0.94      0.89     35155

    accuracy                           0.85     51289
   macro avg       0.82      0.65      0.70     51289
weighted avg       0.85      0.85      0.84     51289



In [16]:
### Linear Support Vector Classifier (SVC)

# Train the classifier
svc_classifier = LinearSVC(random_state=42)
svc_classifier.fit(X_train, y_train)

In [18]:
# Evaluate the model
y_pred_svc = svc_classifier.predict(X_val)
accuracy_svc = accuracy_score(y_val, y_pred_svc)
precision_svc = precision_score(y_val, y_pred_svc, average='weighted')
recall_svc = recall_score(y_val, y_pred_svc, average='weighted')
f1_score_svc = f1_score(y_val, y_pred_svc, average='weighted')
print('## Support Vector classifier ##')
print('Accuracy:', accuracy_svc)
print('Precision:', precision_svc)
print('Recall:', recall_svc)
print('F-1 Score:', f1_score_svc)
print('Confusion Matrix:')
print(confusion_matrix(y_val,y_pred_svc))
print('Classification Report:')
print(classification_report(y_val,y_pred_svc))

## Support Vector classifier ##
Accuracy: 0.8098227690147985
Precision: 0.8055554635157197
Recall: 0.8098227690147985
F-1 Score: 0.8053846585835107
Confusion Matrix:
[[ 9962    11  4840]
 [   29   438   854]
 [ 3850   170 31135]]
Classification Report:
              precision    recall  f1-score   support

      agreed       0.72      0.67      0.70     14813
   disagreed       0.71      0.33      0.45      1321
   unrelated       0.85      0.89      0.87     35155

    accuracy                           0.81     51289
   macro avg       0.76      0.63      0.67     51289
weighted avg       0.81      0.81      0.81     51289



In [19]:
### Stochastic Gradient Descent (SGD) Classifier

# Train the classifier
sgd_classifier = SGDClassifier(random_state=42, max_iter=1000)
sgd_classifier.fit(X_train, y_train)


In [20]:
# Evaluate the model
y_pred_sgd = sgd_classifier.predict(X_val)
accuracy_sgd = accuracy_score(y_val, y_pred_sgd)
precision_sgd = precision_score(y_val, y_pred_sgd, average='weighted')
recall_sgd = recall_score(y_val, y_pred_sgd, average='weighted')
f1_score_sgd = f1_score(y_val, y_pred_sgd, average='weighted')
print('## SGD classifier ##')
print('Accuracy:', accuracy_sgd)
print('Precision:', precision_sgd)
print('Recall:', recall_sgd)
print('F-1 Score:', f1_score_sgd)
print('Confusion Matrix:')
print(confusion_matrix(y_val,y_pred_sgd))
print('Classification Report:')
print(classification_report(y_val,y_pred_sgd))

## SGD classifier ##
Accuracy: 0.7571604047651543
Precision: 0.7715733532607736
Recall: 0.7571604047651543
F-1 Score: 0.713641545190657
Confusion Matrix:
[[ 4665     6 10142]
 [    7    77  1237]
 [ 1058     5 34092]]
Classification Report:
              precision    recall  f1-score   support

      agreed       0.81      0.31      0.45     14813
   disagreed       0.88      0.06      0.11      1321
   unrelated       0.75      0.97      0.85     35155

    accuracy                           0.76     51289
   macro avg       0.81      0.45      0.47     51289
weighted avg       0.77      0.76      0.71     51289



In [21]:
# Loads test data, preprocess it using the same steps as train data
test_data = pd.read_csv('test.csv')
test_data.drop(columns=['id', 'tid1', 'tid2'], inplace=True)
test_data['title1_en_clean'] = test_data['title1_en'].apply(clean_text)
test_data['title2_en_clean'] = test_data['title2_en'].apply(clean_text)

X1_test = tfidf_vectorizer.transform(test_data['title1_en_clean'])
X2_test = tfidf_vectorizer.transform(test_data['title2_en_clean'])
X_test = sp.hstack((X1_test, X2_test))

In [27]:
# Make predictions on the test data using each classifier
y_pred_test_lr = lr_classifier.predict(X_test)
y_pred_test_nb = nb_classifier.predict(X_test)
y_pred_test_rf = rf_classifier.predict(X_test)
y_pred_test_svc = svc_classifier.predict(X_test)
y_pred_test_sgd = sgd_classifier.predict(X_test)

In [28]:
# Create a submission file for each classifier by saving the predicted labels for test data in the same format as sample_submission file
submission = pd.read_csv('sample_submission.csv')

submission['label'] = y_pred_test_lr
submission.to_csv('submission_lf.csv', index=False)

submission['label'] = y_pred_test_nb
submission.to_csv('submission_nb.csv', index=False)

submission['label'] = y_pred_test_svc
submission.to_csv('submission_svc.csv', index=False)

submission['label'] = y_pred_test_sgd
submission.to_csv('submission_sgd.csv', index=False)

#Saving the Random forest classifier as the actual submission file as the accuracy is more than the other classifiers used
submission['label'] = y_pred_test_rf
submission.to_csv('submission.csv', index=False)