<a href="https://colab.research.google.com/github/VishruthRG/MetaLearning_CSCE704_Project/blob/main/CSCE704_FinalNoteBook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



---

# Learning to choose between classifiers for fake news detection

In [None]:
# Make the shared Google Drive visible to this Colab runtime; the dataset
# CSVs used below are read from under this mount point.
from google.colab import drive as colab_drive

colab_drive.mount('/content/drive')

Mounted at /content/drive


## Importing required libraries

In [None]:
# Importing libraries to measure performance
import sklearn
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score

# Used to vectorize news text
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

# Importing SVC model
from sklearn import svm
from sklearn.svm import SVC

# Importing RFC model
from sklearn.ensemble import RandomForestClassifier

import random
import csv

## Getting and storing data by combining data from 2 datasets

In [None]:
# DATASET 1: a single CSV that carries its own label column
real_fake_file_name="/content/drive/Shareddrives/CSCE_704_Project/fake_or_real_news.csv"

# DATASET 2: one CSV per class, so the label is implied by the file
fake_file_name="/content/drive/Shareddrives/CSCE_704_Project/Fake.csv"
real_file_name="/content/drive/Shareddrives/CSCE_704_Project/True.csv"

titles=[] # Holds news article titles
text=[] # Holds news article texts
labels=[] # Holds news article label (real/fake)


def _load_articles(path, title_col, text_col, label_col=None, fixed_label=None):
  """Append every article in the CSV at `path` to `titles`, `text` and `labels`.

  Args:
    path: location of the CSV file; its first row is treated as a header.
    title_col: index of the column holding the article title.
    text_col: index of the column holding the article body.
    label_col: index of the column holding the label, or None when the
      file has no label column.
    fixed_label: label recorded for every row when `label_col` is None.
  """
  # newline="" lets the csv module handle line breaks inside quoted fields
  # itself; the explicit encoding avoids depending on the platform default.
  with open(path, newline="", encoding="utf-8") as file:
    csvreader = csv.reader(file)
    next(csvreader, None)  # skip the header row (tolerates an empty file)
    for row in csvreader:
      titles.append(row[title_col])
      text.append(row[text_col])
      labels.append(fixed_label if label_col is None else row[label_col])


_load_articles(real_fake_file_name, title_col=1, text_col=2, label_col=3)
_load_articles(fake_file_name, title_col=0, text_col=1, fixed_label="FAKE")
_load_articles(real_file_name, title_col=0, text_col=1, fixed_label="REAL")

# Combined data set: dataset[0] = titles, dataset[1] = texts, dataset[2] = labels
dataset=[titles, text, labels]

## Vectorizing the data and making a train test split

In [None]:
# Number of rows from the 60% split that are used to train the MLP; whatever
# is left of that split is held back for the final/MLP evaluation
# (5740 rows with the datasets used for the recorded outputs).
N_MLP_TRAIN = 25000

# 40% train, 60% test split. The split is done on the raw texts so that the
# vectorizer below is never fitted on rows that are later used for testing.
master_text_train, master_text_test, master_labels_train, master_labels_test = train_test_split(
    dataset[1], dataset[2], test_size=0.6, random_state=30)

# Learn the TF-IDF vocabulary and IDF weights from the training texts only,
# then apply that fitted vectorizer unchanged to the held-out texts.
all_text_vectorizer = TfidfVectorizer(stop_words='english')
master_data_train = all_text_vectorizer.fit_transform(master_text_train)
master_data_test = all_text_vectorizer.transform(master_text_test)

# Further splitting the 60% into MLP training rows + final/MLP testing rows
nn_data_train = master_data_test[:N_MLP_TRAIN] # MLP training data points
nn_data_test = master_data_test[N_MLP_TRAIN:] # MLP testing data points

final_labels_train_tf = master_labels_test[:N_MLP_TRAIN] # labels used for MLP training
final_labels_test_tf = master_labels_test[N_MLP_TRAIN:] # remaining labels kept separate for final/MLP testing

## Fitting a Support Vector Classifier (SVC)

In [None]:
# Support vector classifier; the linear kernel was chosen because it gave
# the best results.
model_svc = SVC(kernel="linear")
# Train on the 40% split (the fitted estimator is the cell's displayed value)
model_svc.fit(X=master_data_train, y=master_labels_train)

SVC(kernel='linear')

## Testing the SVC

In [None]:
# SVC predictions for every row of the 60% split
pred_svc = model_svc.predict(master_data_test)
# Rows after the MLP-training portion are the ones kept aside for the final
# evaluation. The offset is taken from the label split itself so it cannot
# drift out of sync with the cell that creates final_labels_train_tf.
pred_svc_final = pred_svc[len(final_labels_train_tf):]

## SVC classification report

In [None]:
# Precision / recall / F1 of the SVC on the final held-out rows
svc_report = classification_report(final_labels_test_tf, pred_svc_final)
print("SVC classification report:", svc_report, sep="\n")

SVC classification report:
              precision    recall  f1-score   support

        FAKE       0.95      0.97      0.96      2939
        REAL       0.97      0.95      0.96      2801

    accuracy                           0.96      5740
   macro avg       0.96      0.96      0.96      5740
weighted avg       0.96      0.96      0.96      5740



## Fitting a Random Forest Classifier (RFC)

In [None]:
# Random forest with the hyperparameters found to work best on this dataset.
# random_state pins the bootstrap sampling and feature selection so that the
# forest, and everything derived from its predictions, is the same on every
# Restart & Run All.
model_rfc = RandomForestClassifier(n_estimators=200, max_depth=500, random_state=30)
model_rfc.fit(master_data_train, master_labels_train)

RandomForestClassifier(max_depth=500, n_estimators=200)

## Testing the RFC

In [None]:
# RFC predictions for every row of the 60% split
pred_rfc = model_rfc.predict(master_data_test)
# Rows after the MLP-training portion are the ones kept aside for the final
# evaluation. The offset is taken from the label split itself so it cannot
# drift out of sync with the cell that creates final_labels_train_tf.
pred_rfc_final = pred_rfc[len(final_labels_train_tf):]

## RFC classification report

In [None]:
# Precision / recall / F1 of the RFC on the final held-out rows
rfc_report = classification_report(final_labels_test_tf, pred_rfc_final)
print("RFC classification report:", rfc_report, sep="\n")

RFC classification report:
              precision    recall  f1-score   support

        FAKE       0.94      0.96      0.95      2939
        REAL       0.96      0.93      0.95      2801

    accuracy                           0.95      5740
   macro avg       0.95      0.95      0.95      5740
weighted avg       0.95      0.95      0.95      5740



## Creating a training set for the Multilayer Perceptron Classifier (MLP)

In [None]:
# Seeded generator for the coin flips below, so the MLP training labels are
# identical on every run instead of depending on the global `random` state.
SELECTOR_LABEL_SEED = 30
tie_breaker = random.Random(SELECTOR_LABEL_SEED)

nn_labels_train = []

# RFC is fed in as 0 and SVC as 1.
# zip stops at the shortest input, so only the predictions that line up with
# final_labels_train_tf (the MLP-training rows) are consumed.
for rfc_pred, svc_pred, true_label in zip(pred_rfc, pred_svc, final_labels_train_tf):
  rfc_correct = rfc_pred == true_label
  svc_correct = svc_pred == true_label
  if rfc_correct and not svc_correct: # only RFC is correct -> teach the MLP to pick RFC
    nn_labels_train.append(0)
  elif svc_correct and not rfc_correct: # only SVC is correct -> teach the MLP to pick SVC
    nn_labels_train.append(1)
  else: # both correct or both wrong: neither choice is better, so pick one at random
    nn_labels_train.append(tie_breaker.randint(0, 1))

## Training the MLP

In [None]:
from sklearn.neural_network import MLPClassifier

# The MLP learns, from an article's TF-IDF vector, which classifier to trust
# (0 -> RFC, 1 -> SVC).
# NOTE(review): the recorded output of this cell shows L-BFGS stopping at the
# iteration limit, so the fitted network may not be fully converged.
model_nn1 = MLPClassifier(
    hidden_layer_sizes=(100, 25, 25),
    solver='lbfgs',
    alpha=1e-5,
    max_iter=1000,
    random_state=1,
)
model_nn1.fit(X=nn_data_train, y=nn_labels_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(alpha=1e-05, hidden_layer_sizes=(100, 25, 25), max_iter=1000,
              random_state=1, solver='lbfgs')

## Testing the MLP

In [None]:
nn_preds = model_nn1.predict(nn_data_test) # Holds RFC/SVC in binary format (0 -> RFC; 1-> SVC)

## Getting final real/fake predictions from the MLP

In [None]:
# Holds REAL/FAKE translated from the MLP's RFC/SVC choice.
# pred_rfc_final and pred_svc_final already contain both classifiers'
# predictions for exactly the rows in nn_data_test, so the chosen prediction
# is looked up instead of calling predict() once per row. This also yields a
# flat list of labels rather than a list of one-element arrays.
final_preds = [
  rfc_pred if each_nn_prediction == 0 else svc_pred # 0 -> RFC, otherwise SVC
  for each_nn_prediction, rfc_pred, svc_pred in zip(nn_preds, pred_rfc_final, pred_svc_final)
]

## MLP classification report

In [None]:
# Precision / recall / F1 of the MLP-selected predictions on the final held-out rows
mlp_report = classification_report(final_labels_test_tf,final_preds)
print("MLP classification report:", mlp_report, sep="\n")

MLP classification report:
              precision    recall  f1-score   support

        FAKE       0.95      0.97      0.96      2939
        REAL       0.97      0.94      0.96      2801

    accuracy                           0.96      5740
   macro avg       0.96      0.96      0.96      5740
weighted avg       0.96      0.96      0.96      5740



## Evaluate final performance

In [None]:
# How many held-out articles the MLP routed to each classifier
nn_predicts_rfc = 0
nn_predicts_svc = 0

# Outcome of both classifiers on the articles the MLP routed to SVC
svc_correct_when_svc=0
svc_wrong_when_svc=0
rfc_correct_when_svc=0
rfc_wrong_when_svc=0

# Outcome of both classifiers on the articles the MLP routed to RFC
svc_correct_when_rfc=0
svc_wrong_when_rfc=0
rfc_correct_when_rfc=0
rfc_wrong_when_rfc=0

# Articles on which RFC and SVC disagree, and whether the MLP's pick was right
disagree = 0
correct_picks_by_nn = 0
wrong_picks_by_nn = 0

# Single pass over the final held-out rows. Zipping the aligned sequences
# removes the hard-coded row count and the index bookkeeping.
for each_nn_pred, rfc_pred, svc_pred, true_label in zip(
    nn_preds, pred_rfc_final, pred_svc_final, final_labels_test_tf):
  rfc_ok = bool(rfc_pred == true_label)
  svc_ok = bool(svc_pred == true_label)

  if each_nn_pred == 0: # MLP predicted RFC
    nn_predicts_rfc += 1
    rfc_correct_when_rfc += rfc_ok
    rfc_wrong_when_rfc += (not rfc_ok)
    svc_correct_when_rfc += svc_ok
    svc_wrong_when_rfc += (not svc_ok)
  else: # MLP predicted SVC
    nn_predicts_svc += 1
    svc_correct_when_svc += svc_ok
    svc_wrong_when_svc += (not svc_ok)
    rfc_correct_when_svc += rfc_ok
    rfc_wrong_when_svc += (not rfc_ok)

  # The MLP's choice only changes the final answer when the classifiers disagree
  if rfc_pred != svc_pred:
    disagree += 1
    if (each_nn_pred == 0 and rfc_ok) or (each_nn_pred == 1 and svc_ok):
      correct_picks_by_nn += 1
    else:
      wrong_picks_by_nn += 1

# Printing final results of neural network

print("Neural network picked SVC:", nn_predicts_svc, "times")
print("In these",nn_predicts_svc,"times that the neural netowork chose to use SVC:")
print("SVC was correct", svc_correct_when_svc, "times")
print("SVC was wrong", svc_wrong_when_svc, "times")
print("RFC was correct", rfc_correct_when_svc, "times")
print("RFC was wrong", rfc_wrong_when_svc, "times")
print()
print("Neural network picked RFC:", nn_predicts_rfc, "times")
print("In these",nn_predicts_rfc,"times that the neural netowork chose to use RFC:")
print("RFC was correct", rfc_correct_when_rfc, "times")
print("RFC was wrong", rfc_wrong_when_rfc, "times")
print("SVC was correct", svc_correct_when_rfc, "times")
print("SVC was wrong", svc_wrong_when_rfc, "times")
print()

print("RFC and SVC disagreed on classification ", disagree, " times")
print("Neural Network picked the correct classifier ", correct_picks_by_nn, " times in case of disagreement")
print("Neural Network picked the wrong classifier ", wrong_picks_by_nn, " times in case of disagreement")

Neural network picked SVC: 2933 times
In these 2933 times that the neural netowork chose to use SVC:
SVC was correct 2810 times
SVC was wrong 123 times
RFC was correct 2749 times
RFC was wrong 184 times

Neural network picked RFC: 2807 times
In these 2807 times that the neural netowork chose to use RFC:
RFC was correct 2692 times
RFC was wrong 115 times
SVC was correct 2703 times
SVC was wrong 104 times

RFC and SVC disagreed on classification  226  times
Neural Network picked the correct classifier  138  times in case of disagreement
Neural Network picked the wrong classifier  88  times in case of disagreement
