In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

#For SVC
from sklearn.svm import SVC

#Used to parse csv file
import csv

#Used for regex (removing dates, usernames, etc)
import re
#Used for removing stop words
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

#Used for stemming
from nltk.stem import *


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sgull\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:

def clean_data(dataset_location):
  #Regex patterns
  date_pattern = r'\b(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2}\s+\d{4}\b'
  number_space_pattern = re.compile(r'(\d)([^\d\s])')
  punctuation_space_pattern = re.compile(r'([^\w\s])(?!\s)')
  username_pattern = r'\(@\w+\)'

  #nltk
  port_stemmer = PorterStemmer()
  stop_words = stopwords.words('english')

  output = []
  with open(dataset_location, 'r', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file)
    #skip first line
    next(csv_reader)
    
    for row in csv_reader:
      line = row[0] + " " + row[1]
      
      #lowercase
      line = line.lower()
      
      #Remove the "photo by Getty Images" stuff
      index = line.rfind('photo by')
      if (index != -1):
        line = line[:index]
        
      #U.S gets turned into 'u s' which then gets filtered by the stop words
      line = line.replace("u.s.","usa")
      
      #Remove usernames (@Username)
      line = re.sub(username_pattern, '', line)
        
      #Remove double spaces
      line = line.replace("  ", " ")
        
      #Add spaces when numbers are right next to text
      line = re.sub(number_space_pattern, r'\1 \2', line)
      
      #Add spaces when puncation is right next to text
      line = re.sub(punctuation_space_pattern, r'\1 ', line)
          
      #Remove special characters and links
      line = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t!])|(\w+:\/\/\S+)|^rt|http.+?", "", line)
      
      #Remove dates
      line = re.sub(date_pattern, '', line)

      #Remove stop words
      line = " ".join([word for word in line.split(" ") if word not in (stop_words)])
      
      #Remove double spaces (again)
      line = line.replace("  ", " ")
      
      
      line_words = line.split(" ")
      stem_output = ""
      for word in line_words:
        word = port_stemmer.stem(word)
        
        stem_output += word + " "
        
      #Remove space at the very end
      line = stem_output[:-1]  
      
      
      #Remove double spaces (last time, for good measure)
      line = line.replace("  ", " ")
        
      output.append(line)
      
  csv_file.close()
  return output

In [3]:
fake_data = clean_data("Fake.csv")
true_data = clean_data("True.csv")

In [4]:
with open("fake_clean.txt", 'w') as file:
    for item in fake_data:
        file.write(str(item) + '\n')

file.close()       
        
with open("true_clean.txt", 'w') as file:
    for item in true_data:
        file.write(str(item) + '\n')
        
file.close() 

SVM

In [5]:
svm_fake_data = fake_data[:20000]
svm_true_data = true_data[:20000]

#Creating lables
labels = [1] * len(svm_fake_data) + [0] * len(svm_true_data)

#splitting data into training and testing sets
all_articles = svm_fake_data + svm_true_data
training_articles, testing_articles, training_labels, testing_labels = train_test_split(all_articles, labels, random_state=42, test_size = 0.2)

tfidf_vectorizer = TfidfVectorizer(input='content', stop_words='english', decode_error = "ignore")
tfidf_vector = tfidf_vectorizer.fit_transform(training_articles)


In [6]:
SVM = SVC(kernel='linear', C=10000)

SVM.fit(tfidf_vector, training_labels)

SVC(C=10000, kernel='linear')

In [7]:
tdidf_vector_TA = tfidf_vectorizer.transform(testing_articles)

predictions = SVM.predict(tdidf_vector_TA)


confused = confusion_matrix(testing_labels,predictions)
print(confused)
print("true positives: " + str(confused[1][1]))
print("false negatives: " + str(confused[1][0]))
print("true negatives: "  +str(confused[0][0]))
print("false positives: " + str(confused[0][1]))

class_report_lin = classification_report(testing_labels, predictions, digits = 4)
print()
print(class_report_lin)

[[3960   26]
 [  32 3982]]
true positives: 3982
false negatives: 32
true negatives: 3960
false positives: 26

              precision    recall  f1-score   support

           0     0.9920    0.9935    0.9927      3986
           1     0.9935    0.9920    0.9928      4014

    accuracy                         0.9928      8000
   macro avg     0.9927    0.9928    0.9927      8000
weighted avg     0.9928    0.9928    0.9928      8000



Tfidf analysis

In [9]:
#Creating lables
#A label of 1 is fake news, a label of 0 is real news

labels = [1] * len(fake_data) + [0] * len(true_data)

#splitting data into training and testing sets
all_articles = fake_data + true_data
training_articles, testing_articles, training_labels, testing_labels = train_test_split(all_articles, labels, random_state=42, test_size = 0.2)

tfidf_vectorizer = TfidfVectorizer(input='content', stop_words='english', decode_error = "ignore")
tfidf_vector = tfidf_vectorizer.fit_transform(training_articles)



Classify With RFC

In [10]:
one_word_model = RandomForestClassifier(n_estimators = 50, max_depth = 10)
one_word_model.fit(tfidf_vector, training_labels)
tfidf_vector = tfidf_vectorizer.transform(testing_articles)

predictions = one_word_model.predict(tfidf_vector)

confused = confusion_matrix(testing_labels, predictions)
print(confused)
print("true positives: " + str(confused[1][1]))
print("false negatives: " + str(confused[1][0]))
print("true negatives: "  +str(confused[0][0]))
print("false positives: " + str(confused[0][1]))

class_report_lin = classification_report(testing_labels, predictions, digits = 4)
print()
print(class_report_lin)

[[4025  222]
 [ 145 4588]]
true positives: 4588
false negatives: 145
true negatives: 4025
false positives: 222

              precision    recall  f1-score   support

           0     0.9652    0.9477    0.9564      4247
           1     0.9538    0.9694    0.9615      4733

    accuracy                         0.9591      8980
   macro avg     0.9595    0.9585    0.9590      8980
weighted avg     0.9592    0.9591    0.9591      8980



n-gram approach

In [11]:
count_vectorizer = CountVectorizer(input='content', ngram_range=(2, 2), stop_words='english', decode_error = "ignore")
count_vector = count_vectorizer.fit_transform(training_articles)


print out some features to check that correctly sized n-grams were produced

In [None]:
count_df= pd.DataFrame(count_vector[0].T.todense(), index=count_vectorizer.get_feature_names_out(), columns = ["Frequency"])
first_25 = count_df.sort_values(by=["Frequency"], ascending=[False]).head(25)

print(first_25)

classify with RFC

In [12]:
multi_word_model = RandomForestClassifier(n_estimators = 50, max_depth = 10)
multi_word_model.fit(count_vector, training_labels)
count_vector = count_vectorizer.transform(testing_articles)

predictions = multi_word_model.predict(count_vector)

confused = confusion_matrix(testing_labels, predictions)
print(confused)
print("true positives: " + str(confused[1][1]))
print("false negatives: " + str(confused[1][0]))
print("true negatives: "  +str(confused[0][0]))
print("false positives: " + str(confused[0][1]))

class_report_lin = classification_report(testing_labels, predictions, digits = 4)
print()
print(class_report_lin)

[[2145 2102]
 [  32 4701]]
true positives: 4701
false negatives: 32
true negatives: 2145
false positives: 2102

              precision    recall  f1-score   support

           0     0.9853    0.5051    0.6678      4247
           1     0.6910    0.9932    0.8150      4733

    accuracy                         0.7624      8980
   macro avg     0.8382    0.7492    0.7414      8980
weighted avg     0.8302    0.7624    0.7454      8980

