In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

#For SVC
from sklearn.svm import SVC

#Used to parse csv file
import csv

#Used for regex (removing dates, usernames, etc)
import re
#Used for removing stop words
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

#Used for stemming
from nltk.stem import *


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aswhe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:

def clean_data(dataset_location):
  #Regex patterns
  date_pattern = r'\b(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2}\s+\d{4}\b'
  number_space_pattern = re.compile(r'(\d)([^\d\s])')
  punctuation_space_pattern = re.compile(r'([^\w\s])(?!\s)')
  username_pattern = r'\(@\w+\)'

  #nltk
  port_stemmer = PorterStemmer()
  stop_words = stopwords.words('english')

  output = []
  with open(dataset_location, 'r', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file)
    #skip first line
    next(csv_reader)
    
    for row in csv_reader:
      line = row[0] + " " + row[1]
      
      #lowercase
      line = line.lower()
      
      #Remove the "photo by Getty Images" stuff
      index = line.rfind('photo by')
      if (index != -1):
        line = line[:index]
        
      #U.S gets turned into 'u s' which then gets filtered by the stop words
      line = line.replace("u.s.","usa")
      
      #Remove usernames (@Username)
      line = re.sub(username_pattern, '', line)
        
      #Remove double spaces
      line = line.replace("  ", " ")
        
      #Add spaces when numbers are right next to text
      line = re.sub(number_space_pattern, r'\1 \2', line)
      
      #Add spaces when puncation is right next to text
      line = re.sub(punctuation_space_pattern, r'\1 ', line)
          
      #Remove special characters and links
      line = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t!])|(\w+:\/\/\S+)|^rt|http.+?", "", line)
      
      #Remove dates
      line = re.sub(date_pattern, '', line)

      #Remove stop words
      line = " ".join([word for word in line.split(" ") if word not in (stop_words)])
      
      #Remove double spaces (again)
      line = line.replace("  ", " ")
      
      '''
      #Stemming
      line_words = line.split(" ")
      stem_output = ""
      for word in line_words:
        word = port_stemmer.stem(word)
        
        stem_output += word + " "
        
      #Remove space at the very end
      line = stem_output[:-1]  
      '''
      
      #Remove double spaces (last time, for good measure)
      line = line.replace("  ", " ")
        
      output.append(line)
      
  csv_file.close()
  return output

In [45]:
fake_data = clean_data("Fake.csv")
true_data = clean_data("True.csv")

In [46]:
with open("fake_clean.txt", 'w') as file:
    for item in fake_data:
        file.write(str(item) + '\n')

file.close()       
        
with open("true_clean.txt", 'w') as file:
    for item in true_data:
        file.write(str(item) + '\n')
        
file.close() 

SVM

In [82]:
svm_fake_data = fake_data[:20000]
svm_true_data = true_data[:20000]

#Creating lables
labels = [1] * len(svm_fake_data) + [0] * len(svm_true_data)

#splitting data into training and testing sets
all_articles = svm_fake_data + svm_true_data
training_articles, testing_articles, training_labels, testing_labels = train_test_split(all_articles, labels, random_state=42, test_size = 0.2)

tfidf_vectorizer = TfidfVectorizer(input='content', stop_words='english', decode_error = "ignore")
tfidf_vector = tfidf_vectorizer.fit_transform(training_articles)


In [83]:
SVM = SVC(kernel='linear', C=10000)

SVM.fit(tfidf_vector, training_labels)

In [84]:
tdidf_vector_TA = tfidf_vectorizer.transform(testing_articles)

predictions = SVM.predict(tdidf_vector_TA)


confused = confusion_matrix(testing_labels,predictions)
print(confused)
print("true positives: " + str(confused[1][1]))
print("false negatives: " + str(confused[1][0]))
print("true negatives: "  +str(confused[0][0]))
print("false positives: " + str(confused[0][1]))

[[3964   22]
 [  32 3982]]
true positives: 3982
false negatives: 32
true negatives: 3964
false positives: 22


Do tfidf analysis

In [47]:
#Creating lables
#A label of 1 is fake news, a label of 0 is real news

labels = [1] * len(fake_data) + [0] * len(true_data)

#splitting data into training and testing sets
all_articles = fake_data + true_data
training_articles, testing_articles, training_labels, testing_labels = train_test_split(all_articles, labels, random_state=42, test_size = 0.2)

tfidf_vectorizer = TfidfVectorizer(input='content', stop_words='english', decode_error = "ignore")
tfidf_vector = tfidf_vectorizer.fit_transform(training_articles)


In [57]:
tdidf_vector_TestArt = tfidf_vectorizer.fit_transform(testing_articles)

predictions = SVM.predict(tdidf_vector_TestArt)


confused = confusion_matrix(testing_labels,predictions)
print(confused)
print("true positives: " + str(confused[1][1]))
print("false negatives: " + str(confused[1][0]))
print("true negatives: "  +str(confused[0][0]))
print("false positives: " + str(confused[0][1]))

ValueError: X has 59960 features, but SVC is expecting 108487 features as input.

Classify With RFC

In [None]:
one_word_model = RandomForestClassifier(n_estimators = 50, max_depth = 10)
one_word_model.fit(tfidf_vector, training_labels)
tfidf_vector = tfidf_vectorizer.transform(testing_articles)

predictions = one_word_model.predict(tfidf_vector)

confused = confusion_matrix(testing_labels, predictions)
print(confused)
print("true positives: " + str(confused[1][1]))
print("false negatives: " + str(confused[1][0]))
print("true negatives: "  +str(confused[0][0]))
print("false positives: " + str(confused[0][1]))

[[4100  147]
 [  84 4649]]
true positives: 4649
false negatives: 84
true negatives: 4100
false positives: 147


In [None]:
count_vectorizer = CountVectorizer(input='content', ngram_range=(2, 2), stop_words='english', decode_error = "ignore")
count_vector = count_vectorizer.fit_transform(training_articles)


KeyboardInterrupt: 

In [None]:
count_df= pd.DataFrame(count_vector[0].T.todense(), index=count_vectorizer.get_feature_names_out(), columns = ["Frequency"])
first_25 = count_df.sort_values(by=["Frequency"], ascending=[False]).head(25)

print(first_25)

                         Frequency
said greek                       1
usa president                    1
decision recognize               1
tramples law                     1
usa jerusalem                    1
island cyprus                    1
recognize jerusalem              1
athens talks                     1
laws erdogan                     1
turkey erdogan                   1
reuters turkish                  1
avoiding talks                   1
wanted lasting                   1
donald trump                     1
law athens                       1
lasting solution                 1
jerusalem decision               1
israel trampling                 1
tsipras said                     1
prime minister                   1
erdogan says                     1
trampling international          1
trump unfortunate                1
says usa                         1
turkey wanted                    1


In [None]:
two_word_model = RandomForestClassifier(n_estimators = 50, max_depth = 10)
two_word_model.fit(count_vector, training_labels)
count_vector = count_vectorizer.transform(testing_articles)

predictions = two_word_model.predict(count_vector)

confused = confusion_matrix(testing_labels, predictions)
print(confused)
print("true positives: " + str(confused[1][1]))
print("false negatives: " + str(confused[1][0]))
print("true negatives: "  +str(confused[0][0]))
print("false positives: " + str(confused[0][1]))

[[1846 2401]
 [  23 4710]]
true positives: 4710
false negatives: 23
true negatives: 1846
false positives: 2401
