In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#Used to parse csv file
import csv

#Used for regex (removing dates, usernames, etc)
import re
#Used for removing stop words
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

#Used for stemming
from nltk.stem import *


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aswhe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:

def clean_data(dataset_location):
  #Regex patterns
  date_pattern = r'\b(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2}\s+\d{4}\b'
  number_space_pattern = re.compile(r'(\d)([^\d\s])')
  punctuation_space_pattern = re.compile(r'([^\w\s])(?!\s)')
  username_pattern = r'\(@\w+\)'

  #nltk
  port_stemmer = PorterStemmer()
  stop_words = stopwords.words('english')

  output = []
  with open(dataset_location, 'r', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file)
    #skip first line
    next(csv_reader)
    
    for row in csv_reader:
      line = row[0] + " " + row[1]
      
      #lowercase
      line = line.lower()
      
      #Remove the "photo by Getty Images" stuff
      index = line.rfind('photo by')
      if (index != -1):
        line = line[:index]
        
      #U.S gets turned into 'u s' which then gets filtered by the stop words
      line = line.replace("u.s.","usa")
      
          
          
      #Remove usernames (@Username)
      line = re.sub(username_pattern, '', line)
        
      #Remove double spaces
      line = line.replace("  ", " ")
        
      #Add spaces when numbers are right next to text
      line = re.sub(number_space_pattern, r'\1 \2', line)
      
      #Add spaces when puncation is right next to text
      line = re.sub(punctuation_space_pattern, r'\1 ', line)
          
      #Remove special characters and links
      line = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t!'])|(\w+:\/\/\S+)|^rt|http.+?", "", line)
      
      #Remove dates
      line = re.sub(date_pattern, '', line)

      #Remove stop words
      line = " ".join([word for word in line.split(" ") if word not in (stop_words)])
      
      #Remove double spaces (again)
      line = line.replace("  ", " ")
      
      #Stemming
      line_words = line.split(" ")
      stem_output = ""
      for word in line_words:
        word = port_stemmer.stem(word)
        
        stem_output += word + " "
        
      #Remove space at the very end
      line = stem_output[:-1]  
      
      #Remove double spaces (last time, for good measure)
      line = line.replace("  ", " ")
        
      output.append(line)
      
  csv_file.close()
  return output

In [19]:
fake_data = clean_data("Fake.csv")
true_data = clean_data("True.csv")

In [20]:

with open("fake_clean.txt", 'w') as file:
    for item in fake_data:
        file.write(str(item) + '\n')

file.close()       
        
with open("true_clean.txt", 'w') as file:
    for item in true_data:
        file.write(str(item) + '\n')
        
file.close() 

Do tfidf analysis

In [21]:
#Things that need to be defined
training_articles = []
testing_articles = []
training_labels = []
testing_labels = []

tfidf_vectorizer = TfidfVectorizer(input='content', stop_words='english', decode_error = "ignore")
tfidf_vector = tfidf_vectorizer.fit_transform(training_articles)




ValueError: empty vocabulary; perhaps the documents only contain stop words

Classify With RFC

In [None]:
model = RandomForestClassifier(n_estimators = 10, max_depth = 10)
model.fit(tfidf_vector, training_labels)
tfidf_vector = tfidf_vectorizer.transform(testing_articles)

predictions = model.predict(tfidf_vector)

confused = confusion_matrix(testing_labels, predictions)
print(confused)
print("true positives: " + str(confused[1][1]))
print("false negatives: " + str(confused[1][0]))
print("true negatives: "  +str(confused[0][0]))
print("false positives: " + str(confused[0][1]))