# Week 5 Part 2 Assignment
### DATA 620

David Moste ~ Euclid zhang ~ Samuel Reeves
***
### Problem Description

Can we make a model with Python's Natural Language Toolkit that accurately categorizes real and fake news?

In [1]:
import requests
import tarfile
import io

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold

from bs4 import BeautifulSoup

### Data Description & Preprocessing

Data source: https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset

This set contains just under 50k news items, about 55:45 fake:real.  We have downloaded the set and put in the github repository with lzma encryption so that it can be accessed simply from any machine.

The important steps to transform this human-readable news into clean data are the following:

1. Downloading and decompressing the csv data
2. tokenizing the text bounded by whitespace
3. removing numbers and special characters (except the period used for abbreviations)
4. removing hyperlinks and artifacts of html
5. removing stop words
6. changing contractions to their long form (eg. he's --> he is)
7. The string "Images." at the end of the text, used to show that there are attached images
8. stemming and lemmatizing

In [12]:
fake_url = "https://github.com/ezaccountz/Data_620/raw/main/week5p2/Fake.zip"
real_url = "https://github.com/ezaccountz/Data_620/raw/main/week5p2/True.zip"
fake = requests.get(fake_url).content
real = requests.get(real_url).content

In [None]:
fake_news = pd.read_csv(r"Fake.csv", header=0, index_col=False)
real_news = pd.read_csv(r"True.csv", header=0, index_col=False)

Add the column variable indicating the news is fake or not

In [None]:
fake_news['fake'] = True
real_news['fake'] = False

In [None]:
fake_news.head()

Number of fake news and number of real news:

In [None]:
len(fake_news), len(real_news)

There are too many news items! we will select a small subset for our analysis.

For example, a set of news with category 'politices' and with Trump mentioned in the news

In [None]:
fake_news2 = fake_news.loc[fake_news['subject'] == 'politics']
real_news2 = real_news.loc[fake_news['subject'] == 'politics']

In [None]:
fake_news2 = fake_news2.loc[[bool(re.search('[t|T]rump',news)) for news in fake_news2['text']]]
real_news2 = real_news2.loc[[bool(re.search('[t|T]rump',news)) for news in real_news2['text']]]

In [None]:
len(fake_news2), len(real_news2)

Combine the fake news and real news into one data frame

In [None]:
news = fake_news2[['text','fake']].append(real_news2[['text','fake']], ignore_index = True)

#complete list of all news
#news = fake_news[['text','real_news']].append(real_news[['text','real_news']], ignore_index = True)

Let's check the first news and see what we should do to clean up the text

In [None]:
fake_news['text'][0]

Create stemmer and lemmatizer. Generate a list of stop words

In [None]:
porter = nltk.PorterStemmer()
wnl = nltk.WordNetLemmatizer()
nltk.download('stopwords')
stop_words = stopwords.words('english')
#add extra stop words that is not included in the stop words list
#'' is used to remove empty word
stop_words = stop_words + ['could', 'should','would','']
#keep the word 'not' in the text since negation may have meanings here
stop_words.remove('not')

Set up a function to perform text cleaning as described above

In [None]:
def clean_text(text):
    
    #convert to lower case
    text = str(text).lower()   
    
    #remove hyperlinks
    text = re.sub(r'[^\s]+\.com.[^\s]+','',text)
    text = re.sub(r'http[^\s]+','',text)
    
    #clean the html markups
    text = BeautifulSoup(text).get_text()
    
    #A lot of the news have the word 'images' at the end to represent that 
    #there are images attached to the news. We will remove such words
    text = re.sub(r'Images\.$', '', text)
    
    #remove special characters except '.', since it can be used in abbreviations (F.B.I. for example)
    text = re.sub(r'[^A-Za-z\s\.]+', ' ', text)  
    #remove '.' that is not used in abbreviations
    text = re.sub(r'([A-Za-z]{2,})\.', r'\1 ', text)
    
    
    #replace multiple spaces by 1 space
    text = re.sub(r'\s{2,}', ' ', text)
       
    #in the text, the ' character is omiited. Therefore, string such as he's is stored as he s
    #the following codes covert such string to its complete form. For example, he'll is coverted into he will
    text = re.sub(r'(he|she|it|this|that) (s )', r'\1 is', text)
    text = re.sub(r'(they|we) (re)', r'\1 are', text)
    text = re.sub(' ve ', ' have ', text)
    text = re.sub(' ll ', ' will ', text)
    text = re.sub('won t ', 'will not ', text)
    text = re.sub('n t ', ' not ', text)
    
    #split the text into words and filter out stop words
    text = [word for word in text.split(' ') if word not in stop_words]
    #text stemming
    text = [porter.stem(word) for word in text]
    #text lemmatizing
    text = [wnl.lemmatize(word) for word in text]
   
    #convert the words back to one string
    text = " ".join(text)
    #remove spaces from start and end of string
    text = text.strip()
    
    return text

Clean up all texts

In [None]:
nltk.download('wordnet')
news['cleaned_text'] = news['text'].apply(clean_text)

Check if there are any documents with empty content after text clean up

In [None]:
news.loc[[len(news) == 0 for news in news['cleaned_text']]]['text']

Documents with only a hyper link as content will become an empty string after clean up

Remove documents with empty content after text clean up

In [None]:
news = news.loc[[len(news) != 0 for news in news['cleaned_text']]]
news.reset_index(drop=True, inplace = True)

In [None]:
news

In [None]:
#news.to_csv(r"E:\SPS\DATA 620\assignments\data\fake and real news\news.csv", index = False)

In [None]:
#news = pd.read_csv(r"E:\SPS\DATA 620\assignments\data\fake and real news\news.csv", header=0, index_col=False)

### Analysis

Use TfidfVectorizer to calculate the Term Frequency — Inverse Document Frequency of unigram tokens

In [None]:
Tf_Idf_Vectorizer = TfidfVectorizer(ngram_range=(1,1))

In [None]:
X = Tf_Idf_Vectorizer.fit_transform(news['cleaned_text']).toarray()
Y = news['fake']

In [None]:
df_tfidfvect = pd.DataFrame(data = X,columns = Tf_Idf_Vectorizer.get_feature_names())

In [None]:
df_tfidfvect

Separate the TF-IDF data frame into one with fake news and one with real news

In [None]:
df_tfidfvect_real = df_tfidfvect.loc[news['fake'] == False]

In [None]:
df_tfidfvect_fake = df_tfidfvect.loc[news['fake']]

The most important key words mentioned in real news (excluding Trump since we selected our subset by the key word 'Trump') are:

In [None]:
most_important_real = df_tfidfvect_real.sum().sort_values(ascending = False).T.drop('trump')
most_important_real[:20]

In [None]:
import wordcloud
from wordcloud import WordCloud
Cloud = WordCloud(width=600, height=400, background_color="white", max_words=50).generate_from_frequencies(most_important_real)
plt.figure(figsize = (15,10))
plt.imshow(Cloud, interpolation ='bilinear')
plt.axis('off')
plt.show()

The most important key words mentioned in fake news (excluding Trump since we selected our subset by the key word 'Trump') are:

In [None]:
most_important_fake = df_tfidfvect_fake.sum().sort_values(ascending = False).T.drop('trump')
most_important_fake[:20]

In [None]:
import wordcloud
from wordcloud import WordCloud
Cloud = WordCloud(width=600, height=400, background_color="black", max_words=50).generate_from_frequencies(most_important_fake)
plt.figure(figsize = (15,10))
plt.imshow(Cloud, interpolation ='bilinear')
plt.axis('off')
plt.show()

It's interesting to see that a lot of key words about real news are about republicans and a lot of key words about fake news are democrats. The politics news are really politics.

Now let's build our model using Multinomial Naive Bayes

In [None]:
mnb = MultinomialNB()

We will perform cross validation and generate a list of performance scores

In [None]:
model_scores = pd.DataFrame(columns = ['accurary','precision','recall','F1',
                                       'True Positive','False Positive','True Negative','False Negative'])
kf = KFold(n_splits=5,random_state=620,shuffle=True)

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]
    mnb.fit(X_train, Y_train)
    y_pred = mnb.predict(X_test)
    accuracy = metrics.accuracy_score(Y_test, y_pred)
    precision = metrics.precision_score(Y_test, y_pred)
    recall = metrics.recall_score(Y_test, y_pred)
    F1 = metrics.f1_score(Y_test, y_pred)
    cm = metrics.confusion_matrix(Y_test, y_pred)
    cm = cm/cm.astype(np.float).sum(axis=0)
    model_scores.loc[len(model_scores)] = [accuracy,precision,recall,F1,cm[1, 1],cm[0, 1],cm[0, 0],cm[1, 0]]
    
model_scores.loc['Average'] = model_scores.mean()

model_scores

The scores are indicating that our model is performing well. The accurary is around 90 percent. We have correctly identified 87% of the fake news and 96% of the real news