# The Long Awaited - Fake Tweet Predictor Completor
<br>
Our first step is to load up the necessary libraries. Of course to be able to work with the data and train our model accordingly.

In [35]:
import pandas as pd
import re
import csv
import json
from sklearn.metrics import r2_score
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.naive_bayes import MultinomialNB
from collections import Counter
import texthero as hero
import textfeatures as tf

Now it is time to load in our data set of Tweets. We will have to clean these Tweets in order to create our features. 

In [36]:
df = pd.read_csv('tweets_labeled.csv')
df.head()

Unnamed: 0,tweet_id,text,label
0,1161040537207463936,'RT @SenJeffMerkley: The Endangered Species Ac...,1
1,1176360756239118342,'RT @LindseyGrahamSC: Interesting concept -- i...,1
2,1099036648573145088,'RT @RealJamesWoods: #BuildTheWall #DeportThem...,0
3,1092915693203480577,'RT @PatriotJackiB: Why would the MEXICAN GOV’...,0
4,1149038450668187654,'RT @TheOnion: Sweden Announces Plan To Get 10...,0


For the cleaning of the data we use a function created by the HU Artifical Intelligence Research Lab. Which will get rid of any of the outlying symbols. The function creates a dictionary with content (text) and a label (number). The created dictionary is used for the features of the model.

In [37]:
for i in df.index:
    txt = df.loc[i]["text"]
    txt = re.sub(r"RT\ \@\w*\:\ ", '', txt) #replace RT-tags
    txt= re.sub(r'@[A-Z0-9a-z_:]+','',txt) #replace username-tags
    txt = re.sub('https?://[A-Za-z0-9./]+','',txt) #replace URLs
    df.at[i,"text"]=txt

df['clean_text'] = hero.clean(df['text'])
df.head()

Unnamed: 0,tweet_id,text,label,clean_text
0,1161040537207463936,'The Endangered Species Act saved the bald eag...,1,endangered species act saved bald eagle signed...
1,1176360756239118342,"'Interesting concept -- impeach first, find fa...",1,interesting concept impeach first find facts l...
2,1099036648573145088,'#BuildTheWall #DeportThemAll ',0,buildthewall deportthemall
3,1092915693203480577,'Why would the MEXICAN GOV’T fund this? Who ar...,0,would mexican gov' fund cahoots
4,1149038450668187654,'Sweden Announces Plan To Get 100% Of Energy F...,0,sweden announces plan get energy unguarded wal...


In [38]:
NUM_TOP_WORDS = 5
df.groupby('label')['clean_text'].apply(lambda x: hero.top_words(x)[:NUM_TOP_WORDS])

label         
0      ...        34452
       '          20429
       trump      14965
       ..          9792
       gun         8377
1      ..         34001
       trump      25019
       ...        19779
       ukraine     9178
       '           7607
Name: clean_text, dtype: int64

In the following matrix we can see that on average, the fake news tweets have less characters in the cleaned data than the non-fake news tweets. 

In [39]:
df['character_cnt'] = df['clean_text'].str.len()
df.groupby('label')['character_cnt'].mean()


label
0    59.393172
1    80.022073
Name: character_cnt, dtype: float64

In [40]:
df['character_cnt'] = df['text'].str.len()
df.groupby('label')['character_cnt'].mean()

label
0     77.269448
1    108.331054
Name: character_cnt, dtype: float64

In [41]:
tf.word_count(df,"clean_text","word_count")
df[["clean_text","word_count"]].head()
df.groupby('label')['word_count'].mean()

label
0     8.681654
1    11.226360
Name: word_count, dtype: float64

In [42]:
tf.word_count(df,"text","word_count")
df[["text","word_count"]].head()
df.groupby('label')['word_count'].mean()

label
0    13.000860
1    17.691547
Name: word_count, dtype: float64

In [43]:
tf.avg_word_length(df,"text","avg_word_length")
df.groupby('label')['avg_word_length'].mean()

label
0    5.323626
1    5.418755
Name: avg_word_length, dtype: float64

In [44]:
df_tweet = df[['text', 'clean_text', 'label', 'character_cnt', 'word_count', 'avg_word_length']]
df_tweet.corr()

Unnamed: 0,label,character_cnt,word_count,avg_word_length
label,1.0,0.458499,0.41366,0.023845
character_cnt,0.458499,1.0,0.919203,0.033409
word_count,0.41366,0.919203,1.0,-0.210278
avg_word_length,0.023845,0.033409,-0.210278,1.0


In [31]:
wordcount = {}
n_print = 10
word_counter = collections.Counter(wordcount)
for word, count in word_counter.most_common(n_print):
    print(word, ": ", count)
    
print(wordcount)

{}


We create two seperate dataframes. One is filtered on the condition of the tweet being fake news, the other for the condition of the tweet being real news. We use these two dataframes to figure out if there is a difference in the top common words.

In [52]:
df_fake = df[df['label'] == 1]
df_real = df[df['label'] == 0]
df_fake.head()


Unnamed: 0,tweet_id,text,label,clean_text,character_cnt,word_count,avg_word_length
0,1161040537207463936,'The Endangered Species Act saved the bald eag...,1,endangered species act saved bald eagle signed...,121,21,4.809524
1,1176360756239118342,"'Interesting concept -- impeach first, find fa...",1,interesting concept impeach first find facts l...,119,20,5.555556
5,1175456815674343424,'Warren: 'Congress is complicit' by failing to...,1,warren congress complicit failing start impeac...,91,12,6.666667
6,1180809117310623744,'A dozen current and former staff from State D...,1,dozen current former staff state department sa...,127,18,6.111111
7,1179840318935576578,'This is a bombshell that isn’t getting much n...,1,bombshell ' getting much notice top diplomat u...,128,24,4.375


In [67]:
real = Counter(" ".join(df_real["clean_text"]).split()).most_common(100)
df_real_common = pd.DataFrame(real, columns =['word', 'count'])
df_real_common.head()

Unnamed: 0,word,count
0,...,34378
1,',20426
2,trump,13882
3,gun,8376
4,illegal,8347


In [68]:
fake = Counter(" ".join(df_fake["clean_text"]).split()).most_common(100)
df_fake_common = pd.DataFrame(fake, columns =['word', 'count'])
df_fake_common.head()

Unnamed: 0,word,count
0,trump,22343
1,...,19514
2,ukraine,9001
3,',7419
4,president,5806


In [70]:
df_real_common['label'] = 1
df_real_common.set_index('label', append=True, inplace=True)

df_fake_common['label'] = 0
df_fake_common.set_index('label', append=True, inplace=True)

In [72]:
merged = df_real_common.append(df_fake_common)
merged_clean = merged.drop_duplicates().sort_index()
merged_clean.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,word,count
Unnamed: 0_level_1,label,label,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,0,trump,22343
0,1,1,...,34378
1,0,0,...,19514
1,1,1,',20426
2,0,0,ukraine,9001
