# Data mining Assignment 4 - Troll Tweet prediction

In [1]:
# Importing libraries
import itertools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
%matplotlib inline

import string
import itertools
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS

# Evaluation metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from sklearn import metrics

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer 

# Machine Learning libraries
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier


from sklearn.manifold import TSNE
from sklearn.preprocessing import scale

import gensim
from gensim.models.word2vec import Word2Vec 
from tqdm import tqdm

In [2]:
import warnings as _warnings

with _warnings.catch_warnings():
    _warnings.simplefilter("ignore")
    # joblib imports may raise DeprecationWarning on certain Python
    # versions
    import joblib
    from joblib import logger
    from joblib import dump, load
    from joblib import __version__
    from joblib import effective_n_jobs
    from joblib import hash
    from joblib import cpu_count, Parallel, Memory, delayed
    from joblib import parallel_backend, register_parallel_backend


    __all__ = ["parallel_backend", "register_parallel_backend", "cpu_count",
               "Parallel", "Memory", "delayed", "effective_n_jobs", "hash",
               "logger", "dump", "load", "joblib", "__version__"]

## Loading the sample dataset of tweets

In [3]:
df = pd.read_csv('Assignment_4_data/IRAhandle_tweets_sample_data.csv', sep=',' , encoding='latin-1')

# Use latin-1 as encoding since it was throwing 'UnicodeDecodeError, invalid continuation byte'
df.head()

Unnamed: 0,content,account_category,troll
0,No wonder NFL players are kneeling to push the...,RightTroll,1
1,realDonaldTrump Don t worry the silent majo...,RightTroll,1
2,Roni K Patriot Happy to be here,RightTroll,1
3,Merkel si prepara a incontrare Trump anche ...,NonEnglish,0
4,Salute ecco la nuova lista delle cure gratui...,NonEnglish,0


In [4]:
df.content.unique()

array(['No wonder NFL players are kneeling to push the false narrative of  evil police   They re all criminals themselves  https   t co iu7lJN3ccm',
       '  realDonaldTrump Don t worry  the silent majority who elected you are behind you 100   Keep up the good fight  ',
       '  Roni K Patriot Happy to be here ', ...,
       'Police are now at a second location near a restaurant investigating a possible bomb    Elizabeth',
       'So Trump totally plans on losing this thing I guess if he s already sizing up the field for 2020    https   t co Mn4ZmtLinE',
       '  Disciple4Lif  Why I study only the Amplified version of God s Word  per Nehemiah 8 8 instructed even with their vocab larger than English '],
      dtype=object)

### Things we can observe from the content/tweets columns:

We can clearly see that there are some discrepancies in the content file like https, or random characters such as  '@' (for tags) and '#' for hashtags which are commonly used in tweets in the middle so we need to clean it.

We can convert the text to lowercase so that it is standardized and easy to work with (It can converted either to upper or lower case)

We can also remove Stopwords - this is in accordance with standard nlp practices so that we don't waste time on redundant stop words and give it due importance.

In [5]:
df.shape

(12014, 3)

In [6]:
df.dtypes

content             object
account_category    object
troll                int64
dtype: object

### Let's first start cleaning up

In [7]:
df.isnull().sum().sort_values(ascending = False)

troll               0
account_category    0
content             0
dtype: int64

The sample data seems clean so now we will looking at cleaning 'content' column so its easy to use

In [8]:
# Cleaning up the tweets column in our dataframe
def clean_text(df, content_field):
    df[content_field] = df[content_field].str.replace(r"http\S+", "")
    df[content_field] = df[content_field].str.replace(r"http", "")
    df[content_field] = df[content_field].str.replace(r"@\S+", "")
    df[content_field] = df[content_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
    df[content_field] = df[content_field].str.replace(r"@", "at")
    df[content_field] = df[content_field].str.lower()
    return df

df = clean_text(df, "content")

#Additional cleaning with stopwords
stopwords_list = stopwords.words('english') + list(string.punctuation)
stopwords_list += ["''", '""', '...', '``'] # '...' as seen from the unique 

def stopwords_removed(data):
    tokens = word_tokenize(data)
    stopwords_removed = [token for token in tokens if token not in stopwords_list]
    return stopwords_removed   

df['tokens'] = df['content'].apply(stopwords_removed)
df['text'] = df['tokens'].apply(' '.join)


In [9]:
#Further clean/check
df = df.reset_index()
df.drop(['content','index'], axis=1, inplace=True)
df.head()

Unnamed: 0,account_category,troll,tokens,text
0,RightTroll,1,"[wonder, nfl, players, kneeling, push, false, ...",wonder nfl players kneeling push false narrati...
1,RightTroll,1,"[realdonaldtrump, worry, silent, majority, ele...",realdonaldtrump worry silent majority elected ...
2,RightTroll,1,"[roni, k, patriot, happy]",roni k patriot happy
3,NonEnglish,0,"[merkel, si, prepara, incontrare, trump, anche...",merkel si prepara incontrare trump anche legge...
4,NonEnglish,0,"[salute, ecco, la, nuova, lista, delle, cure, ...",salute ecco la nuova lista delle cure gratuite...


In [10]:
y = df.troll
df=df.drop('troll',axis =1)

# countvectorizer

In [11]:
vec = CountVectorizer(max_features=1000)
transformed_df = vec.fit_transform(df["text"])
X= transformed_df

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Train a multinomial naïve Bayes classifier on countvectorizer

In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from sklearn.naive_bayes import MultinomialNB

folds = StratifiedKFold(n_splits=4)
i=0

X = transformed_df

for train_index, test_index in folds.split(X,y):
    i += 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    gnb = MultinomialNB()
    deci_pred = gnb.fit(X_train, y_train).predict(X_test)
    print("For fold {}".format(i))
    print("Accuracy score:", gnb.score(X_test,y_test))
    print("Kappa score:", cohen_kappa_score(deci_pred, y_test))
    print("")
    
# print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))
print("Average Accuracy score:", gnb.score(X_test,y_test))
print("Average Kappa score:", cohen_kappa_score(deci_pred, y_test,weights='quadratic'))

For fold 1
Accuracy score: 0.8062583222370173
Kappa score: 0.6078526017732144

For fold 2
Accuracy score: 0.7250332889480693
Kappa score: 0.4445355802895852

For fold 3
Accuracy score: 0.753994673768309
Kappa score: 0.5030756199590276

For fold 4
Accuracy score: 0.8554297135243171
Kappa score: 0.702792482511341

Average Accuracy score: 0.8554297135243171
Average Kappa score: 0.702792482511341


## Train a Random Forest Classifier on countvectorizer

In [14]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
 

folds = StratifiedKFold(n_splits=4)
i=0

X = transformed_df
for train_index, test_index in folds.split(X,y):
    i += 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    rfc =  RandomForestClassifier(n_estimators=100, verbose=True, min_samples_split = 5 , criterion = 'entropy' , n_jobs = 20)
    deci_pred = rfc.fit(X_train, y_train).predict(X_test)
    print("For fold {}".format(i))
    print("Accuracy score:", gnb.score(X_test,y_test))
    print("Kappa score:", cohen_kappa_score(deci_pred, y_test))
    print("")
    
# print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))
print("Average Accuracy score:", rfc.score(X_test,y_test))
print("Average Kappa score:", cohen_kappa_score(deci_pred, y_test,weights='quadratic'))

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.


For fold 1
Accuracy score: 0.848202396804261
Kappa score: 0.5752923806541095



[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.1s finished


For fold 2
Accuracy score: 0.7886151797603196
Kappa score: 0.4633864221221956



[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.1s finished


For fold 3
Accuracy score: 0.8278961384820239
Kappa score: 0.4998184385002489



[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s


For fold 4
Accuracy score: 0.8554297135243171
Kappa score: 0.6824313763510327

Average Accuracy score: 0.8444370419720186
Average Kappa score: 0.6824313763510327


[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.1s finished


The accuracy score obtained from both classifiers is high around 0.84 and is comparable to each other. 

#  Tfidvectorizer

In [15]:
vec = TfidfVectorizer(max_features=1000,max_df = 10)
transformed_df = vec.fit_transform(df["text"])
X = transformed_df

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Train a multinomial naïve Bayes classifier 

In [17]:
gnb = MultinomialNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))
print("Average Accuracy score:", gnb.score(X_test,y_test))

Number of mislabeled points out of a total 3605 points : 1197
Average Accuracy score: 0.6679611650485436


## Train a multinomial naïve Bayes classifier with 4-fold cross-validation

In [18]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from sklearn.naive_bayes import MultinomialNB

folds = StratifiedKFold(n_splits=4)
i=0

X = transformed_df

for train_index, test_index in folds.split(X,y):
    i += 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    gnb = MultinomialNB()
    deci_pred = gnb.fit(X_train, y_train).predict(X_test)
    print("For fold {}".format(i))
    print("Accuracy score:", gnb.score(X_test,y_test))
    print("Kappa score:", cohen_kappa_score(deci_pred, y_test))
    print("")
    
# print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))
print("Average Accuracy score:", gnb.score(X_test,y_test))
print("Average Kappa score:", cohen_kappa_score(deci_pred, y_test,weights='quadratic'))

For fold 1
Accuracy score: 0.6617842876165113
Kappa score: 0.27867786220861746

For fold 2
Accuracy score: 0.631491344873502
Kappa score: 0.21296860799757655

For fold 3
Accuracy score: 0.644474034620506
Kappa score: 0.24730997126506893

For fold 4
Accuracy score: 0.6562291805463024
Kappa score: 0.26860175503622685

Average Accuracy score: 0.6562291805463024
Average Kappa score: 0.26860175503622685


## Train/test using Random Forest Classifier 

In [19]:
X= transformed_df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [20]:
# Using Random Forest Classifier

rfc =  RandomForestClassifier(n_estimators=100, verbose=True, min_samples_split = 5 , criterion = 'entropy' , n_jobs = 20)
#Fitting a Random Forest Classifier
y_pred = rfc.fit(X_train, y_train).predict(X_test)
print("Average Accuracy score:", rfc.score(X_test,y_test))

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s


Average Accuracy score: 0.6638002773925104


[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.2s finished


## Train/test using Random Forest Classifier with 4-fold cross-validation

In [21]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
 

folds = StratifiedKFold(n_splits=4)
i=0

X = transformed_df

for train_index, test_index in folds.split(X,y):
    i += 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    rfc =  RandomForestClassifier(n_estimators=100, verbose=True, min_samples_split = 5 , criterion = 'entropy' , n_jobs = 20)
    deci_pred = rfc.fit(X_train, y_train).predict(X_test)
    print("For fold {}".format(i))
    print("Accuracy score:", rfc.score(X_test,y_test))
    print("Kappa score:", cohen_kappa_score(deci_pred, y_test))
    print("")
    
# print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))
print("Average Accuracy score:", rfc.score(X_test,y_test))
print("Average Kappa score:", cohen_kappa_score(deci_pred, y_test,weights='quadratic'))

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s


For fold 1


[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s


Accuracy score: 0.6701065246338216
Kappa score: 0.30264888872231177



[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.5s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.2s finished


For fold 2


[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.2s finished


Accuracy score: 0.6291611185086551
Kappa score: 0.21889302393325505



[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s


For fold 3


[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s


Accuracy score: 0.6451398135818908
Kappa score: 0.2577386305688193



[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s


For fold 4


[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s


Accuracy score: 0.6625582944703531
Kappa score: 0.29037820814678983

Average Accuracy score: 0.6625582944703531
Average Kappa score: 0.29037820814678983


[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.2s finished


It is surprising to find that after using TFid vectorizer the accuracy score has reduced considerably for both the classifiers roughly around 0.65.

# Loading the main dataset of tweets :

In [22]:
m_df = pd.read_csv('Assignment_4_data/IRAhandle_master_data.csv', sep=',' , encoding='latin-1', low_memory=False)

# Use latin-1 as encoding since it was throwing 'UnicodeDecodeError, invalid continuation byte'
m_df

Unnamed: 0.1,Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,...,account_type,retweet,account_category,new_june_2018,alt_external_id,tweet_id,article_url,tco1_step1,tco2_step1,tco3_step1
0,1,9.06e+17,10_GOP,We have a sitting Democrat US Senator on tria...,Unknown,English,10/1/2017 19:58,10/1/2017 19:59,1052,9636,...,Right,0,RightTroll,0,905874659358453760,9.145804e+17,http://twitter.com/905874659358453760/statuses...,https://twitter.com/10_gop/status/914580356430...,,
1,2,9.06e+17,10_GOP,Marshawn Lynch arrives to game in anti Trump s...,Unknown,English,10/1/2017 22:43,10/1/2017 22:43,1054,9637,...,Right,0,RightTroll,0,905874659358453760,9.146218e+17,http://twitter.com/905874659358453760/statuses...,https://twitter.com/damienwoody/status/9145685...,,
2,3,9.06e+17,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,10/1/2017 22:50,10/1/2017 22:51,1054,9637,...,Right,1,RightTroll,0,905874659358453760,9.146235e+17,http://twitter.com/905874659358453760/statuses...,https://twitter.com/10_gop/status/913231923715...,,
3,4,9.06e+17,10_GOP,JUST IN President Trump dedicates Presidents ...,Unknown,English,10/1/2017 23:52,10/1/2017 23:52,1062,9642,...,Right,0,RightTroll,0,905874659358453760,9.146391e+17,http://twitter.com/905874659358453760/statuses...,https://twitter.com/10_gop/status/914639143690...,,
4,5,9.06e+17,10_GOP,19 000 RESPECTING our National Anthem StandF...,Unknown,English,10/1/2017 2:13,10/1/2017 2:13,1050,9645,...,Right,1,RightTroll,0,905874659358453760,9.143122e+17,http://twitter.com/905874659358453760/statuses...,https://twitter.com/realDonaldTrump/status/914...,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1201325,1201326,2535818742,HAPPKENDRAHAPPY,Thank you so much and I hope you have a great ...,United States,English,9/23/2016 17:05,9/23/2016 17:05,1311,1688,...,Right,1,RightTroll,0,2535818742,7.793661e+17,http://twitter.com/happkendrahappy/statuses/77...,https://twitter.com/Patriotancestry/status/779...,,
1201326,1201327,2535818742,HAPPKENDRAHAPPY,OutnumberedFNC ericmtyson I d start watchi...,United States,English,9/23/2016 17:05,9/23/2016 17:05,1311,1688,...,Right,1,RightTroll,0,2535818742,7.793661e+17,http://twitter.com/happkendrahappy/statuses/77...,,,
1201327,1201328,2535818742,HAPPKENDRAHAPPY,7 Ways to Discover Alien Planets https t c...,United States,English,9/23/2016 17:05,9/23/2016 17:05,1311,1688,...,Right,1,RightTroll,0,2535818742,7.793661e+17,http://twitter.com/happkendrahappy/statuses/77...,https://twitter.com/SPACEdotcom/status/7793612...,http://dlvr.it/MKNTzd,
1201328,1201329,2535818742,HAPPKENDRAHAPPY,Video shows woman shooting at burglars during ...,United States,English,9/23/2016 17:06,9/23/2016 17:06,1311,1688,...,Right,1,RightTroll,0,2535818742,7.793663e+17,http://twitter.com/happkendrahappy/statuses/77...,http://2wsb.tv/2cp6Kll,,


In [23]:
m_df.account_category.value_counts()

RightTroll      367871
NonEnglish      238452
LeftTroll       177323
NewsFeed        157809
HashtagGamer    132389
Commercial      120699
Fearmonger        4794
Unknown           1993
Name: account_category, dtype: int64

## Adding the troll column in the master dataset 

In [24]:
m_df['troll'] = np.where((m_df['account_category'] == 'RightTroll') | (m_df['account_category'] == 'LeftTroll'), 1, 0)
m_df.head()

Unnamed: 0.1,Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,...,retweet,account_category,new_june_2018,alt_external_id,tweet_id,article_url,tco1_step1,tco2_step1,tco3_step1,troll
0,1,9.06e+17,10_GOP,We have a sitting Democrat US Senator on tria...,Unknown,English,10/1/2017 19:58,10/1/2017 19:59,1052,9636,...,0,RightTroll,0,905874659358453760,9.145804e+17,http://twitter.com/905874659358453760/statuses...,https://twitter.com/10_gop/status/914580356430...,,,1
1,2,9.06e+17,10_GOP,Marshawn Lynch arrives to game in anti Trump s...,Unknown,English,10/1/2017 22:43,10/1/2017 22:43,1054,9637,...,0,RightTroll,0,905874659358453760,9.146218e+17,http://twitter.com/905874659358453760/statuses...,https://twitter.com/damienwoody/status/9145685...,,,1
2,3,9.06e+17,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,10/1/2017 22:50,10/1/2017 22:51,1054,9637,...,1,RightTroll,0,905874659358453760,9.146235e+17,http://twitter.com/905874659358453760/statuses...,https://twitter.com/10_gop/status/913231923715...,,,1
3,4,9.06e+17,10_GOP,JUST IN President Trump dedicates Presidents ...,Unknown,English,10/1/2017 23:52,10/1/2017 23:52,1062,9642,...,0,RightTroll,0,905874659358453760,9.146391e+17,http://twitter.com/905874659358453760/statuses...,https://twitter.com/10_gop/status/914639143690...,,,1
4,5,9.06e+17,10_GOP,19 000 RESPECTING our National Anthem StandF...,Unknown,English,10/1/2017 2:13,10/1/2017 2:13,1050,9645,...,1,RightTroll,0,905874659358453760,9.143122e+17,http://twitter.com/905874659358453760/statuses...,https://twitter.com/realDonaldTrump/status/914...,,,1


In [25]:
# To check if troll column has unique values
m_df.troll.value_counts()

0    656136
1    545194
Name: troll, dtype: int64

From the value_counts above we can see that there are lot of values for account categories so let's club these values HashtagGamer, Commercial, Fearmonger, Unknown and NonEnglish into 'Other' since they haven't been prominant in the exploratory phase.

In [26]:
#renaming account_categories
m_df['account_category'].replace({'HashtagGamer': 'Other','NonEnglish': 'Other', 'Unknown': 'Other', 
                            'Fearmonger': 'Other', 'Commercial': 'Other'}, inplace = True)
print(m_df.shape)
m_df.account_category.value_counts(normalize=True)

(1201330, 23)


Other         0.414813
RightTroll    0.306220
LeftTroll     0.147606
NewsFeed      0.131362
Name: account_category, dtype: float64

## Data pre-processing

In [27]:
m_df.describe(include="all")

Unnamed: 0.1,Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,...,retweet,account_category,new_june_2018,alt_external_id,tweet_id,article_url,tco1_step1,tco2_step1,tco3_step1,troll
count,1201330.0,1201330.0,1201330,1201330,1199500,1201330,1201330,1201330,1201330.0,1201330.0,...,1201330.0,1201330,1201330.0,1201330.0,1201330.0,1201330,903381,289681,8168,1201330.0
unique,,911.0,1061,1044711,25,56,462046,471589,,,...,,4,,1065.0,,1201330,717152,205488,7172,
top,,8.92e+17,EXQUOTE,Ð Ð Ð Ñ Ð Ð Ðµ Ð Ð Ñ Ð ÐÐ Ð Ð Ð Ð Ð Ð Ð â...,United States,English,8/15/2017 17:01,12/29/2016 4:01,,,...,,Other,,3272640600.0,,http://twitter.com/891941754282188801/statuses...,https://twibble.io,https://twibble.io,http://dlvr.it/PQdNjm,
freq,,64365.0,59174,201,845288,952915,121,299,,,...,,498327,,45886.0,,1,1747,5167,65,
mean,600665.5,,,,,,,,2569.479,4273.418,...,0.4553337,,0.2542099,,7.687101e+17,,,,,0.4538253
std,346794.2,,,,,,,,3740.071,7572.6,...,0.4980011,,0.4354163,,1.045932e+17,,,,,0.4978635
min,1.0,,,,,,,,-1.0,-1.0,...,0.0,,0.0,,1.666183e+17,,,,,0.0
25%,300333.2,,,,,,,,225.0,251.0,...,0.0,,0.0,,6.690479e+17,,,,,0.0
50%,600665.5,,,,,,,,1273.0,888.0,...,0.0,,0.0,,7.850686e+17,,,,,0.0
75%,900997.8,,,,,,,,3148.0,2721.0,...,1.0,,1.0,,8.570004e+17,,,,,1.0


I will remove the "harvested_date" since I don't need it for my analysis and keep 'publish_date' since one datetime column is enough.
We can also see the addition of 'Unnamed: 0' is done when file is imported ; it is just a replication of the index so I can remove that.

Using the describe function, we can also see which columns are unique or not ; or how many unique values compared to total dataset. 

There are roughly 15% tweets which are duplicates since there are only 1044711 unique tweets from 'content' column. However, these 'unique' tweets could be retweets which contain the same information or text. We can use this column to see the weight given to each word as done above on the sample dataset.

It also looks like there are some NaN values for "external_author_id", which is the author account ID from Twitter. Since we already have the Twitter Handle as the "author" column for each tweet we won't be using the "external_author_id" in this analysis and will remove this field from the dataset.



In [28]:
#dropping columns that won't be used
m_df = m_df.drop(['harvested_date', 'Unnamed: 0', 'external_author_id'], axis=1)

In [29]:
m_df.dtypes

author               object
content              object
region               object
language             object
publish_date         object
following             int64
followers             int64
updates               int64
post_type            object
account_type         object
retweet               int64
account_category     object
new_june_2018         int64
alt_external_id      object
tweet_id            float64
article_url          object
tco1_step1           object
tco2_step1           object
tco3_step1           object
troll                 int64
dtype: object

In [30]:
## Let's look at the languages of the tweets:

m_df.language.value_counts(normalize=True).head() 

English      0.793217
Russian      0.124268
German       0.042542
Italian      0.011054
Ukrainian    0.008257
Name: language, dtype: float64

80% of all tweets are in english. For ease of research I'll only keep the tweets that are english language.

In [31]:
#Update data to only contain 'English' tweets
m_df = m_df.loc[m_df.language == 'English']
print(m_df.shape)

(952915, 20)


In [32]:
m_df.drop(['language'], axis = 1, inplace = True)

In [33]:
#Let's see if there is any missing data in the dataset
m_df.isna().sum().sort_values(ascending = False)

tco3_step1          945022
tco2_step1          744342
post_type           557435
tco1_step1          271244
region                 511
troll                    0
content                  0
publish_date             0
following                0
followers                0
updates                  0
retweet                  0
account_type             0
account_category         0
new_june_2018            0
alt_external_id          0
tweet_id                 0
article_url              0
author                   0
dtype: int64

In [34]:
# 511 missing values are there for region so let's look at that in detail

#value count of region
m_df.region.value_counts(normalize=True)

United States           0.829662
Unknown                 0.166414
Azerbaijan              0.001414
United Kingdom          0.000835
Germany                 0.000445
Italy                   0.000348
Russian Federation      0.000246
Iraq                    0.000173
Afghanistan             0.000157
Belarus                 0.000106
Israel                  0.000073
United Arab Emirates    0.000058
Ukraine                 0.000028
Egypt                   0.000022
France                  0.000012
Malaysia                0.000004
Serbia                  0.000002
Hong Kong               0.000001
Name: region, dtype: float64

83% of region values are "United States" and 17% of region values are "Unknown". Since all the missing values here do speak English, I'll replace the Nan with Unknown

In [35]:
#rename region nan values to 'unknown' because it already has 'unknown' as a value. So it is better to label them \
# as unknown than to remove the nan values.

m_df['region'].fillna(value='Unknown', inplace = True)
m_df.region.unique()

array(['Unknown', 'United States', 'Italy', 'United Arab Emirates',
       'Israel', 'Azerbaijan', 'United Kingdom', 'Russian Federation',
       'Iraq', 'Germany', 'France', 'Ukraine', 'Serbia', 'Egypt',
       'Hong Kong', 'Belarus', 'Malaysia', 'Afghanistan'], dtype=object)

In [36]:
m_df.post_type.value_counts(normalize=True)

RETWEET        0.938922
QUOTE_TWEET    0.061078
Name: post_type, dtype: float64

In [37]:
m_df.post_type.unique()

array([nan, 'RETWEET', 'QUOTE_TWEET'], dtype=object)

In [38]:
#there is no relation between the retweet value and nan values so we can update the field using 

m_df['post_type'].fillna(value='NOT_RETWEET', inplace = True)
m_df.post_type.unique()

array(['NOT_RETWEET', 'RETWEET', 'QUOTE_TWEET'], dtype=object)

It appears that the nans are not retweets. Therefore, since this isn't actually missing data we can replace the NaN values under post_type with NOT_RETWEET

In [39]:
#confirm there is no more missing data
m_df.isna().sum().sort_values(ascending = False)

tco3_step1          945022
tco2_step1          744342
tco1_step1          271244
troll                    0
post_type                0
content                  0
region                   0
publish_date             0
following                0
followers                0
updates                  0
retweet                  0
account_type             0
account_category         0
new_june_2018            0
alt_external_id          0
tweet_id                 0
article_url              0
author                   0
dtype: int64

The columns 'tco1_step1' , 'tco_step1', 'tco3_step1' are nothing but URLs so we can remove them also from the dataset

In [40]:
## Feature selection using chi squared -- regularization

In [41]:
# We will sample the data randomly as the file is too large and was causing my system to crash
# This code will run even if the values are not sample but for simplicity I will sample it.

m_df = m_df.sample(n= 30000)
m_df.troll.unique()

array([0, 1])

In [42]:
start_date_tweet = m_df['publish_date'].min()
end_date_tweet = m_df['publish_date'].max()

print(start_date_tweet, end_date_tweet)

1/1/2016 0:30 9/9/2017 3:07


We have almost 4 years of tweets starting 1st January 2013 until the 9th of September 2017. Time also appears with these dates, so let's create a new column to hold only the date component of this.

In [43]:
# Make date as datetime publish_date by creating a string column
m_df['publish_date'] = pd.to_datetime(m_df['publish_date'])

m_df['publish_date'] = pd.to_datetime(m_df['publish_date'].dt.date)

In [44]:
# Cleaning up the tweets column in our dataframe
def clean_text(df, content_field):
    df[content_field] = df[content_field].str.replace(r"http\S+", "")
    df[content_field] = df[content_field].str.replace(r"http", "")
    df[content_field] = df[content_field].str.replace(r"@\S+", "")
    df[content_field] = df[content_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
    df[content_field] = df[content_field].str.replace(r"@", "at")
    df[content_field] = df[content_field].str.lower()
    return df

m_df2 = clean_text(m_df, "content")

#Additional cleaning with stopwords
stopwords_list = stopwords.words('english') + list(string.punctuation)
stopwords_list += ["''", '""', '...', '``'] # '...' as seen from the unique 

def stopwords_removed(data):
    tokens = word_tokenize(data)
    stopwords_removed = [token for token in tokens if token not in stopwords_list]
    return stopwords_removed   

m_df2['tokens'] = m_df2['content'].apply(stopwords_removed)
m_df2['text'] = m_df2['tokens'].apply(' '.join)

In [45]:
#Further clean/check
m_df2 = m_df2.reset_index()
m_df2.drop(['content'], axis=1, inplace=True)
m_df2.head()

Unnamed: 0,index,author,region,publish_date,following,followers,updates,post_type,account_type,retweet,...,new_june_2018,alt_external_id,tweet_id,article_url,tco1_step1,tco2_step1,tco3_step1,troll,tokens,text
0,757329,DAILYSANFRAN,United States,2017-06-01,6099,17514,42080,NOT_RETWEET,local,0,...,0,2495567768,8.704203e+17,http://twitter.com/2495567768/statuses/8704203...,https://twitter.com/DailySanFran/status/870420...,http://kron4.com/2017/06/01/minor-arrested-in-...,,0,"[minor, arrested, string, bart, auto, burglari...",minor arrested string bart auto burglaries co ...
1,874869,DICKYIRWIN,United States,2016-12-05,3265,3440,4232,RETWEET,Hashtager,1,...,0,2598367019,8.057893e+17,http://twitter.com/2598367019/statuses/8057892...,,,,0,"[time, sucking, assholes, tsa, alternativeacro...",time sucking assholes tsa alternativeacronymin...
2,842418,DEBORRTH,Unknown,2017-07-31,1983,470,264,NOT_RETWEET,Right,0,...,0,890488664215756800,8.920326e+17,http://twitter.com/890488664215756801/statuses...,,,,1,"[lost, azredhen, nice, air, drywall, guy, love]",lost azredhen nice air drywall guy love
3,296400,BEN_SAR_GENT,United States,2015-09-15,128,11,182,RETWEET,Right,1,...,0,2580772991,6.437198e+17,http://twitter.com/Ben_Sar_Gent/statuses/64371...,https://twitter.com/Libertea2012/status/643383...,,,1,"[hillaryclinton, hardly, put, jail, though, e,...",hillaryclinton hardly put jail though e mail s...
4,203247,ARCHIEOLIVERS,United States,2016-12-24,3357,2622,2027,RETWEET,Right,1,...,0,1686370159,8.125507e+17,http://twitter.com/1686370159/statuses/8125506...,http://ift.tt/2i1QXxl,,,1,"[news, post, ex, campaign, aides, building, pr...",news post ex campaign aides building pro trump...


# Countvectorizer

In [46]:
vec = CountVectorizer(max_features=4000)
transformed_df = vec.fit_transform(m_df2["text"])
transformed_df.toarray().shape

(30000, 4000)

In [47]:
y = m_df2[['troll']].to_numpy()
y = np.squeeze(y)
y.shape

(30000,)

##  Train a multinomial naïve Bayes classifier on Countvectorizer

In [48]:
from sklearn import metrics
gnb = MultinomialNB(alpha=.01)
gnb.fit(transformed_df, y)
vectors_test = vec.transform(m_df2["text"])
y_pred = gnb.predict(vectors_test)
# y_pred = gnb.predict(m_df2["text"])
# print("Average Accuracy score:", gnb.score(vectors_test,y))
f1_score = metrics.f1_score(y, y_pred, average='macro')
acc_score = metrics.accuracy_score(y, y_pred)
print(f1_score)


0.843231957285526


In [49]:
folds = StratifiedKFold(n_splits=4)
i=0

X = transformed_df.toarray()

for train_index, test_index in folds.split(X,y):
    i += 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    gnb = MultinomialNB()
#     print(type(X))
#     print(np.any(np.isnan(X_train)))
    gnb.fit(X_train, y_train)
    deci_pred = gnb.predict(X_test)
    print("For fold {}".format(i))
    print("Accuracy score:", gnb.score(X_test,y_test))
    print()
    print("")
    
print("Average Accuracy score:", gnb.score(X_test,y_test))
print ("Classification report = \n",classification_report(y_test, deci_pred))


For fold 1
Accuracy score: 0.819890681242501


For fold 2
Accuracy score: 0.8165333333333333


For fold 3
Accuracy score: 0.8174666666666667


For fold 4
Accuracy score: 0.820776103480464


Average Accuracy score: 0.820776103480464
Classification report = 
               precision    recall  f1-score   support

           0       0.79      0.80      0.79      3252
           1       0.84      0.84      0.84      4247

    accuracy                           0.82      7499
   macro avg       0.82      0.82      0.82      7499
weighted avg       0.82      0.82      0.82      7499



##  Train a Random Forest classifier on Countvectorizer

In [50]:
folds = StratifiedKFold(n_splits=4)
i=0

X = transformed_df.toarray()

for train_index, test_index in folds.split(X,y):
    i += 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    gnb = RandomForestClassifier()
#     print(type(X))
#     print(np.any(np.isnan(X_train)))
    gnb.fit(X_train, y_train)
    deci_pred = gnb.predict(X_test)
    print("For fold {}".format(i))
    print("Accuracy score:", gnb.score(X_test,y_test))
    print()
    print("")
    
print("Average Accuracy score:", gnb.score(X_test,y_test))
print ("Classification report  = \n",classification_report(y_test, deci_pred))





For fold 1
Accuracy score: 0.8118917477669644






For fold 2
Accuracy score: 0.8126666666666666






For fold 3
Accuracy score: 0.8090666666666667






For fold 4
Accuracy score: 0.8075743432457662


Average Accuracy score: 0.8075743432457662
Classification report  = 
               precision    recall  f1-score   support

           0       0.77      0.79      0.78      3252
           1       0.84      0.82      0.83      4247

    accuracy                           0.81      7499
   macro avg       0.80      0.81      0.80      7499
weighted avg       0.81      0.81      0.81      7499



The accuracy score of both the classifiers is again around 0.81. This is less accurate than the one for sample data but it is still a good value.

# Tfidvectorizer

In [51]:
vec = TfidfVectorizer(max_features=40000,max_df = 10)
transformed_df = vec.fit_transform(m_df2["text"])
transformed_df.toarray().shape

(30000, 40000)

In [52]:
X= transformed_df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Using Random Forest Classifier

rfc = MultinomialNB()

#Fitting a Random Forest Classifier
y_pred = rfc.fit(X_train, y_train).predict(X_test)
print("Average Accuracy score:", rfc.score(X_test,y_test))

Average Accuracy score: 0.6843333333333333


##  Train a multinomial naïve Bayes classifier on Tfidvectorizer

In [53]:
folds = StratifiedKFold(n_splits=4)
i=0

X = transformed_df.toarray()

for train_index, test_index in folds.split(X,y):
    i += 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    gnb = MultinomialNB()
#     print(type(X))
#     print(np.any(np.isnan(X_train)))
    gnb.fit(X_train, y_train)
    deci_pred = gnb.predict(X_test)
    print("For fold {}".format(i))
    print("Accuracy score:", gnb.score(X_test,y_test))
    print()
    print("")
    
print("Average Accuracy score:", gnb.score(X_test,y_test))
print ("Classification report ",classification_report(y_test, deci_pred))



For fold 1
Accuracy score: 0.6885748566857752


For fold 2
Accuracy score: 0.6866666666666666


For fold 3
Accuracy score: 0.6874666666666667


For fold 4
Accuracy score: 0.6803573809841312


Average Accuracy score: 0.6803573809841312
Classification report                precision    recall  f1-score   support

           0       0.74      0.40      0.52      3252
           1       0.66      0.89      0.76      4247

    accuracy                           0.68      7499
   macro avg       0.70      0.65      0.64      7499
weighted avg       0.70      0.68      0.66      7499



##  Train a Random Forest classifier on Tfidvectorizer

In [54]:
# Using Random Forest Classifier

from sklearn import metrics
rfc = RandomForestClassifier(n_estimators=10, verbose=True , n_jobs =3 )
rfc.fit(transformed_df, y)
vectors_test = vec.transform(m_df2["text"])
y_pred = gnb.predict(vectors_test)
# y_pred = gnb.predict(m_df2["text"])
# print("Average Accuracy score:", gnb.score(vectors_test,y))
f1_score = metrics.f1_score(y, y_pred, average='macro')
acc_score = metrics.accuracy_score(y, y_pred)
print(f1_score)
print(acc_score)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    4.2s finished


0.8436403241821431
0.8526666666666667


In [55]:
X= transformed_df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Using Random Forest Classifier

rfc = RandomForestClassifier(n_estimators=10, verbose=True , n_jobs =3)

#Fitting a Random Forest Classifier
y_pred = rfc.fit(X_train, y_train).predict(X_test)
print("Average Accuracy score:", rfc.score(X_test,y_test))

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:   10.4s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    1.2s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.


Average Accuracy score: 0.6633333333333333


[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    1.2s finished


Using the train test split the accuracy score of Multinomial Gaussian Naive Bayes is much better than Random Forest Classifier. Using cross-validation as expected we get better values for the accuracy score.

Reference :

https://www.ahmedbesbes.com/blog/sentiment-analysis-with-keras-and-word-2-vec

https://machinelearningmastery.com/clean-text-machine-learning-python/

