In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import html.parser as htmlParser
import re
from datetime import datetime
from langdetect import detect
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

In [2]:
np.random.seed(1)

Open data file for Charlottesville event and add some useful columns to the dataframe

In [3]:
# open file
cVille_df = pd.read_json('./Datasets/charlottesville.json')

# add ids
cVille_df['id'] = [url.rsplit('/', 1)[-1] for url in cVille_df['permalink']]

# get number of mentions instead of actual values
cVille_df['mention_count'] = [(len(mentions.split(' ')) if mentions != "" else 0) for mentions in cVille_df['mentions']]

# add hashtag count column as well
cVille_df['hashtag_count'] = [(len(hashtags.split(' ')) if hashtags != "" else 0)  for hashtags in cVille_df['hashtags'] ]

# drop username and geo
cVille_df = cVille_df.drop(['geo', 'username', 'permalink', 'mentions'], axis=1)

# add lang column
cVille_df['lang'] = [detect(text) for text in cVille_df['text']]

# drop non-english rows
cVille_df = cVille_df[cVille_df.lang == 'en']

Open data file for WomenMarch event and add some useful columns to the dataframe

In [4]:
# open file
womenMarch_df = pd.read_json('./Datasets/womenmarch.json')

# add ids
womenMarch_df['id'] = [url.rsplit('/', 1)[-1] for url in womenMarch_df['permalink']]

# get number of mentions instead of actual values
womenMarch_df['mention_count'] = [(len(mentions.split(' ')) if mentions != "" else 0) for mentions in womenMarch_df['mentions']]

# add hashtag count column as well
womenMarch_df['hashtag_count'] = [(len(hashtags.split(' ')) if hashtags != "" else 0)  for hashtags in womenMarch_df['hashtags'] ]

# drop username and geo
womenMarch_df = womenMarch_df.drop(['geo', 'username', 'permalink', 'mentions'], axis=1)

# add lang column
womenMarch_df['lang'] = [detect(text) for text in womenMarch_df['text']]

# drop non-english rows
womenMarch_df = womenMarch_df[womenMarch_df.lang == 'en']

Some more preprocessing of data.

In [5]:
def containsChar(text, pattern):
    for word in text.split(' '):
        if word == pattern: 
            return True
    return False

Preprocess data function

In [6]:
# This method preprocesses the data in given dataframe
# wasViolent = True indicates violent event and False indicates non-violent event 

def preprocessData(df, wasViolent):
    
    # convert html elements to readable format (also convert text to lowercase)
    filteredText = [htmlParser.unescape(text.lower()) for text in df['text']]

    # remove urls (because urls conatain spaces that make it difficult, this will stop at ellipsis or # or @ or end of line)
    # not 100% accurate but decent approach
    url_pattern = re.compile("http(.+?)(@\S+|#\S+|\\u2026|$)")
    filteredText = [re.sub(url_pattern, "", text) for text in filteredText]

    # remove usernames and mentions
    username_pattern = re.compile("@\S+")
    filteredText = [re.sub(username_pattern, "", text) for text in filteredText]

    # remove hashtags
    hashtag_pattern = re.compile("#\S+")
    filteredText = [re.sub(hashtag_pattern, "", text) for text in filteredText]

    # emoticon handling (replace positive_emojis with pos_emoji and negative_emojis with neg_emoji keyword)
    pos_emojis = [':-)', ':)', '(:', '(-:', ':-D', ':D', 'X-D', 'XD', 'xD', '<3', ':*', ';-)', ';)', ';-D', ';D', '(;', '(-;']
    filteredText = [' '.join([('pos_emoji' if word in pos_emojis else word) for word in text.split(' ')]) for text in filteredText]

    neg_emojis = [":-(", ":(", "):", ")-:", ":,(", ":'(", ':"(', ":(("]
    filteredText = [' '.join([('neg_emoji' if word in neg_emojis else word) for word in text.split(' ')]) for text in filteredText]

    # handle punctuations (add punc_excl=1 if ! exists else 0, same with punc_ques and ?)
    df['punc_excl'] = [(1 if containsChar(text, '!') else 0) for text in filteredText]
    df['punc_ques'] = [(1 if containsChar(text, '?') else 0) for text in filteredText]
    
    # remove pictures
    pic_pattern = re.compile("pic.twitter.com(.+)(/s|$)")
    filteredText = [re.sub(pic_pattern, "", text) for text in filteredText]

    # remove question marks and dollar signs
    removableChars = ['?', '$']
    filteredText = [' '.join([('' if word in removableChars else word) for word in text.split(' ')]) for text in filteredText]

    # remove random words (if still there is : or emoji then ignore them)
    random_pattern = re.compile("(&|:|!|,|;|)", re.I)
    filteredText = [re.sub(random_pattern, "", text) for text in filteredText]

    # handle repeating characters (replace characters repeating more than twice as two characters)
    repeated_pattern = re.compile(r"(.)\1{1,}")
    filteredText = [re.sub(repeated_pattern, r'\1\1', text) for text in filteredText]

    # add label
    df['violent'] = [1 if wasViolent else 0] * df.shape[0]

    # add filteredText column to dataframe
    df['filteredText'] = filteredText
    
    return df[['id', 'date', 'filteredText', 'hashtags', 'favorites', 'retweets', 'mention_count', 'hashtag_count', 'punc_excl', 'punc_ques', 'violent']]

Check number of valid samples in English

In [7]:
print(cVille_df.shape)
print(womenMarch_df.shape)

(153943, 9)
(109958, 9)


Preprocess both Charlottesville and WomenMarch data

In [10]:
cVille_preprocessed_df = preprocessData(cVille_df, wasViolent=True)
womenMarch_preprocessed_df = preprocessData(womenMarch_df, wasViolent=False)

In [11]:
cVille_preprocessed_df.head()

Unnamed: 0,id,date,filteredText,hashtags,favorites,retweets,mention_count,hashtag_count,punc_excl,punc_ques,violent
0,884925177917845504,2017-07-11,de blasio’s trip was approved for ‘city purpos...,#commies #antifa,0,2,0,2,0,0,1
1,884924281951576064,2017-07-11,alt-right trump-supporting neo-nazi confirmed.,,0,0,0,0,0,0,1
2,884923880938328066,2017-07-11,this. kills me. i remember seeing around duri...,#RobertGrodt #Occupy #BijiRojava #antifa,7,4,0,4,0,0,1
3,884923588960280576,2017-07-11,is resorting to violence and unrest to silenc...,#antifa,0,0,0,1,0,0,1
4,884922175748059136,2017-07-11,neo-nazi july 12th internet-wide day of act...,#Politics #Collusion #NetNeutrality,0,1,1,3,0,0,1


In [12]:
womenMarch_preprocessed_df.head()

Unnamed: 0,id,date,filteredText,hashtags,favorites,retweets,mention_count,hashtag_count,punc_excl,punc_ques,violent
0,921443913398972416,2017-10-20,can we make our message any clearer?,#MeToo #EndRapeCulture #NotOkay #WhyIMarch,0,1,0,4,0,0,0
1,921204515738546176,2017-10-19,anyone who doesn't support affordable is not ...,#birthcontrol #prolife #ProofIsInThePudding #W...,1,1,0,5,0,0,0
2,921100053720256514,2017-10-19,each and every day for,#WhyIMarch #RacialJustice #M4RJ #BlackLivesMatter,0,2,0,4,0,0,0
3,920843500781953024,2017-10-18,because deserve a livable future.,#whyimarch #youth,0,1,1,2,0,0,0
4,920765197605855232,2017-10-18,started a conversation with my daughter . i ...,#MeToo #WhyIMarch #ShePersisted,2,3,1,3,0,0,0


In [13]:
# write preprocessed data back to to the disk
cVille_preprocessed_df.to_csv('./Datasets/charlottesville_preprocessed.csv', sep='\t', index=False)
womenMarch_preprocessed_df.to_csv('./Datasets/womenMarch_preprocessed.csv', sep='\t', index=False)

Read Charlottesville and Women March preprocessed data from the disk

In [14]:
cVille_preprocessed_df = pd.read_csv('./Datasets/charlottesville_preprocessed.csv', sep='\t')
womenMarch_preprocessed_df = pd.read_csv('./Datasets/womenMarch_preprocessed.csv', sep='\t')

Handle NaN values. Replace them with empty string in filteredText column

In [15]:
cVille_preprocessed_df.info()
cVille_preprocessed_df['filteredText'].fillna("", inplace=True)
cVille_preprocessed_df['hashtags'].fillna("", inplace=True)
cVille_preprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153943 entries, 0 to 153942
Data columns (total 11 columns):
id               153943 non-null int64
date             153943 non-null object
filteredText     153881 non-null object
hashtags         143906 non-null object
favorites        153943 non-null int64
retweets         153943 non-null int64
mention_count    153943 non-null int64
hashtag_count    153943 non-null int64
punc_excl        153943 non-null int64
punc_ques        153943 non-null int64
violent          153943 non-null int64
dtypes: int64(8), object(3)
memory usage: 12.9+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153943 entries, 0 to 153942
Data columns (total 11 columns):
id               153943 non-null int64
date             153943 non-null object
filteredText     153943 non-null object
hashtags         153943 non-null object
favorites        153943 non-null int64
retweets         153943 non-null int64
mention_count    153943 non-null int64
hashtag_count    15

In [16]:
womenMarch_preprocessed_df.info()
womenMarch_preprocessed_df['hashtags'].fillna("", inplace=True)
womenMarch_preprocessed_df['filteredText'].fillna("", inplace=True)
womenMarch_preprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109958 entries, 0 to 109957
Data columns (total 11 columns):
id               109958 non-null int64
date             109958 non-null object
filteredText     109818 non-null object
hashtags         109958 non-null object
favorites        109958 non-null int64
retweets         109958 non-null int64
mention_count    109958 non-null int64
hashtag_count    109958 non-null int64
punc_excl        109958 non-null int64
punc_ques        109958 non-null int64
violent          109958 non-null int64
dtypes: int64(8), object(3)
memory usage: 9.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109958 entries, 0 to 109957
Data columns (total 11 columns):
id               109958 non-null int64
date             109958 non-null object
filteredText     109958 non-null object
hashtags         109958 non-null object
favorites        109958 non-null int64
retweets         109958 non-null int64
mention_count    109958 non-null int64
hashtag_count    109

In [17]:
womenMarch_preprocessed_df.head()

Unnamed: 0,id,date,filteredText,hashtags,favorites,retweets,mention_count,hashtag_count,punc_excl,punc_ques,violent
0,921443913398972416,2017-10-20,can we make our message any clearer?,#MeToo #EndRapeCulture #NotOkay #WhyIMarch,0,1,0,4,0,0,0
1,921204515738546176,2017-10-19,anyone who doesn't support affordable is not ...,#birthcontrol #prolife #ProofIsInThePudding #W...,1,1,0,5,0,0,0
2,921100053720256514,2017-10-19,each and every day for,#WhyIMarch #RacialJustice #M4RJ #BlackLivesMatter,0,2,0,4,0,0,0
3,920843500781953024,2017-10-18,because deserve a livable future.,#whyimarch #youth,0,1,1,2,0,0,0
4,920765197605855232,2017-10-18,started a conversation with my daughter . i ...,#MeToo #WhyIMarch #ShePersisted,2,3,1,3,0,0,0


In [18]:
# below is part of preprocessing
def feature_reduced_text(text, stop_words, lemmatizer):
    # tokenization
    tokens = word_tokenize(text)

    # remove stopwords from text
    tokens = [token for token in tokens if token not in stop_words]

    # lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # pos tagging
    pos_tags = nltk.pos_tag(tokens)
    
    # return only relevant words (adjective, verb, adverb, noun with all of their variations)
    return ' '.join([tag[0] for tag in pos_tags if tag[1] in ['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNP', 'NNPS', 'RB', 'RBR', 'RBS', 'VB', 'VBG', 'VBD', 'VBN', 'VBP', 'VBZ']])

In [19]:
stops = set(stopwords.words('english'))
# We will use not for negation (may not be perfect but decent enough)
stops.remove('not')
word_lemmatizer = WordNetLemmatizer()

cVille_preprocessed_df['preprocessedText'] = [feature_reduced_text(text, stop_words=stops, lemmatizer=word_lemmatizer) for text in cVille_preprocessed_df['filteredText'].tolist()]
womenMarch_preprocessed_df['preprocessedText'] = [feature_reduced_text(text, stop_words=stops, lemmatizer=word_lemmatizer) for text in womenMarch_preprocessed_df['filteredText'].tolist()]

Start Data transformation (negation handling, sentiment/polarity score and feature extraction)

In [20]:
# Negation handling (replace (isn't, ain't, haven't etc with not. stop words don't matter anyways))

def getNegationHandledText(text):
    negation_pattern = re.compile(r"(\w*)n't ")
    cleanedText = re.sub(negation_pattern, 'not ', text)
    
    # phrase occuring after not (combine all sub phrases to one phrase)
    negationSplit = cleanedText.split(' not ')
    positivePhrase = negationSplit[0]
    negativePhrase = ' '.join(negationSplit[1:])
                       
    # return words with _NEG appended to them
    if negativePhrase:
        return ' '.join([positivePhrase, ' '.join([word + "_NEG" for word in negativePhrase.split(' ') if word != " "])])
    else:
        return text

In [21]:
# add negation handled text to Charlottesville df
cVille_preprocessed_df['negationText'] = [getNegationHandledText(text) for text in cVille_preprocessed_df['preprocessedText']]

# add negation handled text to Women March df
womenMarch_preprocessed_df['negationText'] = [getNegationHandledText(text) for text in womenMarch_preprocessed_df['preprocessedText']]

In [22]:
# Get sentiment score for hashtag sentences
def hashtag_sentiment_score(hashtags):
    # Convert hashtag to words/sentences
    def hashtagToSentence(hashtag):
        return " ".join([word for word in re.split('([A-Z][a-z]+)', hashtag) if word])
    
    scores = [vaderAnalyzer.polarity_scores(hashtagToSentence(hashtag))['compound'] for hashtag in hashtags]
    return scores

def sentiment_score(tweet, analyzer):
    hashtag_scores = hashtag_sentiment_score(tweet['hashtags'].split())
    text_sentiment_score = analyzer.polarity_scores(tweet['preprocessedText'])['compound']
    
    normalized_score = (np.sum(hashtag_scores) + text_sentiment_score) / (len(hashtag_scores) + 1)
    return normalized_score

Calculate sentiment score/polarity calculation. Here, we use two methods to calculate polarity scores.
1. Vader Sentiment Intensity Analyzer that takes care of negation and is built for messy social media data like Twitter
2. Our approach to negate the polarities of words existing after not (not itself is not considered in this case)

In [23]:
# Part 1 - Calculate Vader score for text
vaderAnalyzer = SentimentIntensityAnalyzer()

cVille_preprocessed_df['vaderTextScore'] = [vaderAnalyzer.polarity_scores(text)['compound'] for text in cVille_preprocessed_df['preprocessedText']]
womenMarch_preprocessed_df['vaderTextScore'] = [vaderAnalyzer.polarity_scores(text)['compound'] for text in womenMarch_preprocessed_df['preprocessedText']]

# Part 2 - Calculate _NEG handled score

# Calculate normalized sentiment score with Vader (both text and hashtags)
cVille_preprocessed_df['vaderScore'] = [sentiment_score(row, vaderAnalyzer) for index, row in cVille_preprocessed_df.iterrows()]
womenMarch_preprocessed_df['vaderScore'] = [sentiment_score(row, vaderAnalyzer) for index, row in womenMarch_preprocessed_df.iterrows()]

In [24]:
cVille_preprocessed_df.drop(['negationText'], axis=1, inplace=True)
womenMarch_preprocessed_df.drop(['negationText'], axis=1, inplace=True)

cVille_preprocessed_df.head()

Unnamed: 0,id,date,filteredText,hashtags,favorites,retweets,mention_count,hashtag_count,punc_excl,punc_ques,violent,preprocessedText,vaderTextScore,vaderScore
0,884925177917845504,2017-07-11,de blasio’s trip was approved for ‘city purpos...,#commies #antifa,0,2,0,2,0,0,1,’ trip approved ‘ city purpose ’ bashing trump...,0.4215,0.1405
1,884924281951576064,2017-07-11,alt-right trump-supporting neo-nazi confirmed.,,0,0,0,0,0,0,1,alt-right trump-supporting neo-nazi confirmed,0.0,0.0
2,884923880938328066,2017-07-11,this. kills me. i remember seeing around duri...,#RobertGrodt #Occupy #BijiRojava #antifa,7,4,0,4,0,0,1,kill remember seeing around rest power comrade,-0.6908,-0.13816
3,884923588960280576,2017-07-11,is resorting to violence and unrest to silenc...,#antifa,0,0,0,1,0,0,1,resorting violence unrest silence opposition f...,-0.8271,-0.41355
4,884922175748059136,2017-07-11,neo-nazi july 12th internet-wide day of act...,#Politics #Collusion #NetNeutrality,0,1,1,3,0,0,1,neo-nazi july internet-wide day action save,0.4939,0.123475


In [25]:
womenMarch_preprocessed_df.head()

Unnamed: 0,id,date,filteredText,hashtags,favorites,retweets,mention_count,hashtag_count,punc_excl,punc_ques,violent,preprocessedText,vaderTextScore,vaderScore
0,921443913398972416,2017-10-20,can we make our message any clearer?,#MeToo #EndRapeCulture #NotOkay #WhyIMarch,0,1,0,4,0,0,0,make message clearer,0.0,-0.17206
1,921204515738546176,2017-10-19,anyone who doesn't support affordable is not ...,#birthcontrol #prolife #ProofIsInThePudding #W...,1,1,0,5,0,0,0,anyone n't support affordable not,-0.3089,-0.051483
2,921100053720256514,2017-10-19,each and every day for,#WhyIMarch #RacialJustice #M4RJ #BlackLivesMatter,0,2,0,4,0,0,0,day,0.0,0.1105
3,920843500781953024,2017-10-18,because deserve a livable future.,#whyimarch #youth,0,1,1,2,0,0,0,deserve livable future,0.0,0.0
4,920765197605855232,2017-10-18,started a conversation with my daughter . i ...,#MeToo #WhyIMarch #ShePersisted,2,3,1,3,0,0,0,started conversation daughter pray never exper...,0.631,0.15775


In [26]:
def getDays(dateStrings, finalDateStr):
    dateFormat = '%Y-%m-%d'
    
    def dateFromStr(dStr):
        return datetime.strptime(dStr, dateFormat)
    
    finalDate = dateFromStr(finalDateStr)   
    return [(finalDate - dateFromStr(dateStr)).days for dateStr in dateStrings]

Remove entries which are during the events as they dont help in predicting (discuss with the team)

In [27]:
# get days
cVille_preprocessed_df['days_to_event'] = getDays(cVille_preprocessed_df['date'], '2017-08-11')
womenMarch_preprocessed_df['days_to_event'] = getDays(womenMarch_preprocessed_df['date'], '2017-01-21')

# get relevant data
cVille_preprocessed_relevant_df = cVille_preprocessed_df[(cVille_preprocessed_df.date != '2017-08-11') & (cVille_preprocessed_df.date != '2017-08-12')]
womenMarch_preprocessed_relevant_df = womenMarch_preprocessed_df[(womenMarch_preprocessed_df.date != '2017-01-21') & (womenMarch_preprocessed_df.date != '2017-01-22')]

In [28]:
womenMarch_preprocessed_relevant_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67078 entries, 0 to 109957
Data columns (total 15 columns):
id                  67078 non-null int64
date                67078 non-null object
filteredText        67078 non-null object
hashtags            67078 non-null object
favorites           67078 non-null int64
retweets            67078 non-null int64
mention_count       67078 non-null int64
hashtag_count       67078 non-null int64
punc_excl           67078 non-null int64
punc_ques           67078 non-null int64
violent             67078 non-null int64
preprocessedText    67078 non-null object
vaderTextScore      67078 non-null float64
vaderScore          67078 non-null float64
days_to_event       67078 non-null int64
dtypes: float64(2), int64(9), object(4)
memory usage: 8.2+ MB


In [None]:
# import collections
# collections.Counter(womenMarch_preprocessed_relevant_df['date'])

Feature Extraction

1. Tf-idf vectorizer
2. Subset relevant features in a data frame

In [29]:
cVille_features_df = cVille_preprocessed_relevant_df[['id', 'date', 'days_to_event', 'preprocessedText', 'favorites', 'retweets', 'mention_count', 'hashtag_count', 'punc_excl', 'punc_ques', 'vaderTextScore', 'vaderScore', 'violent']]
womenMarch_features_df = womenMarch_preprocessed_relevant_df[['id', 'date', 'days_to_event', 'preprocessedText', 'favorites', 'retweets', 'mention_count', 'hashtag_count', 'punc_excl', 'punc_ques', 'vaderTextScore', 'vaderScore', 'violent']]

cVille_features_df.info()
womenMarch_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31131 entries, 0 to 153942
Data columns (total 13 columns):
id                  31131 non-null int64
date                31131 non-null object
days_to_event       31131 non-null int64
preprocessedText    31131 non-null object
favorites           31131 non-null int64
retweets            31131 non-null int64
mention_count       31131 non-null int64
hashtag_count       31131 non-null int64
punc_excl           31131 non-null int64
punc_ques           31131 non-null int64
vaderTextScore      31131 non-null float64
vaderScore          31131 non-null float64
violent             31131 non-null int64
dtypes: float64(2), int64(9), object(2)
memory usage: 3.3+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 67078 entries, 0 to 109957
Data columns (total 13 columns):
id                  67078 non-null int64
date                67078 non-null object
days_to_event       67078 non-null int64
preprocessedText    67078 non-null object
favorites    

In [30]:
training_features_df = pd.concat([cVille_features_df, womenMarch_features_df])
training_features_df.head()

Unnamed: 0,id,date,days_to_event,preprocessedText,favorites,retweets,mention_count,hashtag_count,punc_excl,punc_ques,vaderTextScore,vaderScore,violent
0,884925177917845504,2017-07-11,31,’ trip approved ‘ city purpose ’ bashing trump...,0,2,0,2,0,0,0.4215,0.1405,1
1,884924281951576064,2017-07-11,31,alt-right trump-supporting neo-nazi confirmed,0,0,0,0,0,0,0.0,0.0,1
2,884923880938328066,2017-07-11,31,kill remember seeing around rest power comrade,7,4,0,4,0,0,-0.6908,-0.13816,1
3,884923588960280576,2017-07-11,31,resorting violence unrest silence opposition f...,0,0,0,1,0,0,-0.8271,-0.41355,1
4,884922175748059136,2017-07-11,31,neo-nazi july internet-wide day action save,0,1,1,3,0,0,0.4939,0.123475,1


In [31]:
training_features_df.shape

(98209, 13)

In [32]:
# use tf-idf vectorizer
tv = TfidfVectorizer(stop_words='english', min_df=0.0005)
tfidf_features = tv.fit_transform(training_features_df['preprocessedText'])

In [33]:
tfidf_features.shape

(98209, 1726)

In [None]:
# tv.vocabulary_

In [34]:
# create pandas dataframe from tfidf features
tfidf_features_df = pd.DataFrame(tfidf_features.toarray())

In [35]:
# reset indices so that they can be concatenated into one dataframe
temp_training_features_df = training_features_df.reset_index(drop=True)
temp_tfidf_features_df = tfidf_features_df.reset_index(drop=True)

final_training_features_df = pd.concat([temp_training_features_df, temp_tfidf_features_df], axis=1)

In [36]:
final_training_features_df.shape

(98209, 1739)

In [37]:
final_training_features_df.head()

Unnamed: 0,id,date,days_to_event,preprocessedText,favorites,retweets,mention_count,hashtag_count,punc_excl,punc_ques,...,1716,1717,1718,1719,1720,1721,1722,1723,1724,1725
0,884925177917845504,2017-07-11,31,’ trip approved ‘ city purpose ’ bashing trump...,0,2,0,2,0,0,...,0.0,0.0,0.0,0.0,0.0,0.442987,0.0,0.0,0.0,0.0
1,884924281951576064,2017-07-11,31,alt-right trump-supporting neo-nazi confirmed,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,884923880938328066,2017-07-11,31,kill remember seeing around rest power comrade,7,4,0,4,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,884923588960280576,2017-07-11,31,resorting violence unrest silence opposition f...,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,884922175748059136,2017-07-11,31,neo-nazi july internet-wide day action save,0,1,1,3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# drop id, date and preprocessedText from final training set
final_training_features_df.drop(['id', 'date', 'preprocessedText'], axis=1, inplace=True)

In [39]:
final_training_features_df.head()

Unnamed: 0,days_to_event,favorites,retweets,mention_count,hashtag_count,punc_excl,punc_ques,vaderTextScore,vaderScore,violent,...,1716,1717,1718,1719,1720,1721,1722,1723,1724,1725
0,31,0,2,0,2,0,0,0.4215,0.1405,1,...,0.0,0.0,0.0,0.0,0.0,0.442987,0.0,0.0,0.0,0.0
1,31,0,0,0,0,0,0,0.0,0.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,31,7,4,0,4,0,0,-0.6908,-0.13816,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,31,0,0,0,1,0,0,-0.8271,-0.41355,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,31,0,1,1,3,0,0,0.4939,0.123475,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
final_training_features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98209 entries, 0 to 98208
Columns: 1736 entries, days_to_event to 1725
dtypes: float64(1728), int64(8)
memory usage: 1.3 GB


Data Exploration and Visualization

Split Train data and Test data 

In [41]:
# shuffle dataset
final_training_features_df = final_training_features_df.sample(frac=1, random_state=np.random.seed(1))
final_training_features_df.head()

Unnamed: 0,days_to_event,favorites,retweets,mention_count,hashtag_count,punc_excl,punc_ques,vaderTextScore,vaderScore,violent,...,1716,1717,1718,1719,1720,1721,1722,1723,1724,1725,Unnamed: 22
64050,33,2,1,2,1,0,0,-0.3612,-0.1806,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
33892,-84,3,1,0,3,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
17231,60,1,1,0,0,0,0,-0.296,-0.296,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
34047,-81,2,2,2,2,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
33173,-92,3,2,1,4,1,1,0.0,0.0,0.0,0,...,0.0,0.0,0.0,0.442064,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
final_training_features_df.tail()

Unnamed: 0,days_to_event,favorites,retweets,mention_count,hashtag_count,punc_excl,punc_ques,vaderTextScore,vaderScore,violent,...,1716,1717,1718,1719,1720,1721,1722,1723,1724,1725,Unnamed: 22,Unnamed: 23,Unnamed: 24
73349,-15,2,0,2,0,2,0,0.296,0.098667,0,0.2960,0.098667,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50057,1,2,0,1,3,0,0,0.6369,0.254675,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
98047,30,9,3,0,1,0,0,0.6705,0.33525,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
5192,38,2,0,0,1,1,0,-0.3089,-0.15445,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
77708,-5,0,0,1,3,0,0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,


In [43]:
Y = final_training_features_df['violent'] 
X = final_training_features_df.loc[:, final_training_features_df.columns != 'violent']

In [44]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98209 entries, 64050 to 77708
Columns: 1735 entries, days_to_event to 1725
dtypes: float64(1728), int64(7)
memory usage: 1.3 GB


In [45]:
Y.head()

64050    0
33892    0
17231    1
34047    0
33173    0
Name: violent, dtype: int64

Train test split

In [46]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [47]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(78567, 1735)
(78567,)
(19642, 1735)
(19642,)


Testing with imbalanced data

Calculate baseline

In [48]:
print(np.mean(Y_train))

0.316646938282


seems like around 68% accuracy could be achieved by simply choosing majority class (not violent).

Logistic Regression

In [49]:
logistic_clf = LogisticRegression()
logistic_score = cross_val_score(logistic_clf, X_train, Y_train, scoring='accuracy', cv=5)
print(np.mean(logistic_score))

0.950106223507


Our model is better than baseline as it is 95% accurate without even balancing our data.

Naive Bayes' Model/Classifier

In [50]:
nb_clf = BernoulliNB()
nb_score = cross_val_score(nb_clf, X_train, Y_train, scoring='accuracy', cv=5)
print(np.mean(nb_score))

0.907734806164


Our model is better than baseline as it gives around 91% accuracy without even balancing our data.

SVM Classifier

In [None]:
# svm_clf = SVC()
# svm_score = cross_val_score(svm_clf, X_train, Y_train, scoring='accuracy', cv=5)
# print(np.mean(svm_score))

Neural Network Classifier

In [51]:
from sklearn.neural_network import MLPClassifier

nn_clf = MLPClassifier()
nn_score = cross_val_score(nn_clf, X_train, Y_train, scoring='accuracy', cv=5)
print(np.mean(nn_score))

0.959015917063


Our model is clearly better than baseline as it gives around 96% accuracy without balancing our data.

Final Accuracy Test

In [52]:
def evaluate(classifier, xTrain, yTrain, xTest, yTest):
    classifier.fit(xTrain, yTrain)
    yPred = classifier.predict(xTest)
    accuracy = accuracy_score(yTest, yPred)
    confusionMat = confusion_matrix(yTest, yPred)
    return (accuracy, confusionMat)