In [1]:
import numpy as np
import pandas as pd 
import re

In [2]:
train_tweets=pd.read_csv('../data/Dataset 1/clean_train_tweets.csv', encoding="utf-8")
train_tweets

Unnamed: 0,id,label,tweet,length,count
0,1,0,father dysfunctional selfish drags kids dysfun...,55,7
1,2,0,thanks lyft credit use cause offer wheelchair ...,77,11
2,3,0,bihday majesty,14,2
3,4,0,model love u take u time ur,27,7
4,5,0,factsguide society motivation,29,3
...,...,...,...,...,...
31925,31958,0,ate isz youuu,13,3
31926,31959,0,see nina turner airwaves trying wrap mantle ge...,93,14
31927,31960,0,listening sad songs monday morning otw work sad,47,8
31928,31961,1,sikh temple vandalised calgary wso condemns act,47,7


### Create CountVectorizer

In [3]:
#using scikit-learn to transform text into token count vector

from sklearn.feature_extraction.text import CountVectorizer

count_vector = CountVectorizer(
    token_pattern = r"[a-z]+", 
    ngram_range = (1,1), #ngram_range (1,1)= only unigrams, (1,2)=unigrams and bigrams, (2,2)=bigrams
    lowercase = True,
    min_df = 1, #min_df=1 is the default, means ignore terms that appear in less than 1 document/text.
    max_df = 1.0 #max_df=1.0 is the default, means ignore terms that appear in more than 100% of the documents/texts.
)

In [4]:
X_train = train_tweets['tweet'].values
X_train_vect = count_vector.fit_transform(X_train) #fitting CountVectorizer, transforms trainging data into 
                                                    #matrix representing token counts 
X_train_vect

<31930x37438 sparse matrix of type '<class 'numpy.int64'>'
	with 235914 stored elements in Compressed Sparse Row format>

In [5]:
X_train[30]

'never chance vote presidential candidate excited cycle looks different'

In [6]:
X_train_vect[30]

<1x37438 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [7]:
#the second values represent the number of times the token appears in the text

id2token = {v:k for k,v in count_vector.vocabulary_.items()}
counter = 0
for i, value in enumerate(X_train_vect[30].toarray()[0]):
    if value != 0:
        print(f"{counter}\t{value}\t{id2token[i]}")
        counter += 1

0	1	candidate
1	1	chance
2	1	cycle
3	1	different
4	1	excited
5	1	looks
6	1	never
7	1	presidential
8	1	vote


### Naive Bayes Model

In [8]:
#to test perfomarnce against the development set, we can split the training dataset into train and dev

from sklearn.model_selection import train_test_split

In [9]:
#15% of train_tweets will be in val
train, dev= train_test_split(train_tweets, test_size=0.15, random_state=42)

In [10]:
train

Unnamed: 0,id,label,tweet,length,count
5561,5569,0,excited saturday fake festival sister bihday s...,63,10
5500,5508,0,folks repoing violence france euro 2016 though...,84,13
25832,25861,0,remaster remaster everywhere,28,3
4480,4484,0,happy positive affirmation,26,3
18879,18903,0,inspired grateful aists painters creative peop...,53,7
...,...,...,...,...,...
29802,29834,0,ahhh hea breaks really seemed like keeper bach...,54,8
5390,5398,0,bread x one million loaves bread math carbs li...,58,10
860,861,1,black professor makes assumptions entire race ...,88,13
15795,15816,0,launch new restaurant tuesday buzzing,37,5


In [11]:
X_train = train['tweet'].values
X_train_vect = count_vector.fit_transform(X_train) #fitting CountVectorizer, transforms trainging data into 
                                                    #matrix representing token counts 
X_train_vect

<27140x34039 sparse matrix of type '<class 'numpy.int64'>'
	with 200746 stored elements in Compressed Sparse Row format>

In [12]:
from sklearn.naive_bayes import MultinomialNB


y_train = train['label'].values

clf = MultinomialNB() #clf=classifier 
clf.fit(X_train_vect, y_train)

MultinomialNB()

In [13]:
len(train)

27140

In [14]:
len(dev)

4790

In [15]:
dev

Unnamed: 0,id,label,tweet,length,count
29583,29614,0,tried wedding suit 1st time today much,38,7
2153,2155,0,sent father day gifts today fathersdaygiftidea...,70,8
18511,18535,0,bangkok got 7 shopping cantwaittoseegot 7 omg,45,7
1668,1669,0,lamp head see fragile saw li,28,6
10092,10103,0,3 weeks till interrailing,25,4
...,...,...,...,...,...
22226,22252,0,kwon soon young born day,24,5
18515,18539,1,realitycheck policing america cop audiblechann...,75,7
4749,4753,1,hard believe live world 700 pa hillary thread ...,64,11
31290,31322,0,father day love u papa,22,5


In [16]:
X_dev = dev['tweet'].values
X_dev_vect = count_vector.transform(X_dev) # Note that the vectorizer is already fit, so we only use the transform method.
y_dev = dev['label'].values

In [17]:
#we got 95% accuracy on the dev set
y_pred = clf.predict(X_dev_vect)
np.mean(y_dev==y_pred)

0.9553235908141963

In [18]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_dev, y_pred)  

#confusion matrix TP 4386 -- FP 41  #positives are non_offensive
                # FN 173  -- TN 190  #negatives are offensive
    
#we have more FN than FP

array([[4386,   41],
       [ 173,  190]])

In [19]:
#0=non_offensive, 1=offensive

from sklearn.metrics import classification_report

print(classification_report(y_dev, y_pred))

#the model recalls offensive tweets 52%

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      4427
           1       0.82      0.52      0.64       363

    accuracy                           0.96      4790
   macro avg       0.89      0.76      0.81      4790
weighted avg       0.95      0.96      0.95      4790



In [20]:
#the F1_score is 63%

from sklearn.metrics import f1_score

f1_score(y_dev, y_pred)

0.6397306397306397

In [21]:
#checking tweets that are incorrectly classified

idx_error = (y_dev != y_pred) & (y_dev == 1)
print(X_dev[idx_error][1])

pre crime germany places make wear badges


In [22]:
#looking up this sentence in our dataset 

dev['hash'] = dev['tweet'].map(hash) # hash speeds up lookup
dev[dev['hash'] == hash(X_dev[idx_error][1])]

Unnamed: 0,id,label,tweet,length,count,hash
29394,29424,1,pre crime germany places make wear badges,41,7,-4403222008047546842


In [23]:
#let's look at all tweets that are offensive but we classified them as non_offensive
idx_error = (y_dev != y_pred) & (y_dev == 1)
print(X_dev[idx_error])

['deeper look' 'pre crime germany places make wear badges'
 'lady said juniors area ask leave someone complained boss defended'
 'wanna mock country b c pt libtards b c' 'stop annoying'
 'clear good reason hand keys goon yet' 'malia shake ass fine pass blunt'
 'thank mr president obama light revealed darkness people usa stronger'
 'ebonics pa huge con job blacks liberals running america decades actually lack lan'
 'conormcgregor runs mouth endlessly like rondarousey lost yet ppl take glee losing perhaps'
 'neogaf today another attack japanese chinese claims racist'
 'btw gandhi india adores witho'
 'constantly let lack fat male role models especially emojis'
 'jonjo shelvey man ever racially abuse someone without saying anything kickitout shelvey'
 'council staff instructed anc go slow make da look bad exposed weeks ago'
 '2016 year' 'clickbait much really shitty headline change'
 'video men women bed without clothes candid videos sexting'
 'dearfellowwhitepeople 2017 let realize u 2 w

In [24]:
#let's look at all tweets that are non_offensive but we classified them as offensive
idx_error1 = (y_dev != y_pred) & (y_dev == 0)
print(X_dev[idx_error])

['deeper look' 'pre crime germany places make wear badges'
 'lady said juniors area ask leave someone complained boss defended'
 'wanna mock country b c pt libtards b c' 'stop annoying'
 'clear good reason hand keys goon yet' 'malia shake ass fine pass blunt'
 'thank mr president obama light revealed darkness people usa stronger'
 'ebonics pa huge con job blacks liberals running america decades actually lack lan'
 'conormcgregor runs mouth endlessly like rondarousey lost yet ppl take glee losing perhaps'
 'neogaf today another attack japanese chinese claims racist'
 'btw gandhi india adores witho'
 'constantly let lack fat male role models especially emojis'
 'jonjo shelvey man ever racially abuse someone without saying anything kickitout shelvey'
 'council staff instructed anc go slow make da look bad exposed weeks ago'
 '2016 year' 'clickbait much really shitty headline change'
 'video men women bed without clothes candid videos sexting'
 'dearfellowwhitepeople 2017 let realize u 2 w

In [25]:
dev['hash'] = dev['tweet'].map(hash) # hash speeds up lookup
dev[dev['hash'] == hash(X_dev[idx_error1][32])]

Unnamed: 0,id,label,tweet,length,count,hash
6863,6871,0,trump campaign female judges could biased must...,57,9,-2224309791933809150


### Let's look at the prediction probablity 

In [26]:
#predicting probablity of the dev dataset
prob=clf.predict_proba(X_dev_vect)
prob

array([[9.99999808e-01, 1.91854424e-07],
       [9.99999997e-01, 3.34945812e-09],
       [9.99783205e-01, 2.16795199e-04],
       ...,
       [9.99868186e-01, 1.31814486e-04],
       [9.99999943e-01, 5.71438941e-08],
       [1.00000000e+00, 2.30355653e-11]])

In [27]:
#saving it as a dataframe
prob=pd.DataFrame(prob)
prob

Unnamed: 0,0,1
0,1.000000,1.918544e-07
1,1.000000,3.349458e-09
2,0.999783,2.167952e-04
3,0.999688,3.121197e-04
4,0.998074,1.925705e-03
...,...,...
4785,0.999987,1.285223e-05
4786,0.000058,9.999423e-01
4787,0.999868,1.318145e-04
4788,1.000000,5.714389e-08


In [28]:
#seting the threshold to 20%-80%
prob[(prob[0].between(0.2, 0.8))]

Unnamed: 0,0,1
40,0.235848,0.764152
98,0.482281,0.517719
172,0.249059,0.750941
180,0.492476,0.507524
210,0.552824,0.447176
...,...,...
4653,0.705625,0.294375
4683,0.254677,0.745323
4688,0.792386,0.207614
4703,0.767220,0.232780


In [29]:
prob1=prob[(prob[0].between(0.2, 0.8))].reset_index(drop=False)
prob1 

Unnamed: 0,index,0,1
0,40,0.235848,0.764152
1,98,0.482281,0.517719
2,172,0.249059,0.750941
3,180,0.492476,0.507524
4,210,0.552824,0.447176
...,...,...,...
131,4653,0.705625,0.294375
132,4683,0.254677,0.745323
133,4688,0.792386,0.207614
134,4703,0.767220,0.232780


In [30]:
tweet_list=prob1['index'].unique()
tweet_list

array([  40,   98,  172,  180,  210,  221,  222,  231,  318,  325,  333,
        379,  422,  557,  624,  651,  654,  670,  671,  855,  923,  932,
        935,  961,  963,  974,  981, 1038, 1068, 1099, 1174, 1232, 1381,
       1455, 1495, 1497, 1569, 1667, 1684, 1706, 1709, 1723, 1839, 1841,
       1857, 1909, 1912, 1931, 1940, 1945, 1969, 2027, 2054, 2089, 2095,
       2112, 2131, 2182, 2192, 2196, 2229, 2256, 2319, 2351, 2475, 2481,
       2489, 2569, 2597, 2609, 2671, 2698, 2724, 2759, 2776, 2780, 2793,
       2804, 2810, 2816, 2823, 2846, 2851, 2886, 2930, 2993, 2999, 3075,
       3079, 3107, 3114, 3162, 3168, 3185, 3222, 3255, 3257, 3274, 3326,
       3345, 3353, 3369, 3386, 3412, 3439, 3452, 3455, 3471, 3506, 3524,
       3654, 3661, 3720, 3803, 3829, 3840, 3845, 3897, 3902, 3929, 3938,
       4083, 4136, 4305, 4368, 4426, 4516, 4525, 4548, 4636, 4643, 4653,
       4683, 4688, 4703, 4728])

In [31]:
dev.iloc[tweet_list].shape

(136, 6)

In [32]:
pd.set_option('display.max_row', None)

In [33]:
dev.iloc[tweet_list]

Unnamed: 0,id,label,tweet,length,count,hash
3058,3060,1,beware snowball effect blackman pa 2,36,6,3181021304848810750
15341,15360,1,saw vid dude spoke arabic cell delta got throw...,66,12,-4478650857959348733
15296,15315,1,ha good riddance blacklivesmatter,33,4,-4428535383774492374
23787,23814,0,trumpuniversity video leaked neverump shysteru...,83,8,-4130562399996542792
8910,8920,0,jo cox attack sign politicians rethink tactics...,98,13,6067171499209098013
31312,31344,1,malia shake ass fine pass blunt,31,6,20268119049803256
29590,29621,0,ratings liar,12,2,-2256569978970610808
12528,12543,0,hell right fire hire,20,4,786212475720502775
3820,3823,0,treasures wisconsin libraries museums historic...,71,8,-2149292836390734289
28060,28089,0,getting appropriate response justicereform lov...,58,5,1352711997545609269


In [34]:
dev.iloc[tweet_list]['tweet']

3058                  beware snowball effect blackman pa 2
15341    saw vid dude spoke arabic cell delta got throw...
15296                    ha good riddance blacklivesmatter
23787    trumpuniversity video leaked neverump shysteru...
8910     jo cox attack sign politicians rethink tactics...
31312                      malia shake ass fine pass blunt
29590                                         ratings liar
12528                                 hell right fire hire
3820     treasures wisconsin libraries museums historic...
28060    getting appropriate response justicereform lov...
499      thats problem one police officer need bigger g...
28022                        btw gandhi india adores witho
11830              brainer gordie howe half billion donate
15593                         kudos compliant itaxnirahisi
25129    12 31 3 pres emperor sans clothes neveoolate s...
9872     great see quebec establishing provincial inqui...
24919                         badger new kennel toy badg

In [35]:
prob.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4790 entries, 0 to 4789
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       4790 non-null   float64
 1   1       4790 non-null   float64
dtypes: float64(2)
memory usage: 75.0 KB


In [36]:
prob.shape

(4790, 2)

In [37]:
len(prob)

4790

In [38]:
X_dev

array(['tried wedding suit 1st time today much',
       'sent father day gifts today fathersdaygiftideas mydadmyhero fathersday',
       'bangkok got 7 shopping cantwaittoseegot 7 omg', ...,
       'hard believe live world 700 pa hillary thread trending tony hawk',
       'father day love u papa',
       'tupacs bihday bihday well play thugs mansion tupac bihday tupac shakur yr dearly missed'],
      dtype=object)

In [39]:
dev

Unnamed: 0,id,label,tweet,length,count,hash
29583,29614,0,tried wedding suit 1st time today much,38,7,6529283096253199529
2153,2155,0,sent father day gifts today fathersdaygiftidea...,70,8,-889216810102385225
18511,18535,0,bangkok got 7 shopping cantwaittoseegot 7 omg,45,7,-3859411687113833849
1668,1669,0,lamp head see fragile saw li,28,6,1463817198552713395
10092,10103,0,3 weeks till interrailing,25,4,8605232540353402645
8721,8731,0,ugh day things going feel easier,32,6,-7140175908776842827
17728,17752,0,day nutella shit girls instapic,31,5,4186308088417044969
28172,28201,0,doubt always see famous actor robe pattinson p...,59,9,857523652690725161
624,625,0,people like already forgot nah new names made ...,66,11,3944206908367683971
31710,31742,0,dont want world like homophonia phobie asshole...,77,10,-514105890235239854


In [40]:
dev[dev['id']==4692]

Unnamed: 0,id,label,tweet,length,count,hash
4688,4692,0,ariana grande amazing wait follower life,40,6,-3178694523687911344


### Hyperparameter tuning using GridSearchCV, TfidfVectorizer, RandomizedSearchCV

In [41]:
#setting up a PredefinedSplit

X_train = train['tweet'].values
y_train = train['label'].values

X_dev = dev['tweet'].values
y_dev = dev['label'].values

X = np.hstack([X_train, X_dev])
y = np.hstack([y_train, y_dev])

In [42]:
y_train.shape

(27140,)

In [43]:
y_dev.shape

(4790,)

In [44]:
#assign 0 to items that are in dev and -1 for the rest
split_train_dev= np.zeros(shape=y.shape)
split_train_dev[:y_train.shape[0]] = -1
pd.value_counts(split_train_dev)

-1.0    27140
 0.0     4790
dtype: int64

### GridSearchCV

In [45]:
from sklearn.model_selection import PredefinedSplit, GridSearchCV

ps = PredefinedSplit(split_train_dev)

In [46]:
from sklearn.pipeline import Pipeline

#we had used this code before
vect  = CountVectorizer(
    token_pattern = r"[a-z]+", 
    ngram_range = (1,1),
    lowercase = True,
    min_df = 1,
    max_df = 1.0
)

clf = MultinomialNB()

pipe = Pipeline([("vect", vect ), ("clf", clf)])
pipe.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer(token_pattern='[a-z]+')),
                ('clf', MultinomialNB())])

In [47]:
#looking at the first 10 vectors
pipe.predict(X_dev[:10])


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [48]:
#defining a parameter range to charactrize using GridSearchCV and Pipeline

param_grid = {
    'vect__ngram_range':[(1,1), (1,2), (1,3)],
    'vect__min_df':[1, 2, 5, 10, 20],
    'clf__fit_prior':[False, True]
}

#GridSearchCV keeps track of the parameters and F-1 score, reports best performing combination
#note that each iteration overwrites the previous configuration 

gs = GridSearchCV(pipe, param_grid, scoring='f1', n_jobs=6, cv=ps, verbose=2)
gs.fit(X, y)
print(gs.best_params_)
print(gs.best_score_)

Fitting 1 folds for each of 30 candidates, totalling 30 fits
{'clf__fit_prior': False, 'vect__min_df': 1, 'vect__ngram_range': (1, 2)}
0.6816720257234727


In [49]:
y_pred = gs.best_estimator_.predict(X_dev)
confusion_matrix(y_dev, y_pred)

#this confusion matrix is from the cells above
                # TP 4386 -- FP 41  #positives are non_offensive
                # FN 173  -- TN 190  #negatives are offensive
    
#in the matrix below, we have 11 FN and 88 FP and seems that the model performs better

array([[4339,   88],
       [  11,  352]])

In [50]:
#the model recalls offensive tweets 97%

print(classification_report(y_dev, y_pred))
np.mean(y_dev==y_pred)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      4427
           1       0.80      0.97      0.88       363

    accuracy                           0.98      4790
   macro avg       0.90      0.97      0.93      4790
weighted avg       0.98      0.98      0.98      4790



0.9793319415448852

### TfidfVectorizer 

#### Tfidf reduces the weight of high frequency words and increase the weight of less frequent words (inverse document frequency, IDF)

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect_1 = CountVectorizer(
    token_pattern = r"[a-z]+", 
    ngram_range = (1,1),
    lowercase = True,
    min_df = 1,
    max_df = 1.0
)

vect_2 = TfidfVectorizer(
    token_pattern = r"[a-z]+", 
    ngram_range = (1,1),
    lowercase = True,
    min_df = 1,
    max_df = 1.0
)

clf = MultinomialNB()

pipe = Pipeline([("vect", vect), ("clf", clf)])

In [None]:
param_grid = {
    'vect':[vect_1, vect_2],
    'vect__ngram_range':[(1,1), (1,2), (1,3)],
    'vect__min_df':[1, 2, 5, 10, 20],
    'clf__fit_prior':[False, True]
}

gs = GridSearchCV(pipe, param_grid, scoring='f1', n_jobs=6, cv=ps, verbose=2)
gs.fit(X, y)
print(gs.best_params_)
print(gs.best_score_)

Fitting 1 folds for each of 60 candidates, totalling 60 fits


In [None]:
y_pred = gs.best_estimator_.predict(X_dev)
confusion_matrix(y_dev, y_pred)

     #Naive Bayes     #TP 4386 -- FP 41  #positives are non_offensive
                      # FN 173 -- TN 190  #negatives are offensive
        
    #GridSearchCV     #TP 4339 -- FP 88  #positives are non_offensive
                      # FN 11  -- TN 352  #negatives are offensive        

In [None]:
#the model recalls offensive tweets 97% of times

print(classification_report(y_dev, y_pred))
np.mean(y_dev==y_pred)

### RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectPercentile, chi2

vect_1 = CountVectorizer(
    token_pattern = r"[a-z]+", 
    ngram_range = (1,1),
    lowercase = True,
    min_df = 1,
    max_df = 1.0
)

vect_2 = TfidfVectorizer(
    token_pattern = r"[a-z]+", 
    ngram_range = (1,1),
    lowercase = True,
    min_df = 1,
    max_df = 1.0
)

select = SelectPercentile(score_func=chi2)

clf = MultinomialNB()

pipe = Pipeline([("vect", vect), ("select", select), ("clf", clf)])

In [None]:
param_grid = {
    'vect':[vect_1, vect_2],
    'vect__ngram_range':[(1,1), (1,2), (1,3)],
    'vect__min_df':[1, 2, 5, 10, 20],
    'select__percentile':[1, 2, 5, 10, 20, 50],
    'clf__fit_prior':[False, True]
}

rs = RandomizedSearchCV(pipe, param_grid, n_iter=30, scoring='f1', n_jobs=6, cv=ps, verbose=2)
rs.fit(X, y)
print(rs.best_params_)
print(rs.best_score_)

In [None]:
y_pred = rs.best_estimator_.predict(X_dev)
confusion_matrix(y_dev, y_pred)


     #Naive Bayes        # TP 4386 -- FP 41  #positives are non_offensive
                         # FN 173   -- TN 190  #negatives are offensive
        
    #GridSearchCV        # TP 4386 -- FP 88  #positives are non_offensive
                         # FN 11   -- TN 352  #negatives are offensive  
        
    #RandomizedSearchCV  # TP 4319 -- FP 108  #positives are non_offensive
                         # FN 6    -- TN 357  #negatives are offensive 

In [None]:
#the model recalls offensive tweets 98% of times

print(classification_report(y_dev, y_pred))
np.mean(y_dev==y_pred)