## Common Imports

In [12]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
import re
import glob

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\scott\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
metrics_local = pd.read_pickle('results/metrics_local.pickle')
print(metrics_local.shape)
metrics_local = metrics_local[~metrics_local.tf.isin(['doc2vec exc_calculated','doc2vec inc_calculated'])]
print(metrics_local.shape)

(50, 12)
(41, 12)


metrics_local.to_pickle('results/metrics_local.pickle') #pickle the metrics

## Load the data

In [14]:
train_data = pd.read_pickle('pickle_files/train_data_formatted.pickle')
eval_data = pd.read_pickle('pickle_files/eval_data_formatted.pickle')
all_data = train_data.append(eval_data)
all_data['Tweet ID'] = all_data['Tweet ID'].astype(str) #change the ID to str to avoid potential issues during aggregation
all_data = all_data.reset_index(drop=True)

calculated_features = pd.read_pickle('features/count_features/count_features.pickle') #load calculated features
calculated_features = calculated_features.pivot(index='tweet_ids', columns='feature', values='value').reset_index() #pivot
calc_feats = ['ave_chars_token', 'caps_count', 'followers_count', 'following_count', #drop columns
             'mention_count', 'neg_sent', 'neu_sent', 'pos_sent',
             'posted_tweets_count', 'punctuation_count', 'quotes_count', 'url_count',
             'tweet_ids']
calculated_features = calculated_features[calc_feats]
calculated_features['tweet_ids'] = calculated_features['tweet_ids'].astype(str) #change the ID to str to avoid potential issues during aggregation

print(all_data.shape)
print(calculated_features.shape)

(5104, 16)
(5104, 13)


## Load Stopwords

In [15]:
stop = stopwords.words('english') #get stopwords from NLTK
keep = ['not'] #Waseem/Hovy did not use "not" as a stopword
stop = [word for word in stop if word not in keep] #Waseem/Hovy did not use "not" as a stopword

## Lowercase the data

In [16]:
all_data['Tweet_original'] = all_data.Tweet.copy() #keep a copy of the original tweet text
all_data['Tweet'] = all_data['Tweet'].str.lower() #lowercase the text 

## Remove punctuation, usernames, hashtags, URLs

In [17]:
all_data['Tweet'] = all_data['Tweet'].fillna('')
p = re.compile(r'[^\w\s]+')
all_data['Tweet'] = [p.sub('', x) for x in all_data['Tweet'].tolist()] #remove the punctuation
for i in all_data.index:
    #print(i)
    #all_data.loc[i,'Tweet'] =re.sub('[^A-Za-z0-9]+',"",all_data.loc[i,'Tweet'])
    all_data.loc[i,'Tweet'] =re.sub("@[A-Za-z0-9_/:().]+",  "", all_data.loc[i,'Tweet'])
    all_data.loc[i,'Tweet'] =re.sub("http[A-Za-z0-9_/:().]+",  "", all_data.loc[i,'Tweet'])
    all_data.loc[i,'Tweet'] =re.sub("#[A-Za-z0-9_/:().]+",  "", all_data.loc[i,'Tweet'])
print(len(np.unique(all_data['Tweet ID']))) #for convenience

5104


## Remove stopwords

In [18]:
all_data['Tweet'] = all_data['Tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])) #remove stopwords
print(len(np.unique(all_data['Tweet ID']))) #for convenience

5104


## Tokenize

In [19]:
all_data['Tweet'] = all_data['Tweet'].apply(word_tokenize) #tokenize the text
print(len(np.unique(all_data['Tweet ID']))) #for convenience

5104


## Build tagged Gensim documents

In [20]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
tweet_docs = [TaggedDocument(doc, [i]) for i, doc in zip(all_data['Tweet ID'],all_data['Tweet'])]

## Build Gensim model

In [21]:
import gensim
vec_size = 300
model = gensim.models.doc2vec.Doc2Vec(vector_size=vec_size, min_count=2, epochs=100)
model.build_vocab(tweet_docs)

  "C extension not loaded, training will be slow. "


## Train model

In [22]:
%time model.train(tweet_docs, total_examples=model.corpus_count, epochs=model.epochs)

Wall time: 21min 10s


## Extract the vectors

In [23]:
tweet_ids = [] #empty array to hold tweet ids
tweet_vectors = [] #empty array to hold document vectors
for t_id in all_data['Tweet ID']: #zip through tweets
    vector = model.docvecs[t_id] #get vector for each tweet
    tweet_ids.append(t_id)
    tweet_vectors.append(vector)



## Create Train and Eval

In [24]:
from sklearn.preprocessing import scale
tweet_df = pd.DataFrame(data=tweet_vectors,                  #convert the document vectors into a pd dataframe
          index=np.array(range(0, len(tweet_vectors))),
          columns=np.array(range(0, vec_size)))
feature_cols = tweet_df.columns.tolist()
tweet_df['tweet_id'] = tweet_ids
tweet_df['class'] = all_data['class']

#merge in the calculated features
tweet_df = pd.merge(tweet_df, calculated_features, left_on='tweet_id',right_on='tweet_ids')
tweet_df = tweet_df.drop(['tweet_ids'], axis=1)

#create train and eval
tweet_df_train = tweet_df[tweet_df['tweet_id'].isin(train_data['Tweet ID'].astype(str))]
tweet_df_eval = tweet_df[tweet_df['tweet_id'].isin(eval_data['Tweet ID'].astype(str))]

tweet_classes = tweet_df_eval.loc[:,['tweet_id','class_column']]

calc_feats.remove('tweet_ids') #drop 'tweet_ids from list of calculated features
feature_cols = feature_cols + calc_feats #add calc feats to list of included columns

x_train = tweet_df_train[feature_cols]
print(x_train.shape)
x_train_nc = x_train.drop(calc_feats, axis=1)
print(x_train_nc.shape)

x_train = scale(x_train)
x_train_nc = scale(x_train_nc)
y_train = tweet_df_train['class']

x_eval = tweet_df_eval[feature_cols]
print(x_eval.shape)
x_eval_nc = x_eval.drop(calc_feats, axis=1)
print(x_eval_nc.shape)

x_eval = scale(x_eval)
x_eval_nc = scale(x_eval_nc)
y_eval = tweet_df_eval['class']



(4083, 312)
(4083, 300)
(1021, 312)
(1021, 300)


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


## Create classifiers and import metrics

In [25]:
from sklearn.linear_model import LogisticRegression #import lr
from sklearn.svm import SVC #import svm
from sklearn.tree import DecisionTreeClassifier #import dt
from sklearn.ensemble import RandomForestClassifier #import rf
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, auc, roc_curve, accuracy_score #metrics
from sklearn.model_selection import GridSearchCV #grid search
log_clf = LogisticRegression()
svc_clf = SVC()
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier()

## Function for fine tuning and evaluating classifiers

In [26]:
def search_grid(classifier,model, x_train, y_train, x_eval, y_eval, tweet_classes):
    
    if model == 'lr': #if using logisitic regression
        param_grid = [{'random_state':[42],
               'C':[0.05,0.1,0.5,1],
               'penalty':['l1','l2']}]
        
    if model == 'dt': #if using decision tree
        param_grid = [{'random_state':[42],
                       'criterion':['gini','entropy']}]
        
    if model == 'rf': #if using random forest
        param_grid = [{'random_state':[42],
                       'criterion':['gini','entropy']}] 
    
    if model == 'svm': #if using svm
        param_grid = [{'random_state':[42],
                   'C':[0.05,0.1,1,10], 
                   'kernel':['linear','rbf']}]
    
   
    param_grid = param_grid
    grid_search = GridSearchCV(classifier, param_grid, cv=10, scoring='recall') #grid search using 10-folds cross validation
    grid_search.fit(x_train, y_train) #fir grid search
    print("")
    print('Best parameters')
    best_parameters = grid_search.best_params_
    print(best_parameters) #print best parameters from grid search
    print('Best grid search score = ',grid_search.best_score_) #print best grid search score
    print("")
    print('Evaluation data scores')
    tuned_clf = grid_search.best_estimator_ #build model using best parameters
    tuned_clf_pred = tuned_clf.predict(x_eval) #predict using evaluation data with best parameters
    conf_matrix = confusion_matrix(y_eval,tuned_clf_pred) #build confusion matrix
    precision = precision_score(y_eval,tuned_clf_pred) #calculate precision
    recall = recall_score(y_eval,tuned_clf_pred) #calculate recall
    f1 = f1_score(y_eval,tuned_clf_pred) #calculate f1
    fpr, tpr, thresholds = roc_curve(y_eval,tuned_clf_pred)
    auc_score = auc(fpr, tpr) #calculate auc
    accuracy = accuracy_score(y_eval,tuned_clf_pred) #calculate accuracy
    class_eval = tweet_classes.copy()
    class_eval['pred'] = tuned_clf_pred
    class_eval = class_eval.drop('class_column', axis=1) #join predictions onto actuals
    print(conf_matrix)
    print('precision = ' + str(precision))
    print('recall = ' + str(recall))
    print('f1 = ' + str(f1))
    print('auc = ' + str(auc_score))
    print('accuracy = ' + str(accuracy))
    
    return(best_parameters, conf_matrix, precision, recall, f1, auc_score, accuracy, class_eval) #return metrics and pred vs actuals for each tweet

## Run the "search_grid()" function for lr, dt, rf, and svm

In [27]:
from datetime import datetime
current = datetime.now() #for checking duration

tf = []     #initialise empty vectors to hold results
name = []
bp = []
tn = []
fp = []
fn = []
tp = []
p = []
r = []
f_1 = []
auc_sc = []
acc = []

tweet_classes = tweet_df_eval.loc[:,['tweet_id','class_column']]

classifiers = [log_clf, dt_clf, rf_clf, svc_clf] #the classifiers that are to be tested
models = ['lr','dt','rf','svm'] #labels for identifying the results

representation = ['inc_calculated','exc_calculated']
train_files_x = [x_train,x_train_nc]
train_files_y = [y_train,y_train]

eval_files_x = [x_eval,x_eval_nc]
eval_files_y = [y_eval,y_eval]

i=1 #a counter to be used for checking loop number
for classifier, model in zip(classifiers, models): #zip through the classifiers and model names
    for rep,tx,ty,ex,ey in zip(representation,train_files_x, train_files_y, eval_files_x, eval_files_y): #zip through the training and evaluation combos
        #execute the search_grid() function
        best_parameters, conf_matrix, precision, recall, f1, auc_score, accuracy, class_eval = search_grid(classifier,model, tx,ty,ex,ey, tweet_classes)
        
        #append the latest results to the vectors
        tf = np.append(tf,'doc2vec ' + rep)
        name = np.append(name,model)
        b = ';'.join('{} {}'.format(key, val) for key, val in best_parameters.items())
        bp = np.append(bp,b)
        tn = np.append(tn,conf_matrix[0][0])
        fp = np.append(fp,conf_matrix[0][1])
        fn = np.append(fn,conf_matrix[1][0])
        tp = np.append(tp,conf_matrix[1][1])
        p = np.append(p,precision)
        r = np.append(r,recall)
        f_1 = np.append(f_1,f1)
        auc_sc = np.append(auc_sc,auc_score)
        acc = np.append(acc,accuracy)
        
        #col = train_file+'_'+model #build a column name
        #class_eval.columns = ['tweet_id',col] #rename the columns
        class_eval['model'] = model
        class_eval['file'] = 'doc2vec ' + rep
        if i==1: #if we are on the first iteration of the loop
            df = class_eval.copy()
        else: #if we are not on the first iteration f the loop
            #df = pd.merge(df, class_eval, on='tweet_id')
            df = df.append(class_eval)
        
        i = i+1 #increment i

print('time taken = ',datetime.now() - current) #print the time taken

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)



Best parameters
{'C': 1, 'penalty': 'l2', 'random_state': 42}
Best grid search score =  0.7301771887187023

Evaluation data scores
[[800  31]
 [ 43 147]]
precision = 0.8258426966292135
recall = 0.7736842105263158
f1 = 0.7989130434782608
auc = 0.8681898790297041
accuracy = 0.9275220372184133

Best parameters
{'C': 1, 'penalty': 'l2', 'random_state': 42}
Best grid search score =  0.6008386673637409

Evaluation data scores
[[798  33]
 [ 78 112]]
precision = 0.7724137931034483
recall = 0.5894736842105263
f1 = 0.6686567164179105
auc = 0.7748812464373931
accuracy = 0.89128305582762

Best parameters
{'criterion': 'entropy', 'random_state': 42}
Best grid search score =  0.6300414829782023

Evaluation data scores
[[768  63]
 [ 72 118]]
precision = 0.6519337016574586
recall = 0.6210526315789474
f1 = 0.6361185983827493
auc = 0.7726201786053583
accuracy = 0.8677766895200784

Best parameters
{'criterion': 'entropy', 'random_state': 42}
Best grid search score =  0.5546204713728594

Evaluation data 

## Take a look at the results so far

In [28]:
classifications = pd.DataFrame({'tf':tf, #create a dataframe to hold the metrics
                                'name':name,
                               'bp':bp,
                               'tn':tn,
                               'fp':fp,
                               'fn':fn,
                               'tp':tp,
                               'p':p,
                               'r':r,
                               'f_1':f_1,
                               'auc_sc':auc_sc,
                               'acc':acc})

classifications.sort_values(by='f_1', ascending=False)

Unnamed: 0,tf,name,bp,tn,fp,fn,tp,p,r,f_1,auc_sc,acc
0,doc2vec inc_calculated,lr,C 1;penalty l2;random_state 42,800.0,31.0,43.0,147.0,0.825843,0.773684,0.798913,0.86819,0.927522
6,doc2vec inc_calculated,svm,C 10;kernel linear;random_state 42,776.0,55.0,41.0,149.0,0.730392,0.784211,0.756345,0.859013,0.905975
7,doc2vec exc_calculated,svm,C 10;kernel linear;random_state 42,784.0,47.0,69.0,121.0,0.720238,0.636842,0.675978,0.790142,0.886386
1,doc2vec exc_calculated,lr,C 1;penalty l2;random_state 42,798.0,33.0,78.0,112.0,0.772414,0.589474,0.668657,0.774881,0.891283
2,doc2vec inc_calculated,dt,criterion entropy;random_state 42,768.0,63.0,72.0,118.0,0.651934,0.621053,0.636119,0.77262,0.867777
4,doc2vec inc_calculated,rf,criterion entropy;random_state 42,816.0,15.0,109.0,81.0,0.84375,0.426316,0.566434,0.704133,0.87855
5,doc2vec exc_calculated,rf,criterion entropy;random_state 42,821.0,10.0,111.0,79.0,0.88764,0.415789,0.566308,0.701878,0.881489
3,doc2vec exc_calculated,dt,criterion entropy;random_state 42,746.0,85.0,92.0,98.0,0.535519,0.515789,0.525469,0.706752,0.826641


## Ensemble classifier

In [29]:
from sklearn.ensemble import VotingClassifier
log_clf = LogisticRegression(penalty='l2',C=1, random_state=42) #logistic regression with best hyperparameters
svc_clf = SVC(C=10, kernel='linear', probability = True, random_state=42) #svm with best hyperparameters
dt_clf = DecisionTreeClassifier(criterion='entropy', random_state=42) #random forest with best hyperparameters
rf_clf = RandomForestClassifier(criterion='entropy', random_state=42) #random forest with best hyperparameters

#create the ensemble
e_clf = VotingClassifier(estimators=[('lr', log_clf), ('svm', svc_clf), ('rf', rf_clf), ('dt', dt_clf)],
                         voting='soft', weights=[1, 1, 1, 1])

e_clf = e_clf.fit(x_train, y_train) #fit the ensemble

e_clf_pred = e_clf.predict(x_eval) #predict using evaluation data with best parameters
conf_matrix = confusion_matrix(y_eval,e_clf_pred) #build confusion matrix
precision = precision_score(y_eval,e_clf_pred) #calculate precision
recall = recall_score(y_eval,e_clf_pred) #calculate recall
f1 = f1_score(y_eval,e_clf_pred) #calculate f1
fpr, tpr, thresholds = roc_curve(y_eval,e_clf_pred)
auc_score = auc(fpr, tpr) #calculate auc
accuracy = accuracy_score(y_eval,e_clf_pred) #calculate accuracy
class_eval = tweet_classes.copy()
class_eval['pred'] = e_clf_pred
class_eval = class_eval.drop('class_column', axis=1) #join predictions onto actuals
print(conf_matrix)
print('precision = ' + str(precision))
print('recall = ' + str(recall))
print('f1 = ' + str(f1))
print('auc = ' + str(auc_score))
print('accuracy = ' + str(accuracy))

#append the latest results to the vectors
tf = np.append(tf,'doc2vec inc_calculated')
name = np.append(name,'ensemble (lr,svc,dt,rf)')
bp = np.append(bp,'ensemble')
tn = np.append(tn,conf_matrix[0][0])
fp = np.append(fp,conf_matrix[0][1])
fn = np.append(fn,conf_matrix[1][0])
tp = np.append(tp,conf_matrix[1][1])
p = np.append(p,precision)
r = np.append(r,recall)
f_1 = np.append(f_1,f1)
auc_sc = np.append(auc_sc,auc_score)
acc = np.append(acc,accuracy)

class_eval['model'] = 'ensemble (lr,svc,dt,rf)'
class_eval['file'] = 'doc2vec inc_calculated'

df = df.append(class_eval) #merge the latest predictions for each tweet using this classifier
        

[[812  19]
 [ 59 131]]
precision = 0.8733333333333333
recall = 0.6894736842105263
f1 = 0.7705882352941177
auc = 0.8333048324783078
accuracy = 0.9236043095004897


  if diff:


## Append results to existing results

In [30]:
classifications = pd.DataFrame({'tf':tf, #create a dataframe to hold the metrics
                                'name':name,
                               'bp':bp,
                               'tn':tn,
                               'fp':fp,
                               'fn':fn,
                               'tp':tp,
                               'p':p,
                               'r':r,
                               'f_1':f_1,
                               'auc_sc':auc_sc,
                               'acc':acc})

classifications.sort_values(by='f_1', ascending=False)

metrics_local = pd.read_pickle('results/metrics_local.pickle')
metrics_local = metrics_local.append(classifications)
metrics_local.to_pickle('results/metrics_local.pickle') #pickle the metrics

preds_per_tweet_local = pd.read_pickle('results/preds_per_tweet_local.pickle')
preds_per_tweet_local = preds_per_tweet_local.append(df)
preds_per_tweet_local.to_pickle('results/preds_per_tweet_local.pickle') #pickle the results of actual vs predicted for each tweet
