## Common Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
import re
import glob

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\scott\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load the data

In [2]:
train_data = pd.read_pickle('pickle_files/train_data_formatted.pickle')
eval_data = pd.read_pickle('pickle_files/eval_data_formatted.pickle')
all_data = train_data.append(eval_data)
all_data['Tweet ID'] = all_data['Tweet ID'].astype(str) #change the ID to str to avoid potential issues during aggregation
all_data = all_data.reset_index(drop=True)

## Load Stopwords

In [3]:
stop = stopwords.words('english') #get stopwords from NLTK
keep = ['not'] #Waseem/Hovy did not use "not" as a stopword
stop = [word for word in stop if word not in keep] #Waseem/Hovy did not use "not" as a stopword

## Lowercase the data

In [4]:
all_data['Tweet_original'] = all_data.Tweet.copy() #keep a copy of the original tweet text
all_data['Tweet'] = all_data['Tweet'].str.lower() #lowercase the text 

## Remove punctuation, usernames, hashtags, URLs

In [5]:
all_data['Tweet'] = all_data['Tweet'].fillna('')
p = re.compile(r'[^\w\s]+')
all_data['Tweet'] = [p.sub('', x) for x in all_data['Tweet'].tolist()] #remove the punctuation
for i in all_data.index:
    #print(i)
    #all_data.loc[i,'Tweet'] =re.sub('[^A-Za-z0-9]+',"",all_data.loc[i,'Tweet'])
    all_data.loc[i,'Tweet'] =re.sub("@[A-Za-z0-9_/:().]+",  "", all_data.loc[i,'Tweet'])
    all_data.loc[i,'Tweet'] =re.sub("http[A-Za-z0-9_/:().]+",  "", all_data.loc[i,'Tweet'])
    all_data.loc[i,'Tweet'] =re.sub("#[A-Za-z0-9_/:().]+",  "", all_data.loc[i,'Tweet'])
print(len(np.unique(all_data['Tweet ID']))) #for convenience

5104


## Remove stopwords

In [6]:
all_data['Tweet'] = all_data['Tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])) #remove stopwords
print(len(np.unique(all_data['Tweet ID']))) #for convenience

5104


## Tokenize

In [7]:
all_data['Tweet'] = all_data['Tweet'].apply(word_tokenize) #tokenize the text
print(len(np.unique(all_data['Tweet ID']))) #for convenience

5104


## Build tagged Gensim documents

In [8]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
tweet_docs = [TaggedDocument(doc, [i]) for i, doc in zip(all_data['Tweet ID'],all_data['Tweet'])]

## Build Gensim model

In [9]:
import gensim
vec_size = 200
model = gensim.models.doc2vec.Doc2Vec(vector_size=vec_size, min_count=2, epochs=100)
model.build_vocab(tweet_docs)

  "C extension not loaded, training will be slow. "


## Train model

In [10]:
%time model.train(tweet_docs, total_examples=model.corpus_count, epochs=model.epochs)

Wall time: 26min 43s


## Extract the vectors

In [11]:
tweet_ids = [] #empty array to hold tweet ids
tweet_vectors = [] #empty array to hold document vectors
for t_id in all_data['Tweet ID']: #zip through tweets
    vector = model.docvecs[t_id] #get vector for each tweet
    tweet_ids.append(t_id)
    tweet_vectors.append(vector)



## Create Train and Eval

In [12]:
from sklearn.preprocessing import scale
tweet_df = pd.DataFrame(data=tweet_vectors,
          index=np.array(range(0, len(tweet_vectors))),
          columns=np.array(range(0, vec_size)))
feature_cols = tweet_df.columns
tweet_df['tweet_id'] = tweet_ids
tweet_df['class'] = all_data['class']
tweet_df_train = tweet_df[tweet_df['tweet_id'].isin(train_data['Tweet ID'].astype(str))]
tweet_df_eval = tweet_df[tweet_df['tweet_id'].isin(eval_data['Tweet ID'].astype(str))]

x_train = scale(tweet_df_train[feature_cols])
y_train = tweet_df_train['class']

x_eval = scale(tweet_df_eval[feature_cols])
y_eval = tweet_df_eval['class']



## Grid search, training, and evaluation

In [13]:
from sklearn.linear_model import LogisticRegression #import lr
from sklearn.svm import SVC #import svm
from sklearn.tree import DecisionTreeClassifier #import dt
from sklearn.ensemble import RandomForestClassifier #import rf
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, auc, roc_curve, accuracy_score #metrics
from sklearn.model_selection import GridSearchCV #grid search
log_clf = LogisticRegression()
svc_clf = SVC()
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier()

param_grid = [{'random_state':[42],
       'C':[0.05,0.1,0.5,1],
       'penalty':['l1','l2']}]

classifier = log_clf

param_grid = param_grid
grid_search = GridSearchCV(classifier, param_grid, cv=10, scoring='recall') #grid search using 10-folds cross validation
grid_search.fit(x_train, y_train) #fir grid search
print("")
print('Best parameters')
best_parameters = grid_search.best_params_
print(best_parameters) #print best parameters from grid search
print('Best grid search score = ',grid_search.best_score_) #print best grid search score
print("")
print('Evaluation data scores')
tuned_clf = grid_search.best_estimator_ #build model using best parameters
tuned_clf_pred = tuned_clf.predict(x_eval) #predict using evaluation data with best parameters
conf_matrix = confusion_matrix(y_eval,tuned_clf_pred) #build confusion matrix
precision = precision_score(y_eval,tuned_clf_pred) #calculate precision
recall = recall_score(y_eval,tuned_clf_pred) #calculate recall
f1 = f1_score(y_eval,tuned_clf_pred) #calculate f1
fpr, tpr, thresholds = roc_curve(y_eval,tuned_clf_pred)
auc_score = auc(fpr, tpr) #calculate auc
accuracy = accuracy_score(y_eval,tuned_clf_pred) #calculate accuracy
#class_eval['pred'] = tuned_clf_pred
#class_eval = class_eval.drop('class_column', axis=1) #join predictions onto actuals
print(conf_matrix)
print('precision = ' + str(precision))
print('recall = ' + str(recall))
print('f1 = ' + str(f1))
print('auc = ' + str(auc_score))
print('accuracy = ' + str(accuracy))


Best parameters
{'C': 1, 'penalty': 'l2', 'random_state': 42}
Best grid search score =  0.5977728951185969

Evaluation data scores
[[796  35]
 [ 83 107]]
precision = 0.7535211267605634
recall = 0.5631578947368421
f1 = 0.6445783132530121
auc = 0.7605199822661347
accuracy = 0.8844270323212536


## Build dataframes of results

In [27]:
b = ';'.join('{} {}'.format(key, val) for key, val in best_parameters.items())

tf = ['doc2vec']     #initialise empty vectors to hold results
name = ['lr']
bp = [b]
tn = [conf_matrix[0][0]]
fp = [conf_matrix[0][1]]
fn = [conf_matrix[1][0]]
tp = [conf_matrix[1][1]]
p = [precision]
r = [recall]
f_1 = [f1]
auc_sc = [auc_score]
acc = [accuracy]

classifications = pd.DataFrame({'tf':tf, #create a dataframe to hold the metrics
                                'name':name,
                               'bp':bp,
                               'tn':tn,
                               'fp':fp,
                               'fn':fn,
                               'tp':tp,
                               'p':p,
                               'r':r,
                               'f_1':f_1,
                               'auc_sc':auc_sc,
                               'acc':acc})

df_per_tweet = pd.DataFrame({'file':np.repeat('doc2vec',len(eval_data['Tweet ID'])),
                             'model':np.repeat('lr',len(eval_data['Tweet ID'])),
                             'pred': tuned_clf_pred,
                             'tweet_id': eval_data['Tweet ID']
                            })

## Append results to existing results

In [32]:
metrics_local = pd.read_pickle('results/metrics_local.pickle')
metrics_local = metrics_local.append(classifications)
metrics_local.to_pickle('results/metrics_local.pickle') #pickle the metrics

preds_per_tweet_local = pd.read_pickle('results/preds_per_tweet_local.pickle')
preds_per_tweet_local = preds_per_tweet_local.append(df_per_tweet)
preds_per_tweet_local.to_pickle('results/preds_per_tweet_local.pickle') #pickle the results of actual vs predicted for each tweet


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,
