## Common Imports

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import re
from datetime import datetime

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
!pip install gcsfs #google cloud storage

Collecting gcsfs
[?25l  Downloading https://files.pythonhosted.org/packages/ab/92/0297f2813cb240c52e90f8587420149970565800e019e1b08ef5ad28b6d9/gcsfs-0.3.1.tar.gz (43kB)
[K     |███████▋                        | 10kB 14.6MB/s eta 0:00:01[K     |███████████████▏                | 20kB 2.3MB/s eta 0:00:01[K     |██████████████████████▊         | 30kB 3.3MB/s eta 0:00:01[K     |██████████████████████████████▎ | 40kB 2.1MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 2.2MB/s 
Building wheels for collected packages: gcsfs
  Building wheel for gcsfs (setup.py) ... [?25l[?25hdone
  Created wheel for gcsfs: filename=gcsfs-0.3.1-py2.py3-none-any.whl size=17936 sha256=d58606cb5fec694f2644eb212c23ffba3820c260048f243cd6d22257515ccc64
  Stored in directory: /root/.cache/pip/wheels/9d/2b/6f/86954f0d8caa1173841e62bb780dc0f8693bd268e04a267682
Successfully built gcsfs
Installing collected packages: gcsfs
Successfully installed gcsfs-0.3.1


In [3]:
!pip install chakin #word embeddings

Collecting chakin
  Downloading https://files.pythonhosted.org/packages/ca/3f/ca2f63451c0ab47970a6ab1d39d96118e70b6e73125529cea767c31368a3/chakin-0.0.8-py3-none-any.whl
Installing collected packages: chakin
Successfully installed chakin-0.0.8


## Obtain zipped word embeddings from chakin

In [4]:
import chakin
chakin.search(lang='English')
chakin.download(number=16, save_dir='/tmp/') # select GloVe.840B.300d

                   Name  Dimension  ... Language    Author
2          fastText(en)        300  ...  English  Facebook
11         GloVe.6B.50d         50  ...  English  Stanford
12        GloVe.6B.100d        100  ...  English  Stanford
13        GloVe.6B.200d        200  ...  English  Stanford
14        GloVe.6B.300d        300  ...  English  Stanford
15       GloVe.42B.300d        300  ...  English  Stanford
16      GloVe.840B.300d        300  ...  English  Stanford
17    GloVe.Twitter.25d         25  ...  English  Stanford
18    GloVe.Twitter.50d         50  ...  English  Stanford
19   GloVe.Twitter.100d        100  ...  English  Stanford
20   GloVe.Twitter.200d        200  ...  English  Stanford
21  word2vec.GoogleNews        300  ...  English    Google

[12 rows x 7 columns]


Test: 100% ||                                      | Time:  0:16:53   2.0 MiB/s


'/tmp/glove.840B.300d.zip'

## Unzip the word embeddings file

In [0]:
from zipfile import ZipFile
with ZipFile('/tmp/glove.840B.300d.zip', 'r') as zipObj:
   # Extract all the contents of zip file in current directory
  zipObj.extractall()


## Function for loading the word embeddings file

In [0]:
#code adapted from https://stackoverflow.com/questions/37793118/load-pretrained-glove-vectors-in-python
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r')
    model = {}
    for line in f:
        splitLine = line.split(' ')
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

## Load the word embeddings file

In [7]:
w = loadGloveModel('glove.840B.300d.txt')

Loading Glove Model
Done. 2196016  words loaded!


## Upload the Tweets pickle files

In [0]:
! wget -cq https://github.com/ScottJK-20190706/tweet_classifier/blob/master/pickle_files//train_data_formatted.pickle?raw=true
! wget -cq https://github.com/ScottJK-20190706/tweet_classifier/blob/master/pickle_files/eval_data_formatted.pickle?raw=true
! wget -cq https://github.com/ScottJK-20190706/tweet_classifier/blob/master/features/count_features/count_features.pickle?raw=true

## Function for transforming tweets to word embedding arrays

In [0]:
def embedding(df,length):
    tokens = df.loc[:,['Tweet ID','Tweet','class']] #slice the text and tweet id
    p = re.compile(r'[^\w\s]+')
    tokens['Tweet'] = [p.sub('', x) for x in tokens['Tweet'].tolist()] #remove the punctuation
    tokens['Tweet'] = tokens['Tweet'].apply(word_tokenize) #tokenize the text
    tokens = tokens.reset_index()

    seq_sizes = [] #initialize a list to hold length in tokens of each tweet
    for i in tokens.index:
        seq_sizes = np.append(seq_sizes, len(tokens.Tweet[i])) #get all the tweet token lengths
    max_len = int(round(np.percentile(seq_sizes,100))) #find the number of tokens equal to the 100th percentile of token lengths
    #max_len = max_len//4*4 #find the nearest number divisible by four
    print(max_len)

    vector_size = length #this is the length of the pretrained word embeddings
    vec_array = np.zeros(tokens.shape[0]*max_len*vector_size) #initialize an array to hold enbeddings
    #vec_array = np.zeros(1000*max_len*vector_size) #initialize an array to hold enbeddings
    id_array = [] #initialize an array for the tweetd ids
    labels = [] #initialize an array for the labels
    missing_tokens = [] #initialize an array to hold missing tokens
    found_tokens = [] #initialize an array to hold missing tokens
    for i in range(0,tokens.shape[0]): #loop through each tweet
    #for i in range(0,1000): #loop through each tweet
        id_array = np.append(id_array,tokens['Tweet ID'][i])
        labels = np.append(labels,tokens['class'][i])
        if i%500==0:
            print(i) #print progress
        for j in range(0,max_len): #loop through each token
            try:
                w2v = w[tokens.Tweet[i][j]] #check if the token has a pretrained embedding
                found_tokens = np.append(found_tokens,tokens.Tweet[i][j])
                for k in range(0,vector_size):
                    position = (i*max_len*vector_size)+(j*vector_size)+k
                    vec_array[position] = w2v[k] #loop through each element of the token's embedding and add to vec_array
            except:
                try:
                    missing_tokens = np.append(missing_tokens,tokens.Tweet[i][j])
                except:
                    pass
    
    vec_array = np.reshape(vec_array,(tokens.shape[0],max_len,vector_size)) #reshape vec_array
    missing_tokens = np.unique(missing_tokens)
    found_tokens = np.unique(found_tokens)
    
    return (vec_array, id_array, labels, max_len, missing_tokens, found_tokens)
        

## Transform the tweets to word embedding arrays

In [10]:
vec_len = 300
import pickle
import pandas as pd
train_data = pd.read_pickle('train_data_formatted.pickle?raw=true')
eval_data = pd.read_pickle('eval_data_formatted.pickle?raw=true')
train_data, train_id, train_labels, train_max_len, train_missing_tokens, train_found_tokens = embedding(train_data,vec_len)
eval_data, eval_id, eval_labels, eval_max_len, eval_missing_tokens, eval_found_tokens = embedding(eval_data,vec_len)
train_labels = train_labels.astype(int)
eval_labels = eval_labels.astype(int)
del w #clear some memory

57
0
500
1000
1500
2000
2500
3000
3500
4000
55
0
500
1000


## Authenticate location for saving files

In [11]:
# Set the output directory for saving model file
# Optionally, set a GCP bucket location

FILE_OUTPUT_DIR = 'classify_embeddings_glove'#@param {type:"string"}
#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = True #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = True #@param {type:"boolean"}
BUCKET = 'dissertation_bucket' #@param {type:"string"}

if USE_BUCKET:
  FILE_OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, FILE_OUTPUT_DIR)
  from google.colab import auth
  auth.authenticate_user()

if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(FILE_OUTPUT_DIR)
  except:
    # Doesn't matter if the directory didn't exist
    pass
tf.gfile.MakeDirs(FILE_OUTPUT_DIR)
print('***** File output directory: {} *****'.format(FILE_OUTPUT_DIR))

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

***** File output directory: gs://dissertation_bucket/classify_embeddings_glove *****


## Save file detailing tokens that were found

In [0]:
max_len = max(train_max_len, eval_max_len)
found_tokens = np.unique(np.append(train_found_tokens,eval_found_tokens))
missing_tokens = np.unique(np.append(train_missing_tokens,eval_missing_tokens))
found = pd.DataFrame({'token': found_tokens})
found['found'] = 1
missing = pd.DataFrame({'token': missing_tokens})
missing['found'] = 0
tokens = found.append(missing)
filename = FILE_OUTPUT_DIR + '/tokens.csv'
tokens.to_csv(filename)

## Function for summing embeddings over each tweet

In [0]:
def aggregate_vectors(data,max_len,tweet_id):
  count_features = pd.read_pickle('count_features.pickle?raw=true')
  count_features_list = ['ave_chars_token', 'caps_count', 'followers_count', 'following_count',
                         'mention_count', 'neg_sent', 'neu_sent', 'pos_sent','posted_tweets_count',
                         'punctuation_count', 'quotes_count', 'url_count'] 
  embed_vec_sum = np.zeros(data.shape[0]*vec_len) #initialize an array to hold sum of embeddings

  embed_vec_sum = np.reshape(embed_vec_sum,(data.shape[0],vec_len)) #reshape embed_vec_sum

  for i in range(0,data.shape[0]): #loop through the arrays
    token_count = max_len #there are max_len potential tokens in each array
    if np.sum(data[i])==0: #some tweets might have vectors that are all zeros
      embed_vec_sum[i] = np.zeros(vec_len) #set vector to zeros
    else:
      if i%500==0:
        print(i) #print progress
      for j in range(0,max_len):
        if np.sum(data[i][j])==0: #check for tokens that are currently zeros
          token_count = token_count-1 #reduce token count by 1 when token is all zeros
      embed_vec_sum[i] = np.add.reduce(data[i]) #add each of the individual word embeddings in the tweet

  
  
  #*******************create arrays including count features**********************************************
  embed_vec_count_sum = np.zeros(data.shape[0]*(vec_len+len(count_features_list))) #initialize an array to hold sum of embeddings inc count features

  embed_vec_count_sum = np.reshape(embed_vec_count_sum,(data.shape[0],(vec_len+len(count_features_list)))) #reshape embed_vec_count_sum
  
  for i in range(0,data.shape[0]): #loop through each tweet
    add_count = count_features[(count_features.tweet_ids==tweet_id[i])&(count_features.feature.isin(count_features_list))]['value'].values #count features for each tweet
    if len(add_count)==0: #if there are no count features for tweet
      embed_vec_count_sum[i] = np.append(embed_vec_sum[i],np.zeros(len(count_features_list))) #add zeros

    else:
      embed_vec_count_sum[i] = np.append(embed_vec_sum[i],add_count) #append the count features to the embedding features
    
      
      
  return(embed_vec_sum,embed_vec_count_sum)
  #return(embed_vec_sum,embed_vec_ave)
      


## Sum embeddings over each tweet

In [14]:
train_vec_sum, train_vec_count_sum = aggregate_vectors(train_data,train_max_len,train_id)
eval_vec_sum, eval_vec_count_sum = aggregate_vectors(eval_data,eval_max_len,eval_id)

0
500
1000
1500
2000
2500
3000
3500
4000
0
500
1000


## Import classifiers and metrics

In [0]:
from sklearn.linear_model import LogisticRegression #import lr
from sklearn.svm import SVC #import svm
from sklearn.tree import DecisionTreeClassifier #import dt
from sklearn.ensemble import RandomForestClassifier #import rf
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, auc, roc_curve, accuracy_score #metrics
from sklearn.model_selection import GridSearchCV #grid search
log_clf = LogisticRegression()
svc_clf = SVC()
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier()

## Grid search and evaluation function

In [0]:
def search_grid(classifier, model, x_train, y_train, class_train, x_eval, y_eval, class_eval):
    
    if model == 'lr': #if using logisitic regression
        param_grid = [{'random_state':[42],
               'C':[0.05,0.1,0.5,1],
               'penalty':['l1','l2']}]
        
    if model == 'dt': #if using decision tree
        param_grid = [{'random_state':[42],
                       'criterion':['gini','entropy']}]
        
    if model == 'rf': #if using random forest
        param_grid = [{'random_state':[42],
                       'criterion':['gini','entropy']}] 
    
    if model == 'svm': #if using svm
        param_grid = [{'random_state':[42],
                   'C':[0.05,0.1,1,10], 
                   'kernel':['linear','rbf']}]
    
  
    param_grid = param_grid
    grid_search = GridSearchCV(classifier, param_grid, cv=10, scoring='recall') #grid search using 10-folds cross validation
    grid_search.fit(x_train, y_train) #fir grid search
    print("")
    print('Best parameters')
    best_parameters = grid_search.best_params_
    print(best_parameters) #print best parameters from grid search
    print('Best grid search score = ',grid_search.best_score_) #print best grid search score
    print("")
    print('Evaluation data scores')
    tuned_clf = grid_search.best_estimator_ #build model using best parameters
    tuned_clf_pred = tuned_clf.predict(x_eval) #predict using evaluation data with best parameters
    conf_matrix = confusion_matrix(y_eval,tuned_clf_pred) #build confusion matrix
    precision = precision_score(y_eval,tuned_clf_pred) #calculate precision
    recall = recall_score(y_eval,tuned_clf_pred) #calculate recall
    f1 = f1_score(y_eval,tuned_clf_pred) #calculate f1
    fpr, tpr, thresholds = roc_curve(y_eval,tuned_clf_pred)
    auc_score = auc(fpr, tpr) #calculate auc
    accuracy = accuracy_score(y_eval,tuned_clf_pred) #calculate accuracy
    class_eval['pred'] = tuned_clf_pred
    class_eval = class_eval.drop('class_column', axis=1) #join predictions onto actuals
    print(conf_matrix)
    print('precision = ' + str(precision))
    print('recall = ' + str(recall))
    print('f1 = ' + str(f1))
    print('auc = ' + str(auc_score))
    print('accuracy = ' + str(accuracy))
    
    return(best_parameters, conf_matrix, precision, recall, f1, auc_score, accuracy, class_eval) #return metrics and pred vs actuals for each tweet

## Perform grid search and evaluation for each classifier and feature combo

In [17]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import scale

current = datetime.now()

tf = []     #initialise empty vectors to hold results
name = []
bp = []
tn = []
fp = []
fn = []
tp = []
p = []
r = []
f_1 = []
auc_sc = []
acc = []

features = 'glove_300'

trains = [train_vec_sum, train_vec_count_sum]
evals = [eval_vec_sum, eval_vec_count_sum]
descs = ['sum','sum_plusCounts']

i=1
for train, eval, desc in zip(trains, evals, descs):
  
  train = scale(train)
  eval = scale(eval)

  classifiers = [log_clf, dt_clf, rf_clf, svc_clf] #the classifiers that are to be tested
  models = ['lr','dt','rf', 'svm'] #labels for identifying the results

  x_train = train
  y_train = train_labels



  x_eval = eval
  y_eval = eval_labels


  for classifier, model in zip(classifiers,models):
    
    class_train = pd.DataFrame({'tweet_id':train_id,
                                'class_column':y_train})
    
    class_eval = pd.DataFrame({'tweet_id':eval_id,
                                'class_column':y_eval})    
    
    best_parameters, conf_matrix, precision, recall, f1, auc_score, accuracy, class_eval = search_grid(classifier, 
                                                                                                       model, 
                                                                                                       x_train, 
                                                                                                       y_train, 
                                                                                                       class_train, 
                                                                                                       x_eval, 
                                                                                                       y_eval, 
                                                                                                       class_eval)
    #append the latest results to the vectors
    tf_text = features + "_" + desc
    tf = np.append(tf,tf_text)
    name = np.append(name,model)
    b = ';'.join('{} {}'.format(key, val) for key, val in best_parameters.items())
    bp = np.append(bp,b)
    tn = np.append(tn,conf_matrix[0][0])
    fp = np.append(fp,conf_matrix[0][1])
    fn = np.append(fn,conf_matrix[1][0])
    tp = np.append(tp,conf_matrix[1][1])
    p = np.append(p,precision)
    r = np.append(r,recall)
    f_1 = np.append(f_1,f1)
    auc_sc = np.append(auc_sc,auc_score)
    acc = np.append(acc,accuracy)

    #col = train_file+'_'+model #build a column name
    #class_eval.columns = ['tweet_id',col] #rename the columns
    class_eval['model'] = model
    class_eval['file'] = tf_text
    if i==1: #if we are on the first iteration of the loop
        df = class_eval.copy()
    else: #if we are not on the first iteration f the loop
        #df = pd.merge(df, class_eval, on='tweet_id')
        df = df.append(class_eval)

    i = i+1 #increment i
    print('df shape = ', df.shape)

print('time taken = ',datetime.now() - current) #print the time taken
    



Best parameters
{'C': 1, 'penalty': 'l2', 'random_state': 42}
Best grid search score =  0.8120326541570113

Evaluation data scores
[[809  22]
 [ 44 146]]
precision = 0.8690476190476191
recall = 0.7684210526315789
f1 = 0.8156424581005587
auc = 0.8709734625372094
accuracy = 0.9353574926542605
df shape =  (1021, 4)

Best parameters
{'criterion': 'entropy', 'random_state': 42}
Best grid search score =  0.5963933476516136

Evaluation data scores
[[768  63]
 [ 73 117]]
precision = 0.65
recall = 0.6157894736842106
f1 = 0.6324324324324324
auc = 0.7699885996579897
accuracy = 0.8667972575905974
df shape =  (2042, 4)

Best parameters
{'criterion': 'entropy', 'random_state': 42}
Best grid search score =  0.46997170491154694

Evaluation data scores
[[825   6]
 [104  86]]
precision = 0.9347826086956522
recall = 0.45263157894736844
f1 = 0.6099290780141844
auc = 0.7227056811704352
accuracy = 0.8922624877571009
df shape =  (3063, 4)

Best parameters
{'C': 0.1, 'kernel': 'linear', 'random_state': 42}
B

## Create dataframe for performance metrics

In [18]:
classifications = pd.DataFrame({'tf':tf, #create a dataframe to hold the metrics
                                'name':name,
                               'bp':bp,
                               'tn':tn,
                               'fp':fp,
                               'fn':fn,
                               'tp':tp,
                               'p':p,
                               'r':r,
                               'f_1':f_1,
                               'auc_sc':auc_sc,
                               'acc':acc})

classifications.sort_values(by='f_1', ascending=False)

Unnamed: 0,tf,name,bp,tn,fp,fn,tp,p,r,f_1,auc_sc,acc
7,glove_300_sum_plusCounts,svm,C 0.05;kernel linear;random_state 42,818.0,13.0,34.0,156.0,0.923077,0.821053,0.869081,0.902704,0.953967
4,glove_300_sum_plusCounts,lr,C 0.05;penalty l2;random_state 42,818.0,13.0,36.0,154.0,0.922156,0.810526,0.862745,0.897441,0.952008
3,glove_300_sum,svm,C 0.1;kernel linear;random_state 42,812.0,19.0,43.0,147.0,0.885542,0.773684,0.825843,0.87541,0.939275
0,glove_300_sum,lr,C 1;penalty l2;random_state 42,809.0,22.0,44.0,146.0,0.869048,0.768421,0.815642,0.870973,0.935357
5,glove_300_sum_plusCounts,dt,criterion entropy;random_state 42,779.0,52.0,74.0,116.0,0.690476,0.610526,0.648045,0.773976,0.876592
1,glove_300_sum,dt,criterion entropy;random_state 42,768.0,63.0,73.0,117.0,0.65,0.615789,0.632432,0.769989,0.866797
2,glove_300_sum,rf,criterion entropy;random_state 42,825.0,6.0,104.0,86.0,0.934783,0.452632,0.609929,0.722706,0.892262
6,glove_300_sum_plusCounts,rf,criterion entropy;random_state 42,823.0,8.0,108.0,82.0,0.911111,0.431579,0.585714,0.710976,0.886386


## Ensemble Classifier (vector sums)

In [19]:
from sklearn.ensemble import VotingClassifier
log_clf = LogisticRegression(penalty='l2',C=1, random_state=42) #logistic regression with best hyperparameters
svc_clf = SVC(C=0.1, kernel='linear', probability = True, random_state=42) #svm with best hyperparameters

#create the ensemble
e_clf = VotingClassifier(estimators=[('lr', log_clf), ('svm', svc_clf)],
                         voting='soft', weights=[1, 1])


features = 'glove_300'

 
train = scale(train_vec_sum)
eval = scale(eval_vec_sum)
desc = 'sum'
model = 'ensemble'

x_train = train
y_train = train_labels

x_eval = eval
y_eval = eval_labels


    
class_train = pd.DataFrame({'tweet_id':train_id,
                            'class_column':y_train})

class_eval = pd.DataFrame({'tweet_id':eval_id,
                            'class_column':y_eval}) 


e_clf = e_clf.fit(x_train, y_train) #fit the ensemble

e_clf_pred = e_clf.predict(x_eval) #predict using evaluation data with best parameters
conf_matrix = confusion_matrix(y_eval,e_clf_pred) #build confusion matrix
precision = precision_score(y_eval,e_clf_pred) #calculate precision
recall = recall_score(y_eval,e_clf_pred) #calculate recall
f1 = f1_score(y_eval,e_clf_pred) #calculate f1
fpr, tpr, thresholds = roc_curve(y_eval,e_clf_pred)
auc_score = auc(fpr, tpr) #calculate auc
accuracy = accuracy_score(y_eval,e_clf_pred) #calculate accuracy
class_eval['pred'] = e_clf_pred
class_eval = class_eval.drop('class_column', axis=1) #join predictions onto actuals
print(conf_matrix)
print('precision = ' + str(precision))
print('recall = ' + str(recall))
print('f1 = ' + str(f1))
print('auc = ' + str(auc_score))
print('accuracy = ' + str(accuracy))

#append the latest results to the vectors
tf_text = features + "_" + desc
tf = np.append(tf,tf_text)
name = np.append(name,'ensemble (lr,svc)')
bp = np.append(bp,'ensemble')
tn = np.append(tn,conf_matrix[0][0])
fp = np.append(fp,conf_matrix[0][1])
fn = np.append(fn,conf_matrix[1][0])
tp = np.append(tp,conf_matrix[1][1])
p = np.append(p,precision)
r = np.append(r,recall)
f_1 = np.append(f_1,f1)
auc_sc = np.append(auc_sc,auc_score)
acc = np.append(acc,accuracy)

class_eval['model'] = 'ensemble (lr,svc)'
class_eval['file'] = tf_text

df = df.append(class_eval) #merge the latest predictions for each tweet using this classifier
print('df shape = ', df.shape)

[[814  17]
 [ 45 145]]
precision = 0.8950617283950617
recall = 0.7631578947368421
f1 = 0.8238636363636365
auc = 0.8713503071758819
accuracy = 0.9392752203721841
df shape =  (9189, 4)


## Ensemble Classifier 2 (vector sums plus counts)

In [20]:
from sklearn.ensemble import VotingClassifier
log_clf = LogisticRegression(penalty='l2',C=0.05, random_state=42) #logistic regression with best hyperparameters
svc_clf = SVC(C=0.05, kernel='linear', probability = True, random_state=42) #svm with best hyperparameters

#create the ensemble
e_clf = VotingClassifier(estimators=[('lr', log_clf), ('svm', svc_clf)],
                         voting='soft', weights=[1, 1])


features = 'glove_300'

 
train = scale(train_vec_count_sum)
eval = scale(eval_vec_count_sum)
desc = 'sum_plusCounts'
model = 'ensemble'

x_train = train
y_train = train_labels

x_eval = eval
y_eval = eval_labels


    
class_train = pd.DataFrame({'tweet_id':train_id,
                            'class_column':y_train})

class_eval = pd.DataFrame({'tweet_id':eval_id,
                            'class_column':y_eval}) 


e_clf = e_clf.fit(x_train, y_train) #fit the ensemble

e_clf_pred = e_clf.predict(x_eval) #predict using evaluation data with best parameters
conf_matrix = confusion_matrix(y_eval,e_clf_pred) #build confusion matrix
precision = precision_score(y_eval,e_clf_pred) #calculate precision
recall = recall_score(y_eval,e_clf_pred) #calculate recall
f1 = f1_score(y_eval,e_clf_pred) #calculate f1
fpr, tpr, thresholds = roc_curve(y_eval,e_clf_pred)
auc_score = auc(fpr, tpr) #calculate auc
accuracy = accuracy_score(y_eval,e_clf_pred) #calculate accuracy
class_eval['pred'] = e_clf_pred
class_eval = class_eval.drop('class_column', axis=1) #join predictions onto actuals
print(conf_matrix)
print('precision = ' + str(precision))
print('recall = ' + str(recall))
print('f1 = ' + str(f1))
print('auc = ' + str(auc_score))
print('accuracy = ' + str(accuracy))

#append the latest results to the vectors
tf_text = features + "_" + desc
tf = np.append(tf,tf_text)
name = np.append(name,'ensemble (lr,svc)')
bp = np.append(bp,'ensemble')
tn = np.append(tn,conf_matrix[0][0])
fp = np.append(fp,conf_matrix[0][1])
fn = np.append(fn,conf_matrix[1][0])
tp = np.append(tp,conf_matrix[1][1])
p = np.append(p,precision)
r = np.append(r,recall)
f_1 = np.append(f_1,f1)
auc_sc = np.append(auc_sc,auc_score)
acc = np.append(acc,accuracy)

class_eval['model'] = 'ensemble (lr,svc)'
class_eval['file'] = tf_text

df = df.append(class_eval) #merge the latest predictions for each tweet using this classifier
print('df shape = ', df.shape)

[[819  12]
 [ 38 152]]
precision = 0.926829268292683
recall = 0.8
f1 = 0.8587570621468926
auc = 0.8927797833935018
accuracy = 0.951028403525955
df shape =  (10210, 4)


In [21]:
classifications = pd.DataFrame({'tf':tf, #create a dataframe to hold the metrics
                                'name':name,
                               'bp':bp,
                               'tn':tn,
                               'fp':fp,
                               'fn':fn,
                               'tp':tp,
                               'p':p,
                               'r':r,
                               'f_1':f_1,
                               'auc_sc':auc_sc,
                               'acc':acc})

classifications.sort_values(by='f_1', ascending=False)

Unnamed: 0,tf,name,bp,tn,fp,fn,tp,p,r,f_1,auc_sc,acc
7,glove_300_sum_plusCounts,svm,C 0.05;kernel linear;random_state 42,818.0,13.0,34.0,156.0,0.923077,0.821053,0.869081,0.902704,0.953967
4,glove_300_sum_plusCounts,lr,C 0.05;penalty l2;random_state 42,818.0,13.0,36.0,154.0,0.922156,0.810526,0.862745,0.897441,0.952008
9,glove_300_sum_plusCounts,"ensemble (lr,svc)",ensemble,819.0,12.0,38.0,152.0,0.926829,0.8,0.858757,0.89278,0.951028
3,glove_300_sum,svm,C 0.1;kernel linear;random_state 42,812.0,19.0,43.0,147.0,0.885542,0.773684,0.825843,0.87541,0.939275
8,glove_300_sum,"ensemble (lr,svc)",ensemble,814.0,17.0,45.0,145.0,0.895062,0.763158,0.823864,0.87135,0.939275
0,glove_300_sum,lr,C 1;penalty l2;random_state 42,809.0,22.0,44.0,146.0,0.869048,0.768421,0.815642,0.870973,0.935357
5,glove_300_sum_plusCounts,dt,criterion entropy;random_state 42,779.0,52.0,74.0,116.0,0.690476,0.610526,0.648045,0.773976,0.876592
1,glove_300_sum,dt,criterion entropy;random_state 42,768.0,63.0,73.0,117.0,0.65,0.615789,0.632432,0.769989,0.866797
2,glove_300_sum,rf,criterion entropy;random_state 42,825.0,6.0,104.0,86.0,0.934783,0.452632,0.609929,0.722706,0.892262
6,glove_300_sum_plusCounts,rf,criterion entropy;random_state 42,823.0,8.0,108.0,82.0,0.911111,0.431579,0.585714,0.710976,0.886386


## Save the performance metrics and the predictions per tweet

In [22]:
classifications.to_pickle('/tmp/classifications.pickle')
df.to_pickle('/tmp/df.pickle')
print('df shape = ', df.shape)

!gsutil cp /tmp/classifications.pickle gs://dissertation_bucket/classify_embeddings_glove/
!gsutil cp /tmp/df.pickle gs://dissertation_bucket/classify_embeddings_glove/

df shape =  (10210, 4)
Copying file:///tmp/classifications.pickle [Content-Type=application/octet-stream]...
/ [1 files][  2.1 KiB/  2.1 KiB]                                                
Operation completed over 1 objects/2.1 KiB.                                      
Copying file:///tmp/df.pickle [Content-Type=application/octet-stream]...
/ [1 files][478.0 KiB/478.0 KiB]                                                
Operation completed over 1 objects/478.0 KiB.                                    
