## Common Imports

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.utils import shuffle

## Create data and class inputs

In [2]:
def xysplit(file_location):
    """Load a pickled feature frame and split it into features, labels and an id/label frame.

    The frame is shuffled with a fixed seed (42) so row order is reproducible.
    Every column other than 'tweet_id' and 'class_column' is treated as a
    feature and standardised with sklearn's scale().

    Note: scale() is applied to this file's data alone, so each file passed
    to this function is standardised with its own statistics.

    Parameters
    ----------
    file_location : str
        Path of a pickle readable by pd.read_pickle; the frame must contain
        'tweet_id' and 'class_column' columns.

    Returns
    -------
    tuple
        (scaled feature matrix, vector of class labels,
         DataFrame holding 'tweet_id' and 'class_column' in shuffled order)
    """
    id_and_class = ['tweet_id', 'class_column']

    frame = pd.read_pickle(file_location)
    frame['tweet_id'] = frame['tweet_id'].astype(str) #ids are identifiers, not numbers
    frame = shuffle(frame, random_state=42)

    df_class = frame.loc[:, id_and_class] #id + label per row, same order as the features
    features = scale(frame.drop(id_and_class, axis=1).values)
    labels = df_class['class_column'].values
    return (features, labels, df_class)


## Create classifiers and import metrics

In [3]:
from sklearn.linear_model import LogisticRegression #import lr
from sklearn.svm import SVC #import svm
from sklearn.tree import DecisionTreeClassifier #import dt
from sklearn.ensemble import RandomForestClassifier #import rf
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, auc, roc_curve, accuracy_score #metrics
from sklearn.model_selection import GridSearchCV #grid search
#default-constructed (untuned) estimators; these are the objects handed to search_grid() in the cells below
log_clf = LogisticRegression() #logistic regression
svc_clf = SVC() #support vector machine
dt_clf = DecisionTreeClassifier() #decision tree
rf_clf = RandomForestClassifier() #random forest

## Function for fine tuning and evaluating classifiers

In [4]:
def search_grid(classifier,model, train_file, eval_file):
    """Grid-search ``classifier`` on the training file and score the best estimator on the evaluation file.

    The hyperparameter grid is selected by ``model``. Candidates are compared
    with 10-fold cross-validation using recall as the selection metric; the
    best estimator found is then used to predict the evaluation data, and the
    resulting metrics are printed and returned.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn classifier that matches ``model``.
    model : str
        Grid selector: 'lr', 'dt', 'rf' or 'svm'.
    train_file : str
        Pickle path handed to xysplit() for the training data.
    eval_file : str
        Pickle path handed to xysplit() for the evaluation data.

    Returns
    -------
    tuple
        (best_parameters, conf_matrix, precision, recall, f1, auc_score,
        accuracy, class_eval) where class_eval is a DataFrame with one row
        per evaluation tweet holding 'tweet_id' and the predicted class
        'pred' (the actual class column is dropped).

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported labels. This is raised
        before any data is loaded.
    """
    #one hyperparameter grid per supported model label
    #NOTE(review): the 'l1' penalty is only valid with solvers that support it
    #(e.g. liblinear/saga) - confirm the solver of the classifier passed in
    param_grids = {
        'lr': [{'random_state':[42],
                'C':[0.05,0.1,0.5,1],
                'penalty':['l1','l2']}],
        'dt': [{'random_state':[42],
                'criterion':['gini','entropy']}],
        'rf': [{'random_state':[42],
                'criterion':['gini','entropy']}],
        'svm': [{'random_state':[42],
                 'C':[0.05,0.1,1,10],
                 'kernel':['linear','rbf']}],
    }
    if model not in param_grids:
        #fail fast instead of loading data and then hitting an unbound param_grid
        raise ValueError('unknown model label {!r}; expected one of {}'.format(model, sorted(param_grids)))
    param_grid = param_grids[model]

    x_train, y_train, class_train = xysplit(train_file) #split training data into X, Y
    x_eval, y_eval, class_eval = xysplit(eval_file) #split evaluation data into X, Y

    grid_search = GridSearchCV(classifier, param_grid, cv=10, scoring='recall') #grid search using 10-fold cross validation, selecting on recall
    grid_search.fit(x_train, y_train) #fit grid search
    print("")
    print('Best parameters')
    best_parameters = grid_search.best_params_
    print(best_parameters) #print best parameters from grid search
    print('Best grid search score = ',grid_search.best_score_) #print best grid search score
    print("")
    print('Evaluation data scores')
    tuned_clf = grid_search.best_estimator_ #estimator with the best parameters
    tuned_clf_pred = tuned_clf.predict(x_eval) #predict the evaluation data with the tuned estimator
    conf_matrix = confusion_matrix(y_eval,tuned_clf_pred) #build confusion matrix
    precision = precision_score(y_eval,tuned_clf_pred) #calculate precision
    recall = recall_score(y_eval,tuned_clf_pred) #calculate recall
    f1 = f1_score(y_eval,tuned_clf_pred) #calculate f1
    #NOTE: the ROC curve is built from hard class predictions, not scores/probabilities
    fpr, tpr, thresholds = roc_curve(y_eval,tuned_clf_pred)
    auc_score = auc(fpr, tpr) #calculate auc
    accuracy = accuracy_score(y_eval,tuned_clf_pred) #calculate accuracy
    #keep tweet_id and attach the prediction; the actual class column is dropped.
    #drop().assign() builds a new frame instead of writing into the slice returned by xysplit()
    class_eval = class_eval.drop('class_column', axis=1).assign(pred=tuned_clf_pred)
    print(conf_matrix)
    print('precision = ' + str(precision))
    print('recall = ' + str(recall))
    print('f1 = ' + str(f1))
    print('auc = ' + str(auc_score))
    print('accuracy = ' + str(accuracy))

    return(best_parameters, conf_matrix, precision, recall, f1, auc_score, accuracy, class_eval) #return metrics and the prediction for each tweet

## Run the "search_grid()" function for lr, dt, and rf

In [5]:
from datetime import datetime
current = datetime.now() #for checking duration

#result accumulators, one entry per (classifier, feature file) run.
#They are grown with np.append, so they become numpy arrays that the later cells keep extending.
tf = []     #training file used for the run
name = []   #model label
bp = []     #best hyperparameters as a 'key value;key value' string
tn = []     #confusion matrix cell [0][0]
fp = []     #confusion matrix cell [0][1]
fn = []     #confusion matrix cell [1][0]
tp = []     #confusion matrix cell [1][1]
p = []      #precision
r = []      #recall
f_1 = []    #f1
auc_sc = [] #auc
acc = []    #accuracy

classifiers = [log_clf, dt_clf, rf_clf] #the classifiers that are to be tested
models = ['lr','dt','rf'] #labels for identifying the results

train_files = ['features/df_tweet_tfidf_train.pickle', #the file locations for the training data sets inc count features
               'features/df_tweet_tf_train.pickle',
               'features/df_tweet_train.pickle', 
               'features/df_tweetbio_tfidf_train.pickle',
               'features/df_tweetbio_tf_train.pickle',
               'features/df_tweetbio_train.pickle',
               'features/df_tweet_tfidf_train_nc.pickle', #the file locations for the training data sets exc count features
               'features/df_tweet_tf_train_nc.pickle',
               'features/df_tweet_train_nc.pickle', 
               'features/df_tweetbio_tfidf_train_nc.pickle',
               'features/df_tweetbio_tf_train_nc.pickle',
               'features/df_tweetbio_train_nc.pickle',
               'features/df_tweet_count_features_train.pickle'] #count features

eval_files = ['features/df_tweet_tfidf_eval.pickle', #the file locations for the evaluation data sets inc count features
              'features/df_tweet_tf_eval.pickle', 
              'features/df_tweet_eval.pickle', 
              'features/df_tweetbio_tfidf_eval.pickle', 
              'features/df_tweetbio_tf_eval.pickle',
              'features/df_tweetbio_eval.pickle',
              'features/df_tweet_tfidf_eval_nc.pickle', #the file locations for the evaluation data sets exc count features
              'features/df_tweet_tf_eval_nc.pickle', 
              'features/df_tweet_eval_nc.pickle', 
              'features/df_tweetbio_tfidf_eval_nc.pickle', 
              'features/df_tweetbio_tf_eval_nc.pickle',
              'features/df_tweetbio_eval_nc.pickle',
              'features/df_tweet_count_features_eval.pickle'] #count features

#zip() silently drops unmatched entries, so check the pairings before the long run starts
assert len(classifiers) == len(models), 'each classifier needs exactly one label'
assert len(train_files) == len(eval_files), 'each training file needs exactly one evaluation file'

pred_frames = [] #per-run prediction frames, stacked once after the loop
for classifier, model in zip(classifiers, models): #zip through the classifiers and model names
    for train_file, eval_file in zip(train_files, eval_files): #zip through the training and evaluation combos
        #execute the search_grid() function
        best_parameters, conf_matrix, precision, recall, f1, auc_score, accuracy, class_eval = search_grid(classifier,model, train_file, eval_file)
        
        #append the latest results to the vectors
        tf = np.append(tf,train_file)
        name = np.append(name,model)
        b = ';'.join('{} {}'.format(key, val) for key, val in best_parameters.items())
        bp = np.append(bp,b)
        tn = np.append(tn,conf_matrix[0][0])
        fp = np.append(fp,conf_matrix[0][1])
        fn = np.append(fn,conf_matrix[1][0])
        tp = np.append(tp,conf_matrix[1][1])
        p = np.append(p,precision)
        r = np.append(r,recall)
        f_1 = np.append(f_1,f1)
        auc_sc = np.append(auc_sc,auc_score)
        acc = np.append(acc,accuracy)
        
        #tag the per-tweet predictions with the run that produced them
        class_eval['model'] = model
        class_eval['file'] = eval_file
        pred_frames.append(class_eval)

#stack all runs in long format; a single concat replaces the removed DataFrame.append
#and avoids re-copying the growing frame on every iteration
df = pd.concat(pred_frames)

print('time taken = ',datetime.now() - current) #print the time taken


Best parameters
{'C': 0.1, 'penalty': 'l2', 'random_state': 42}
Best grid search score =  0.8320946725635374

Evaluation data scores
[[809  22]
 [ 31 159]]
precision = 0.8784530386740331
recall = 0.8368421052631579
f1 = 0.8571428571428571
auc = 0.9051839888529989
accuracy = 0.9480901077375122

Best parameters
{'C': 0.1, 'penalty': 'l2', 'random_state': 42}
Best grid search score =  0.8320946725635374

Evaluation data scores
[[809  22]
 [ 31 159]]
precision = 0.8784530386740331
recall = 0.8368421052631579
f1 = 0.8571428571428571
auc = 0.9051839888529989
accuracy = 0.9480901077375122

Best parameters
{'C': 0.5, 'penalty': 'l2', 'random_state': 42}
Best grid search score =  0.8428635265359182

Evaluation data scores
[[807  24]
 [ 27 163]]
precision = 0.8716577540106952
recall = 0.8578947368421053
f1 = 0.8647214854111406
auc = 0.9145069352080564
accuracy = 0.9500489715964741

Best parameters
{'C': 0.05, 'penalty': 'l2', 'random_state': 42}
Best grid search score =  0.8429128044989543

Eva


Best parameters
{'criterion': 'gini', 'random_state': 42}
Best grid search score =  0.4345692270012623

Evaluation data scores
[[823   8]
 [104  86]]
precision = 0.9148936170212766
recall = 0.45263157894736844
f1 = 0.6056338028169014
auc = 0.7215023117360188
accuracy = 0.8903036238981391

Best parameters
{'criterion': 'gini', 'random_state': 42}
Best grid search score =  0.4345692270012623

Evaluation data scores
[[823   8]
 [104  86]]
precision = 0.9148936170212766
recall = 0.45263157894736844
f1 = 0.6056338028169014
auc = 0.7215023117360188
accuracy = 0.8903036238981391

Best parameters
{'criterion': 'entropy', 'random_state': 42}
Best grid search score =  0.42223637172893236

Evaluation data scores
[[825   6]
 [104  86]]
precision = 0.9347826086956522
recall = 0.45263157894736844
f1 = 0.6099290780141844
auc = 0.7227056811704352
accuracy = 0.8922624877571009

Best parameters
{'criterion': 'gini', 'random_state': 42}
Best grid search score =  0.5686002232521337

Evaluation data score

## Run the "search_grid()" function for svm (separated process due to the long duration)

In [6]:
current = datetime.now() #restart the timer for this run

classifiers = [svc_clf] #the classifier that is to be tested
models = ['svm'] #a label used to identify the results

train_files = ['features/df_tweet_train.pickle'] #the location of the training data

eval_files = ['features/df_tweet_eval.pickle'] #the location of the evaluation data

for classifier, model in zip(classifiers, models): #zip through the classifier and label combos
    for train_file, eval_file in zip(train_files, eval_files): #zip through the train and evaluation data combos
        #execute the search_grid() function
        best_parameters, conf_matrix, precision, recall, f1, auc_score, accuracy, class_eval = search_grid(classifier,model, train_file, eval_file)
        
        #append the latest results to the vectors created in the lr/dt/rf cell
        tf = np.append(tf,train_file)
        name = np.append(name,model)
        b = ';'.join('{} {}'.format(key, val) for key, val in best_parameters.items())
        bp = np.append(bp,b)
        tn = np.append(tn,conf_matrix[0][0])
        fp = np.append(fp,conf_matrix[0][1])
        fn = np.append(fn,conf_matrix[1][0])
        tp = np.append(tp,conf_matrix[1][1])
        p = np.append(p,precision)
        r = np.append(r,recall)
        f_1 = np.append(f_1,f1)
        auc_sc = np.append(auc_sc,auc_score)
        acc = np.append(acc,accuracy)
        
        #tag the per-tweet predictions with the run that produced them
        class_eval['model'] = model
        class_eval['file'] = eval_file

        #stack the latest predictions under the earlier ones;
        #pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        df = pd.concat([df, class_eval])


print('time taken = ',datetime.now() - current) #print the time taken


Best parameters
{'C': 0.05, 'kernel': 'linear', 'random_state': 42}
Best grid search score =  0.8058580135270069

Evaluation data scores
[[803  28]
 [ 36 154]]
precision = 0.8461538461538461
recall = 0.8105263157894737
f1 = 0.8279569892473119
auc = 0.8884159858129077
accuracy = 0.9373163565132223
time taken =  0:17:49.487263


## Take a look at the results so far

In [7]:
#collect the per-run result vectors into one metrics table (one row per run)
classifications = pd.DataFrame(dict(tf=tf,
                                    name=name,
                                    bp=bp,
                                    tn=tn,
                                    fp=fp,
                                    fn=fn,
                                    tp=tp,
                                    p=p,
                                    r=r,
                                    f_1=f_1,
                                    auc_sc=auc_sc,
                                    acc=acc))

#rank by f1, best first; kept as the last expression so the notebook renders the table
classifications.sort_values('f_1', ascending=False)

Unnamed: 0,tf,name,bp,tn,fp,fn,tp,p,r,f_1,auc_sc,acc
2,features/df_tweet_train.pickle,lr,C 0.5;penalty l2;random_state 42,807.0,24.0,27.0,163.0,0.871658,0.857895,0.864721,0.914507,0.950049
0,features/df_tweet_tfidf_train.pickle,lr,C 0.1;penalty l2;random_state 42,809.0,22.0,31.0,159.0,0.878453,0.836842,0.857143,0.905184,0.94809
1,features/df_tweet_tf_train.pickle,lr,C 0.1;penalty l2;random_state 42,809.0,22.0,31.0,159.0,0.878453,0.836842,0.857143,0.905184,0.94809
15,features/df_tweet_train.pickle,dt,criterion gini;random_state 42,798.0,33.0,28.0,162.0,0.830769,0.852632,0.841558,0.90646,0.940255
8,features/df_tweet_train_nc.pickle,lr,C 0.5;penalty l2;random_state 42,805.0,26.0,36.0,154.0,0.855556,0.810526,0.832432,0.889619,0.939275
3,features/df_tweetbio_tfidf_train.pickle,lr,C 0.05;penalty l2;random_state 42,801.0,30.0,34.0,156.0,0.83871,0.821053,0.829787,0.892476,0.937316
4,features/df_tweetbio_tf_train.pickle,lr,C 0.05;penalty l2;random_state 42,801.0,30.0,34.0,156.0,0.83871,0.821053,0.829787,0.892476,0.937316
39,features/df_tweet_train.pickle,svm,C 0.05;kernel linear;random_state 42,803.0,28.0,36.0,154.0,0.846154,0.810526,0.827957,0.888416,0.937316
10,features/df_tweetbio_tf_train_nc.pickle,lr,C 0.05;penalty l2;random_state 42,801.0,30.0,35.0,155.0,0.837838,0.815789,0.826667,0.889844,0.936337
9,features/df_tweetbio_tfidf_train_nc.pickle,lr,C 0.05;penalty l2;random_state 42,801.0,30.0,35.0,155.0,0.837838,0.815789,0.826667,0.889844,0.936337


## Ensemble classifier

In [8]:
from sklearn.ensemble import VotingClassifier
#rebuild each base model with the hyperparameters chosen by the grid searches above
log_clf = LogisticRegression(penalty='l2',C=0.5, random_state=42) #logistic regression with best hyperparameters
svc_clf = SVC(C=0.05, kernel='linear', probability = True, random_state=42) #svm with best hyperparameters (probability estimates switched on for soft voting)
dt_clf = DecisionTreeClassifier(criterion='gini', random_state=42) #decision tree with best hyperparameters
rf_clf = RandomForestClassifier(criterion='gini', random_state=42) #random forest with best hyperparameters

#create the ensemble: equally weighted soft vote over the four models
e_clf = VotingClassifier(estimators=[('lr', log_clf), ('svm', svc_clf), ('rf', rf_clf), ('dt', dt_clf)],
                         voting='soft', weights=[1, 1, 1, 1])

#get training and evaluation data
x_train, y_train, class_train = xysplit('features/df_tweet_train.pickle') #split training data into X, Y
x_eval, y_eval, class_eval = xysplit('features/df_tweet_eval.pickle') #split evaluation data into X, Y

e_clf = e_clf.fit(x_train, y_train) #fit the ensemble

e_clf_pred = e_clf.predict(x_eval) #predict the evaluation data with the ensemble
conf_matrix = confusion_matrix(y_eval,e_clf_pred) #build confusion matrix
precision = precision_score(y_eval,e_clf_pred) #calculate precision
recall = recall_score(y_eval,e_clf_pred) #calculate recall
f1 = f1_score(y_eval,e_clf_pred) #calculate f1
#NOTE: the ROC curve is built from hard class predictions, matching how search_grid() reports auc
fpr, tpr, thresholds = roc_curve(y_eval,e_clf_pred)
auc_score = auc(fpr, tpr) #calculate auc
accuracy = accuracy_score(y_eval,e_clf_pred) #calculate accuracy
#keep tweet_id and attach the prediction; the actual class column is dropped.
#drop().assign() builds a new frame instead of writing into the slice returned by xysplit()
class_eval = class_eval.drop('class_column', axis=1).assign(pred=e_clf_pred)
print(conf_matrix)
print('precision = ' + str(precision))
print('recall = ' + str(recall))
print('f1 = ' + str(f1))
print('auc = ' + str(auc_score))
print('accuracy = ' + str(accuracy))

#append the latest results to the vectors
tf = np.append(tf,'features/df_tweet_train.pickle')
name = np.append(name,'ensemble (lr,svc,dt,rf)')
bp = np.append(bp,'ensemble')
tn = np.append(tn,conf_matrix[0][0])
fp = np.append(fp,conf_matrix[0][1])
fn = np.append(fn,conf_matrix[1][0])
tp = np.append(tp,conf_matrix[1][1])
p = np.append(p,precision)
r = np.append(r,recall)
f_1 = np.append(f_1,f1)
auc_sc = np.append(auc_sc,auc_score)
acc = np.append(acc,accuracy)

#tag the per-tweet predictions with the run that produced them
class_eval['model'] = 'ensemble (lr,svc,dt,rf)'
class_eval['file'] = 'features/df_tweet_eval.pickle'

#stack the ensemble predictions under the earlier ones;
#pd.concat replaces DataFrame.append, which was removed in pandas 2.0
df = pd.concat([df, class_eval])
        




[[821  10]
 [ 36 154]]
precision = 0.9390243902439024
recall = 0.8105263157894737
f1 = 0.8700564971751412
auc = 0.8992463107226549
accuracy = 0.9549461312438785


  if diff:


## Save the results

In [9]:
#to_pickle does not create missing directories; make sure the output folder exists
#so the results of the long runs above are not lost to a missing-path error
os.makedirs('results', exist_ok=True)

df.to_pickle('results/preds_per_tweet_local.pickle') #pickle the per-tweet predictions of every run

#rebuild the metrics table so it includes the rows appended after it was last created
classifications = pd.DataFrame({'tf':tf, #create a dataframe to hold the metrics
                                'name':name,
                               'bp':bp,
                               'tn':tn,
                               'fp':fp,
                               'fn':fn,
                               'tp':tp,
                               'p':p,
                               'r':r,
                               'f_1':f_1,
                               'auc_sc':auc_sc,
                               'acc':acc})
classifications.to_pickle('results/metrics_local.pickle') #pickle the metrics

## Print metric results descending by f1 value

In [None]:
#show the metrics table ranked by f1, best first
classifications.sort_values('f_1', ascending=False)