## Common Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale
from sklearn.utils import shuffle

## Create data and class inputs

In [2]:
def xysplit(file_location):
    """Load a pickled feature table and split it into model inputs.

    The frame is read from ``file_location``, ``tweet_id`` is cast to string,
    and the rows are shuffled with a fixed seed (42). Every column other than
    ``tweet_id`` and ``class_column`` is treated as a feature.

    Note: ``scale`` is applied to whichever file is passed in, so each file
    is standardised using its own statistics.

    Returns
    -------
    tuple
        ``(x, y, df_class)`` where ``x`` is the scaled feature array, ``y`` is
        the array of ``class_column`` values, and ``df_class`` is a frame
        holding ``tweet_id`` and ``class_column`` in the shuffled row order.
    """
    id_and_label = ['tweet_id', 'class_column']

    frame = pd.read_pickle(file_location)
    frame['tweet_id'] = frame.tweet_id.astype(str)
    frame = shuffle(frame, random_state=42)

    # Keep the identifier/label pair so predictions can be traced back to tweets.
    df_class = frame.loc[:, id_and_label]

    features = frame.drop(id_and_label, axis=1).values
    features = scale(features)
    labels = df_class.class_column.values
    return (features, labels, df_class)


## Create classifiers and import metrics

In [3]:
from sklearn.linear_model import LogisticRegression #import lr
from sklearn.svm import SVC #import svm
from sklearn.tree import DecisionTreeClassifier #import dt
from sklearn.ensemble import RandomForestClassifier #import rf
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, auc, roc_curve, accuracy_score #metrics
from sklearn.model_selection import GridSearchCV #grid search
# Base (untuned) estimators handed to search_grid().
# The 'lr' grid searches over both 'l1' and 'l2' penalties. The solver is
# pinned to 'liblinear' because it supports both; scikit-learn >= 0.22
# defaults to 'lbfgs', which rejects 'l1'. On older releases 'liblinear'
# was already the default, so this does not change earlier results.
log_clf = LogisticRegression(solver='liblinear')
svc_clf = SVC()
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier()

  from numpy.core.umath_tests import inner1d


## Function for fine tuning and evaluating classifiers

In [4]:
def search_grid(classifier,model, train_file, eval_file):
    """Tune ``classifier`` with a grid search, then score it on evaluation data.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn classifier passed to ``GridSearchCV``.
    model : str
        One of ``'lr'``, ``'dt'``, ``'rf'`` or ``'svm'``; selects the
        hyperparameter grid that is searched.
    train_file, eval_file : str
        Pickle locations passed to ``xysplit`` for the training and
        evaluation sets.

    Returns
    -------
    tuple
        ``(best_parameters, conf_matrix, precision, recall, f1, auc_score,
        accuracy, class_eval)``. ``class_eval`` holds ``tweet_id`` and the
        predicted class (``pred``); the actual class column is dropped.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported labels.
    """
    param_grids = {
        'lr': [{'random_state':[42], #logistic regression
                'C':[0.05,0.1,0.5,1],
                'penalty':['l1','l2']}],
        'dt': [{'random_state':[42], #decision tree
                'criterion':['gini','entropy']}],
        'rf': [{'random_state':[42], #random forest
                'criterion':['gini','entropy']}],
        'svm': [{'random_state':[42], #svm
                 'C':[0.05,0.1,1,10],
                 'kernel':['linear','rbf']}],
    }
    # Fail fast, before any data is loaded, instead of hitting an unbound
    # param_grid further down.
    if model not in param_grids:
        raise ValueError("unknown model {!r}; expected one of {}".format(model, sorted(param_grids)))
    param_grid = param_grids[model]

    x_train, y_train, class_train = xysplit(train_file) #split training data into X, Y
    x_eval, y_eval, class_eval = xysplit(eval_file) #split evaluation data into X, Y

    grid_search = GridSearchCV(classifier, param_grid, cv=10, scoring='recall') #grid search using 10-folds cross validation
    grid_search.fit(x_train, y_train) #fit grid search
    print("")
    print('Best parameters')
    best_parameters = grid_search.best_params_
    print(best_parameters) #print best parameters from grid search
    print('Best grid search score = ',grid_search.best_score_) #print best grid search score
    print("")
    print('Evaluation data scores')
    tuned_clf = grid_search.best_estimator_ #estimator refit with the best parameters
    tuned_clf_pred = tuned_clf.predict(x_eval) #predict using evaluation data with best parameters
    conf_matrix = confusion_matrix(y_eval,tuned_clf_pred) #build confusion matrix
    precision = precision_score(y_eval,tuned_clf_pred) #calculate precision
    recall = recall_score(y_eval,tuned_clf_pred) #calculate recall
    f1 = f1_score(y_eval,tuned_clf_pred) #calculate f1
    # NOTE(review): roc_curve is given hard class predictions, not scores or
    # probabilities, so this AUC is computed from predicted labels only.
    fpr, tpr, thresholds = roc_curve(y_eval,tuned_clf_pred)
    auc_score = auc(fpr, tpr) #calculate auc
    accuracy = accuracy_score(y_eval,tuned_clf_pred) #calculate accuracy
    class_eval['pred'] = tuned_clf_pred #attach the prediction to each tweet_id
    class_eval = class_eval.drop('class_column', axis=1) #keep only tweet_id and pred
    print(conf_matrix)
    print('precision = ' + str(precision))
    print('recall = ' + str(recall))
    print('f1 = ' + str(f1))
    print('auc = ' + str(auc_score))
    print('accuracy = ' + str(accuracy))

    return(best_parameters, conf_matrix, precision, recall, f1, auc_score, accuracy, class_eval) #return metrics and the prediction for each tweet

## Run the "search_grid()" function for lr, dt, and rf

In [5]:
from datetime import datetime
current = datetime.now() #for checking duration

tf = []     #initialise empty vectors to hold results
name = []
bp = []
tn = []
fp = []
fn = []
tp = []
p = []
r = []
f_1 = []
auc_sc = []
acc = []

classifiers = [log_clf, dt_clf, rf_clf] #the classifiers that are to be tested
models = ['lr','dt','rf'] #labels for identifying the results

train_files = ['features/df_tweet_tfidf_train.pickle', #the file locations for the training data sets
               'features/df_tweet_tf_train.pickle',
               'features/df_tweet_train.pickle', 
               'features/df_tweetbio_tfidf_train.pickle',
               'features/df_tweetbio_tf_train.pickle',
               'features/df_tweetbio_train.pickle']

eval_files = ['features/df_tweet_tfidf_eval.pickle', #the file locations for the evaluation data sets
              'features/df_tweet_tf_eval.pickle', 
              'features/df_tweet_eval.pickle', 
              'features/df_tweetbio_tfidf_eval.pickle', 
              'features/df_tweetbio_tf_eval.pickle',
              'features/df_tweetbio_eval.pickle']

prediction_frames = [] #one prediction table per (model, file) run
for classifier, model in zip(classifiers, models): #zip through the classifiers and model names
    for train_file, eval_file in zip(train_files, eval_files): #zip through the training and evaluation combos
        #execute the search_grid() function
        best_parameters, conf_matrix, precision, recall, f1, auc_score, accuracy, class_eval = search_grid(classifier,model, train_file, eval_file)
        
        #append the latest results to the vectors
        tf = np.append(tf,train_file)
        name = np.append(name,model)
        b = ';'.join('{} {}'.format(key, val) for key, val in best_parameters.items())
        bp = np.append(bp,b)
        tn = np.append(tn,conf_matrix[0][0])
        fp = np.append(fp,conf_matrix[0][1])
        fn = np.append(fn,conf_matrix[1][0])
        tp = np.append(tp,conf_matrix[1][1])
        p = np.append(p,precision)
        r = np.append(r,recall)
        f_1 = np.append(f_1,f1)
        auc_sc = np.append(auc_sc,auc_score)
        acc = np.append(acc,accuracy)
        
        #tag the predictions with the run they came from
        class_eval['model'] = model
        class_eval['file'] = eval_file
        prediction_frames.append(class_eval)

#stack all runs once; DataFrame.append was removed in pandas 2.0, and
#pd.concat keeps each frame's original index just as append did
df = pd.concat(prediction_frames)

print('time taken = ',datetime.now() - current) #print the time taken


Best parameters
{'C': 0.05, 'penalty': 'l2', 'random_state': 42}
Best grid search score =  0.832074478607359

Evaluation data scores
[[814  17]
 [ 34 156]]
precision = 0.9017341040462428
recall = 0.8210526315789474
f1 = 0.8595041322314049
auc = 0.9002976755969345
accuracy = 0.9500489715964741

Best parameters
{'C': 0.05, 'penalty': 'l2', 'random_state': 42}
Best grid search score =  0.832074478607359

Evaluation data scores
[[814  17]
 [ 34 156]]
precision = 0.9017341040462428
recall = 0.8210526315789474
f1 = 0.8595041322314049
auc = 0.9002976755969345
accuracy = 0.9500489715964741

Best parameters
{'C': 1, 'penalty': 'l2', 'random_state': 42}
Best grid search score =  0.8413350854386856

Evaluation data scores
[[804  27]
 [ 27 163]]
precision = 0.8578947368421053
recall = 0.8578947368421053
f1 = 0.8578947368421053
auc = 0.9127018810564317
accuracy = 0.9471106758080313

Best parameters
{'C': 0.05, 'penalty': 'l2', 'random_state': 42}
Best grid search score =  0.8336535517813072

Evalu

## Run the "search_grid()" function for svm (separated process due to the long duration)

In [6]:
current = datetime.now() #for checking duration

classifiers = [svc_clf] #the classifier that is to be tested
models = ['svm'] #a label used to identify the results

train_files = ['features/df_tweet_train.pickle'] #the location of the training data

eval_files = ['features/df_tweet_eval.pickle'] #the location of the evaluation data

for classifier, model in zip(classifiers, models): #zip through the classifier and label combos
    for train_file, eval_file in zip(train_files, eval_files): #zip through the train and evaluation data combos
        #execute the search_grid() function
        best_parameters, conf_matrix, precision, recall, f1, auc_score, accuracy, class_eval = search_grid(classifier,model, train_file, eval_file)
        
        #append the latest results to the vectors
        tf = np.append(tf,train_file)
        name = np.append(name,model)
        b = ';'.join('{} {}'.format(key, val) for key, val in best_parameters.items())
        bp = np.append(bp,b)
        tn = np.append(tn,conf_matrix[0][0])
        fp = np.append(fp,conf_matrix[0][1])
        fn = np.append(fn,conf_matrix[1][0])
        tp = np.append(tp,conf_matrix[1][1])
        p = np.append(p,precision)
        r = np.append(r,recall)
        f_1 = np.append(f_1,f1)
        auc_sc = np.append(auc_sc,auc_score)
        acc = np.append(acc,accuracy)
        
        #tag the predictions with the run they came from
        class_eval['model'] = model
        class_eval['file'] = eval_file

        #stack the latest predictions under the earlier runs
        #(DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent)
        df = pd.concat([df, class_eval])


print('time taken = ',datetime.now() - current) #print the time taken


Best parameters
{'C': 0.05, 'kernel': 'linear', 'random_state': 42}
Best grid search score =  0.8058744395146857

Evaluation data scores
[[802  29]
 [ 37 153]]
precision = 0.8406593406593407
recall = 0.8052631578947368
f1 = 0.8225806451612903
auc = 0.8851827221483312
accuracy = 0.9353574926542605
time taken =  0:19:11.994151


## Ensemble classifier 1

In [9]:
from sklearn.ensemble import VotingClassifier
log_clf = LogisticRegression(penalty='l2',C=1, random_state=42) #logistic regression with best hyperparameters
svc_clf = SVC(C=0.05, kernel='linear', probability = True, random_state=42) #svm with best hyperparameters; probability=True is needed for soft voting
#the two tree-based names were previously bound to each other's classes,
#so the 'rf' and 'dt' labels in the ensemble pointed at the wrong estimators
rf_clf = RandomForestClassifier(criterion='gini', random_state=42) #random forest with best hyperparameters
dt_clf = DecisionTreeClassifier(criterion='gini', random_state=42) #decision tree with best hyperparameters

#create the ensemble: equally weighted soft vote over the four tuned models
e_clf = VotingClassifier(estimators=[('lr', log_clf), ('svm', svc_clf), ('rf', rf_clf), ('dt', dt_clf)],
                         voting='soft', weights=[1, 1, 1, 1])

#get training and evaluation data
x_train, y_train, class_train = xysplit('features/df_tweet_train.pickle') #split training data into X, Y
x_eval, y_eval, class_eval = xysplit('features/df_tweet_eval.pickle') #split evaluation data into X, Y

e_clf = e_clf.fit(x_train, y_train) #fit the ensemble

e_clf_pred = e_clf.predict(x_eval) #predict on the evaluation data
conf_matrix = confusion_matrix(y_eval,e_clf_pred) #build confusion matrix
precision = precision_score(y_eval,e_clf_pred) #calculate precision
recall = recall_score(y_eval,e_clf_pred) #calculate recall
f1 = f1_score(y_eval,e_clf_pred) #calculate f1
fpr, tpr, thresholds = roc_curve(y_eval,e_clf_pred)
auc_score = auc(fpr, tpr) #calculate auc
accuracy = accuracy_score(y_eval,e_clf_pred) #calculate accuracy
class_eval['pred'] = e_clf_pred #attach the prediction to each tweet_id
class_eval = class_eval.drop('class_column', axis=1) #keep only tweet_id and pred
print(conf_matrix)
print('precision = ' + str(precision))
print('recall = ' + str(recall))
print('f1 = ' + str(f1))
print('auc = ' + str(auc_score))
print('accuracy = ' + str(accuracy))

#append the latest results to the vectors
tf = np.append(tf,'features/df_tweet_train.pickle')
name = np.append(name,'ensemble')
bp = np.append(bp,'ensemble')
tn = np.append(tn,conf_matrix[0][0])
fp = np.append(fp,conf_matrix[0][1])
fn = np.append(fn,conf_matrix[1][0])
tp = np.append(tp,conf_matrix[1][1])
p = np.append(p,precision)
r = np.append(r,recall)
f_1 = np.append(f_1,f1)
auc_sc = np.append(auc_sc,auc_score)
acc = np.append(acc,accuracy)

class_eval['model'] = 'ensemble'
class_eval['file'] = 'features/df_tweet_eval.pickle'

#stack the ensemble predictions under the earlier runs
#(DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent)
df = pd.concat([df, class_eval])
        




[[820  11]
 [ 38 152]]
precision = 0.9325153374233128
recall = 0.8
f1 = 0.8611898016997167
auc = 0.89338146811071
accuracy = 0.9520078354554359


  if diff:


## Save the results

In [10]:
# Persist the predicted class per tweet_id for every model/file run.
df.to_pickle('results/classifier_results.pickle')

# One row per run; each column is one of the accumulated result vectors.
metric_columns = dict(tf=tf, name=name, bp=bp,
                      tn=tn, fp=fp, fn=fn, tp=tp,
                      p=p, r=r, f_1=f_1,
                      auc_sc=auc_sc, acc=acc)
classifications = pd.DataFrame(metric_columns)

# Persist the metrics table alongside the predictions.
classifications.to_pickle('results/classifications.pickle')

## Print metric results descending by f1 value

In [11]:
# Rank every run by F1, best first; the bare expression renders the table.
classifications.sort_values(by=['f_1'], ascending=False)

Unnamed: 0,acc,auc_sc,bp,f_1,fn,fp,name,p,r,tf,tn,tp
19,0.952008,0.893381,ensemble,0.86119,38.0,11.0,ensemble,0.932515,0.8,features/df_tweet_train.pickle,820.0,152.0
1,0.950049,0.900298,C 0.05;penalty l2;random_state 42,0.859504,34.0,17.0,lr,0.901734,0.821053,features/df_tweet_tf_train.pickle,814.0,156.0
0,0.950049,0.900298,C 0.05;penalty l2;random_state 42,0.859504,34.0,17.0,lr,0.901734,0.821053,features/df_tweet_tfidf_train.pickle,814.0,156.0
2,0.947111,0.912702,C 1;penalty l2;random_state 42,0.857895,27.0,27.0,lr,0.857895,0.857895,features/df_tweet_train.pickle,804.0,163.0
3,0.941234,0.896912,C 0.05;penalty l2;random_state 42,0.839572,33.0,27.0,lr,0.853261,0.826316,features/df_tweetbio_tfidf_train.pickle,804.0,157.0
4,0.941234,0.896912,C 0.05;penalty l2;random_state 42,0.839572,33.0,27.0,lr,0.853261,0.826316,features/df_tweetbio_tf_train.pickle,804.0,157.0
8,0.933399,0.896159,criterion gini;random_state 42,0.823834,31.0,37.0,dt,0.811224,0.836842,features/df_tweet_train.pickle,794.0,159.0
18,0.935357,0.885183,C 0.05;kernel linear;random_state 42,0.822581,37.0,29.0,svm,0.840659,0.805263,features/df_tweet_train.pickle,802.0,153.0
5,0.932419,0.887437,C 0.1;penalty l2;random_state 42,0.817942,35.0,34.0,lr,0.820106,0.815789,features/df_tweetbio_train.pickle,797.0,155.0
12,0.924584,0.803458,criterion gini;random_state 42,0.750809,74.0,3.0,rf,0.97479,0.610526,features/df_tweet_tfidf_train.pickle,828.0,116.0


In [None]:
# Export the combined per-tweet predictions as CSV (index column included).
df.to_csv(path_or_buf='df.csv')