# Multi-label classification of racial tweets:

In [1]:
#Get directory path
import os
os.chdir('/Users/valery/Documents/Springboard/Capstone_2') 
os.getcwd( )

'/Users/valery/Documents/Springboard/Capstone_2'

In [2]:
import pandas as pd

#Read in csv of cleaned data
df = pd.read_csv('df_tweets.csv', index_col=0)

#Create training set
df_train = df[df['tweet'].notnull()]
print(df_train.shape)
print('')
print(df_train.columns)
df_train.head()

(6481, 5)

Index(['tweetidg', 'tweet', 'positive', 'negative', 'neutral'], dtype='object')


Unnamed: 0,tweetidg,tweet,positive,negative,neutral
0,588687492888551424g,i am the type of nigga tryna get rich,0.0,0.0,1.0
1,592553981601304576g,sheblasiannn smuckers,0.0,0.0,1.0
2,717371522956984320g,proteinwisdom we even make japanese cars and e...,0.0,0.0,1.0
3,590292541125328898g,i wanted to see my nigga when i went back home...,0.0,1.0,0.0
4,592522044480225282g,wth stevo is that a nigga,0.0,0.0,1.0


In [3]:
#import packages for analysis and vizualization.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
#Import packages for algorithm analysis
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, roc_auc_score

#Import algorithm packages.
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


### Create a table to examine the distribution of tags

In [5]:
df_train.columns

Index(['tweetidg', 'tweet', 'positive', 'negative', 'neutral'], dtype='object')

In [6]:
df_sparse = df_train.drop(['tweetidg', 'tweet'], axis=1)

counts = []

#Calculate the sum of tweets for each category.
categories = list(df_sparse.columns.values)
for cat in categories:
    numtwts = df_sparse[cat].sum()
    counts.append((cat, numtwts))

#Create dataframe to display the category counts and priors (starting probabilities).
df_stats = pd.DataFrame(counts, columns=['Category', 'Number_of_Tweets'])

#Calculate 
total = df_stats.Number_of_Tweets.sum()

#Calculate priors.
df_stats['priors'] = df_stats['Number_of_Tweets']/total

print(df_stats)
print('')
print('Total number of labels:  ', total)

   Category  Number_of_Tweets    priors
0  positive            1642.0  0.253356
1  negative            1977.0  0.305046
2   neutral            2862.0  0.441599

Total number of labels:   6481.0


## Multi-label classification

In [7]:
#Make sure all entries in df_matrix['text'] are strings
df_train['tweet'] = df_train['tweet'].astype(str)
categories = ['positive', 'negative', 'neutral']

In [8]:
#Import packages to vectorize and split the data
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

## Measuring accuracy for the MultinomialNB

In [9]:

vectorizer = CountVectorizer(stop_words='english',binary=False,max_df=.4, ngram_range=(1, 1),
                                     min_df=1,strip_accents='unicode',max_features=200)

featurizer = TfidfVectorizer(stop_words='english',binary=False,max_df=.4,min_df=1, max_features=200)
Xv = vectorizer.fit_transform(df_train.tweet)
Xt = featurizer.fit_transform(df_train.tweet)


clf_NB = MultinomialNB()

print("CountVectorizer\n")

for category in categories:
    y = df_train[category].values.astype(np.int)
    X_train, X_test, y_train, y_test = train_test_split(Xv, y, test_size=0.30, random_state=42)

    clf_NB.fit(X_train,y_train)

    train_accuracy =clf_NB.score(X_train,y_train)
    test_accuracy = clf_NB.score(X_test, y_test)
    print(category, "training set accuracy: ", train_accuracy)
    print(category, "test set accuracy: ", test_accuracy, '\n')
    
print("\nTfidfVectorizer")

for category in categories:
    y = df_train[category].values.astype(np.int)
    X_train, X_test, y_train, y_test = train_test_split(Xt, y, test_size=0.30, random_state=42)

    clf_NB.fit(X_train,y_train)

    train_accuracy =clf_NB.score(X_train,y_train)
    test_accuracy = clf_NB.score(X_test, y_test)
    print(category, "training set accuracy: ", train_accuracy)
    print(category, "test set accuracy: ", test_accuracy, '\n')
          

CountVectorizer

positive training set accuracy:  0.7654320987654321
positive test set accuracy:  0.7552699228791774 

negative training set accuracy:  0.7795414462081128
negative test set accuracy:  0.7588688946015424 

neutral training set accuracy:  0.6664462081128748
neutral test set accuracy:  0.6627249357326478 


TfidfVectorizer
positive training set accuracy:  0.7766754850088183
positive test set accuracy:  0.7660668380462725 

negative training set accuracy:  0.7954144620811288
negative test set accuracy:  0.7748071979434448 

neutral training set accuracy:  0.671957671957672
neutral test set accuracy:  0.6637532133676093 



This is looking good! But accuracy can be deceiving. High accuracy scores can still happen if your model detects many true negatives, but few true positives. **TfidfVectorizer** performed better for accuracy with the MultinomialNB.

High accuracy doesn't always mean high precision or recall. For this study, we want to decrease the probability of a Type 2 error - a false positive where we assign a label when it is not. For this, recall is a good measure to optimize because it is a measure of how correctly we assign a positive (value =  1) label to each category. It is a ratio of positive (value = 1) instances for each category that are correctly labeled by the classifier. 

## Tune & Train the Hyperparameters for Multinomial NB

In [10]:

i=1

vectors = [Xv, Xt]
for vect in vectors:
    
    conmatrxNB = []
    precisionNB = []
    recallNB = []
    f1NB = []

    clf_NB = MultinomialNB()

    alphas = [.00001,.0001, .001, .01, .1, 1, 5, 10, 50]

    # Create hyperparameter options
    hyperparameters = dict(alpha=alphas)

    # Create grid search using 5-fold cross validation
    clf_tune = GridSearchCV(clf_NB, hyperparameters, cv=5, scoring='recall')               


    for category in categories:
        # Fit grid search
        best_model = clf_tune.fit(vect, df_train[category])                
        alpha = best_model.best_estimator_.get_params()['alpha']
    
        clf_tunedNB = MultinomialNB(alpha=alpha)
    
        y_train_pred = cross_val_predict(clf_tunedNB, vect, df_train[category], cv=5)

    
        cmNB = confusion_matrix(df_train[category], y_train_pred)
        conmatrxNB.append(cmNB)
        precNB = precision_score(df_train[category], y_train_pred)    
        precisionNB.append(precNB)
        recNB = recall_score(df_train[category], y_train_pred)
        recallNB.append(recNB)
        fNB = f1_score(df_train[category], y_train_pred)
        f1NB.append(fNB)
        

    
    dictNB = {'Categories':categories, 'Confusion Matrix':conmatrxNB, 'Precision_NB':precisionNB, 
          'Recall_NB':recallNB, 'F1_NB':f1NB}


    df_NB = pd.DataFrame(dictNB) 
    #df_NB.to_csv('Results_NB_HBR.csv')

    if i == 1:
        print('CountVectorizer\n')
        i += 1
    else:
        print('\nTfidfVectorizer')
    
    print(df_NB)

CountVectorizer

  Categories             Confusion Matrix  Precision_NB  Recall_NB     F1_NB
0   positive   [[4150, 689], [1083, 559]]      0.447917   0.340438  0.386851
1   negative   [[3519, 985], [532, 1445]]      0.594650   0.730905  0.655775
2    neutral  [[3010, 609], [1613, 1249]]      0.672228   0.436408  0.529237

TfidfVectorizer
  Categories             Confusion Matrix  Precision_NB  Recall_NB     F1_NB
0   positive   [[4527, 312], [1278, 364]]      0.538462   0.221681  0.314064
1   negative   [[3797, 707], [753, 1224]]      0.633868   0.619120  0.626407
2    neutral  [[3031, 588], [1624, 1238]]      0.677985   0.432565  0.528157


The CountVectorizer gave the highest Recall scores with the tuned multinomialNB classifier. Below I will run the model again using CountVectorizer. This is to load the correct results in the Naive Bayes dataframe (df_NB). This dataframe will be used to compare the performance of the algorithms on this data.

In [11]:
#Run model with CountVectorizer and get AUC scores

conmatrxNB = []
precisionNB = []
recallNB = []
f1NB = []
aucNB = []
aveprecNB = []

clf_NB = MultinomialNB()

alphas = [.00001,.0001, .001, .01, .1, 1, 5, 10, 50]

# Create hyperparameter options
hyperparameters = dict(alpha=alphas)

# Create grid search using 5-fold cross validation
clf_tune = GridSearchCV(clf_NB, hyperparameters, cv=5, scoring='recall')               


for category in categories:
    # Fit grid search
    best_model = clf_tune.fit(Xv, df_train[category])                
    alpha = best_model.best_estimator_.get_params()['alpha']
    
    clf_tunedNB = MultinomialNB(alpha=alpha)
    
    y_train_pred = cross_val_predict(clf_tunedNB, Xv, df_train[category], cv=5)

    cmNB = confusion_matrix(df_train[category], y_train_pred)
    conmatrxNB.append(cmNB)
    precNB = precision_score(df_train[category], y_train_pred)    
    precisionNB.append(precNB)
    recNB = recall_score(df_train[category], y_train_pred)
    recallNB.append(recNB)
    fNB = f1_score(df_train[category], y_train_pred)
    f1NB.append(fNB)
    
    #Make training and testing data.
    y = df_train[category].values.astype(np.int)
    
    X_train, X_test, y_train, y_test = train_test_split(Xv, y, test_size=0.3)
    
    #Fit the tuned classifier to the training data.
    clf_tunedNB.fit(X_train, y_train)

    #Calculate AUC score with the test set
    y_pred = clf_tunedNB.predict(X_test)
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    auc_nb = roc_auc_score(y_test, y_pred)
    aucNB.append(auc_nb)
    

dictNB = {'Categories':categories, 'Confusion Matrix':conmatrxNB, 'Precision_NB':precisionNB, 
          'Recall_NB':recallNB, 'F1_NB':f1NB, 'auc_NB':aucNB}


df_NB = pd.DataFrame(dictNB) 
#df_NB.to_csv('Results_NB_HBR.csv')
    
df_NB

Unnamed: 0,Categories,Confusion Matrix,Precision_NB,Recall_NB,F1_NB,auc_NB
0,positive,"[[4150, 689], [1083, 559]]",0.447917,0.340438,0.386851,0.617065
1,negative,"[[3519, 985], [532, 1445]]",0.59465,0.730905,0.655775,0.750151
2,neutral,"[[3010, 609], [1613, 1249]]",0.672228,0.436408,0.529237,0.63556


## Tune & Train the hyperparameters for LinearSVC

In [12]:
import warnings
warnings.filterwarnings("ignore")

i=1

vectors = [Xv, Xt]
for vect in vectors:

    conmatrxSVC = []
    precisionSVC = []
    recallSVC = []
    f1SVC = []
    
    clf_SVC = LinearSVC(random_state=42)

    loss = ['hinge', 'squared_hinge']
    
    C = [1, 10, 100, 1000, 10000]

    # Create hyperparameter options
    hyperparameters = dict(C=C, loss=loss)

    # Create grid search using 5-fold cross validation
    clf_tune = GridSearchCV(clf_SVC, hyperparameters, cv=5, scoring='recall')   

    for category in categories:
        # Fit grid search
        best_model = clf_tune.fit(vect, df_train[category])                
    
        SVCloss = best_model.best_estimator_.get_params()['loss']
        SVC_C = best_model.best_estimator_.get_params()['C']
    
        clf_tunedSVC = LinearSVC(random_state=42, loss=SVCloss, C=SVC_C)
    
        y_train_pred = cross_val_predict(clf_tunedSVC, vect, df_train[category], cv=5)
   
        cmSVC = confusion_matrix(df_train[category], y_train_pred)
        conmatrxSVC.append(cmSVC)
        precSVC = precision_score(df_train[category], y_train_pred)    
        precisionSVC.append(precSVC)
        recSVC = recall_score(df_train[category], y_train_pred)
        recallSVC.append(recSVC)
        fSVC = f1_score(df_train[category], y_train_pred)
        f1SVC.append(fSVC)
    
    dictSVC = {'Categories':categories, 'Confusion Matrix':conmatrxSVC, 'Precision_SVC':precisionSVC, 
           'Recall_SVC':recallSVC, 'F1_SVC':f1SVC}
    df_SVC = pd.DataFrame(dictSVC) 
    #df_SVC.to_csv('Results_SVC_HBR.csv')

    if i == 1:
        print('CountVectorizer\n')
        i += 1
    else:
        print('\nTfidfVectorizer')
    
    print(df_SVC)

CountVectorizer

  Categories             Confusion Matrix  Precision_SVC  Recall_SVC    F1_SVC
0   positive   [[3281, 1558], [869, 773]]       0.331617    0.470767  0.389127
1   negative  [[3111, 1393], [692, 1285]]       0.479836    0.649975  0.552095
2    neutral  [[1761, 1858], [928, 1934]]       0.510021    0.675751  0.581304

TfidfVectorizer
  Categories              Confusion Matrix  Precision_SVC  Recall_SVC  \
0   positive    [[3153, 1686], [865, 777]]       0.315469    0.473203   
1   negative   [[3075, 1429], [788, 1189]]       0.454163    0.601416   
2    neutral  [[1992, 1627], [1012, 1850]]       0.532068    0.646401   

     F1_SVC  
0  0.378563  
1  0.517519  
2  0.583688  


CountVectorizer gave slightly better recall scores for LinearSVC.

In [13]:
#Run model with CountVectorizer and get AUC scores

conmatrxSVC = []
precisionSVC = []
recallSVC = []
f1SVC = []
aucSVC = []

clf_SVC = LinearSVC(random_state=42)

loss = ['hinge', 'squared_hinge']
    
C = [1, 10, 100, 1000, 10000]

# Create hyperparameter options
hyperparameters = dict(C=C, loss=loss)

# Create grid search using 5-fold cross validation
clf_tune = GridSearchCV(clf_SVC, hyperparameters, cv=5, scoring='recall')   


for category in categories:
    
    # Fit grid search
    best_model = clf_tune.fit(Xv, df_train[category])                
    
    SVCloss = best_model.best_estimator_.get_params()['loss']
    SVC_C = best_model.best_estimator_.get_params()['C']
    
    clf_tunedSVC = LinearSVC(random_state=42, loss=SVCloss, C=SVC_C)
    
    y_train_pred = cross_val_predict(clf_tunedSVC, Xv, df_train[category], cv=5)
   
    #Get algorithm performance measures
    cmSVC = confusion_matrix(df_train[category], y_train_pred)
    conmatrxSVC.append(cmSVC)
    precSVC = precision_score(df_train[category], y_train_pred)    
    precisionSVC.append(precSVC)
    recSVC = recall_score(df_train[category], y_train_pred)
    recallSVC.append(recSVC)
    fSVC = f1_score(df_train[category], y_train_pred)
    f1SVC.append(fSVC)
    
    #Make training and testing data.
    y = df_train[category].values.astype(np.int)
    
    X_train, X_test, y_train, y_test = train_test_split(Xv, y, test_size=0.3)
    
    #Fit the tuned classifier to the training data.
    clf_tunedSVC.fit(X_train, y_train)

    #Calculate AUC score with the test set
    y_pred = clf_tunedSVC.predict(X_test)
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    auc_svc = roc_auc_score(y_test, y_pred)
    aucSVC.append(auc_svc)
    
dictSVC = {'Categories':categories, 'Confusion Matrix':conmatrxSVC, 'Precision_SVC':precisionSVC, 
           'Recall_SVC':recallSVC, 'F1_SVC':f1SVC, 'auc_SVC':aucSVC}
df_SVC = pd.DataFrame(dictSVC) 
#df_SVC.to_csv('Results_SVC_HBR.csv')

df_SVC

Unnamed: 0,Categories,Confusion Matrix,Precision_SVC,Recall_SVC,F1_SVC,auc_SVC
0,positive,"[[3281, 1558], [869, 773]]",0.331617,0.470767,0.389127,0.558591
1,negative,"[[3111, 1393], [692, 1285]]",0.479836,0.649975,0.552095,0.659201
2,neutral,"[[1761, 1858], [928, 1934]]",0.510021,0.675751,0.581304,0.616927


## Tune & Train Hyperparameters for LR

All solvers support the l2 penalty (‘newton-cg’, ‘sag’ and ‘lbfgs’ solvers do not support L1 penalty). So, hyperparameters are optimized using the L2 penalty.

In [14]:
i=1

vectors = [Xv, Xt]
for vect in vectors:

    conmatrxLR = []
    precisionLR = []
    recallLR = []
    f1LR = []

    clf_LR = LogisticRegression(random_state=42)
    
    # Create regularization hyperparameter space
    C = np.logspace(-4, 4, 20)

    solver = ['liblinear', 'sag', 'newton-cg', 'saga', 'lbfgs']

    # Create hyperparameter options
    hyperparameters = dict(solver = solver, C=C)

    # Create grid search using 5-fold cross validation
    clf_tuneLR = GridSearchCV(clf_LR, hyperparameters, cv=5, verbose=0, scoring='recall')       

    for category in categories:
    
        # Fit grid search
        best_model = clf_tuneLR.fit(vect, df_train[category])                
        LRsolver = best_model.best_estimator_.get_params()['solver']
        LR_C = best_model.best_estimator_.get_params()['C']      

        clf_tunedLR = LogisticRegression(penalty='l2', C=LR_C, solver=LRsolver)
    
        y_train_pred = cross_val_predict(clf_tunedLR, vect, df_train[category], cv=5)
    
        cmLR = confusion_matrix(df_train[category], y_train_pred)
        conmatrxLR.append(cmLR)
        precLR = precision_score(df_train[category], y_train_pred)    
        precisionLR.append(precLR)
        recLR = recall_score(df_train[category], y_train_pred)
        recallLR.append(recLR)
        fLR = f1_score(df_train[category], y_train_pred)
        f1LR.append(fLR)
        
        

    dictLR = {'Categories':categories, 'Confusion Matrix':conmatrxLR, 'Precision_LR':precisionLR, 
           'Recall_LR':recallLR, 'F1_LR':f1LR}
    df_LR = pd.DataFrame(dictLR) 
    #df_LR.to_csv('Results_LR_HBR.csv')
 
    if i == 1:
        print('CountVectorizer\n')
        i += 1
    else:
        print('\nTfidfVectorizer')
    
    print(df_LR)

CountVectorizer

  Categories             Confusion Matrix  Precision_LR  Recall_LR     F1_LR
0   positive   [[4464, 375], [1241, 401]]      0.516753   0.244214  0.331679
1   negative   [[4051, 453], [957, 1020]]      0.692464   0.515933  0.591304
2    neutral  [[2730, 889], [1169, 1693]]      0.655693   0.591544  0.621969

TfidfVectorizer
  Categories             Confusion Matrix  Precision_LR  Recall_LR     F1_LR
0   positive   [[4447, 392], [1235, 407]]      0.509387   0.247868  0.333470
1   negative   [[3935, 569], [834, 1143]]      0.667640   0.578149  0.619680
2    neutral  [[2786, 833], [1235, 1627]]      0.661382   0.568484  0.611424


2 out of 3 labels performed better with TfidfVectorizer.

In [15]:
#Run model with CountVectorizer and get AUC scores

conmatrxLR = []
precisionLR = []
recallLR = []
f1LR = []
aucLR = []

# Create regularization hyperparameter space
C = np.logspace(-4, 4, 20)

solver = ['liblinear', 'sag', 'newton-cg', 'saga', 'lbfgs']

# Create hyperparameter options
hyperparameters = dict(solver = solver, C=C)

# Create grid search using 5-fold cross validation
clf_tuneLR = GridSearchCV(clf_LR, hyperparameters, cv=5, verbose=0, scoring='recall')       

for category in categories:
    
    # Fit grid search
    best_model = clf_tuneLR.fit(Xt, df_train[category])                
    LRsolver = best_model.best_estimator_.get_params()['solver']
    LR_C = best_model.best_estimator_.get_params()['C']      

    clf_tunedLR = LogisticRegression(penalty='l2', C=LR_C, solver=LRsolver)
    
    y_train_pred = cross_val_predict(clf_tunedLR, Xt, df_train[category], cv=5)
    
    cmLR = confusion_matrix(df_train[category], y_train_pred)
    conmatrxLR.append(cmLR)
    precLR = precision_score(df_train[category], y_train_pred)    
    precisionLR.append(precLR)
    recLR = recall_score(df_train[category], y_train_pred)
    recallLR.append(recLR)
    fLR = f1_score(df_train[category], y_train_pred)
    f1LR.append(fLR)
    
    #Make training and testing data.
    y = df_train[category].values.astype(np.int)
    
    X_train, X_test, y_train, y_test = train_test_split(Xt, y, test_size=0.3)
    
    #Fit the tuned classifier to the training data.
    clf_tunedLR.fit(X_train, y_train)

    #Calculate AUC score with the test set
    y_pred = clf_tunedLR.predict(X_test)
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    auc_lr = roc_auc_score(y_test, y_pred)
    aucLR.append(auc_lr)

dictLR = {'Categories':categories, 'Confusion Matrix':conmatrxLR, 'Precision_LR':precisionLR, 
           'Recall_LR':recallLR, 'F1_LR':f1LR, 'auc_LR':aucLR}
df_LR = pd.DataFrame(dictLR) 
#df_LR.to_csv('Results_LR.csv')

df_LR

Unnamed: 0,Categories,Confusion Matrix,Precision_LR,Recall_LR,F1_LR,auc_LR
0,positive,"[[4447, 392], [1235, 407]]",0.509387,0.247868,0.33347,0.578982
1,negative,"[[3935, 569], [834, 1143]]",0.66764,0.578149,0.61968,0.718866
2,neutral,"[[2786, 833], [1235, 1627]]",0.661382,0.568484,0.611424,0.67209


## Tune & Train Hyperparameters for RandomForest

In [16]:
i=1

vectors = [Xv, Xt]
for vect in vectors:

    conmatrxRF = []
    precisionRF = []
    recallRF = []
    f1RF = []

    clf_RF = RandomForestClassifier(random_state=42)

    n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]

    # Create hyperparameter options
    hyperparameters = dict(n_estimators=n_estimators)


    # Create grid search using 5-fold cross validation
    clf_tune = GridSearchCV(clf_RF, hyperparameters, cv=5, scoring='recall')

    for category in categories:
        # Fit grid search
        best_model = clf_tune.fit(vect, df_train[category]) 
        RFestimators = best_model.best_estimator_.get_params()['n_estimators']
                
        clf_tunedRF = RandomForestClassifier(random_state=42, n_estimators=RFestimators)
    
        y_train_pred = cross_val_predict(clf_tunedRF, vect, df_train[category], cv=5)

    
        cmRF = confusion_matrix(df_train[category], y_train_pred)
        conmatrxRF.append(cmRF)
        precRF = precision_score(df_train[category], y_train_pred)    
        precisionRF.append(precRF)
        recRF = recall_score(df_train[category], y_train_pred)
        recallRF.append(recRF)
        fRF = f1_score(df_train[category], y_train_pred)
        f1RF.append(fRF)

    
    dictRF = {'Categories':categories, 'Confusion Matrix':conmatrxRF, 'Precision_RF':precisionRF, 
           'Recall_RF':recallRF, 'F1_RF':f1RF}
    df_RF = pd.DataFrame(dictRF) 
    #df_RF.to_csv('Results_RF.csv')

    if i == 1:
        print('CountVectorizer\n')
        i += 1
    else:
        print('\nTfidfVectorizer')
    
    print(df_RF)

CountVectorizer

  Categories              Confusion Matrix  Precision_RF  Recall_RF     F1_RF
0   positive    [[4058, 781], [1151, 491]]      0.386006   0.299026  0.336994
1   negative    [[3801, 703], [841, 1136]]      0.617727   0.574608  0.595388
2    neutral  [[2475, 1144], [1136, 1726]]      0.601394   0.603075  0.602233

TfidfVectorizer
  Categories              Confusion Matrix  Precision_RF  Recall_RF     F1_RF
0   positive    [[4061, 778], [1136, 506]]      0.394081   0.308161  0.345865
1   negative    [[3834, 670], [843, 1134]]      0.628603   0.573596  0.599841
2    neutral  [[2584, 1035], [1249, 1613]]      0.609139   0.563592  0.585481


CountVectorizer gave slightly better results for RandomForestClassifier.

In [17]:
#Run model with CountVectorizer and get AUC scores

conmatrxRF = []
precisionRF = []
recallRF = []
f1RF = []
aucRF = []

clf_RF = RandomForestClassifier(random_state=42)

n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]

# Create hyperparameter options
hyperparameters = dict(n_estimators=n_estimators)


# Create grid search using 5-fold cross validation
clf_tune = GridSearchCV(clf_RF, hyperparameters, cv=5, scoring='recall')

for category in categories:
    # Fit grid search
    best_model = clf_tune.fit(Xv, df_train[category]) 
    RFestimators = best_model.best_estimator_.get_params()['n_estimators']
                
    clf_tunedRF = RandomForestClassifier(random_state=42, n_estimators=RFestimators)
    
    y_train_pred = cross_val_predict(clf_tunedRF, Xv, df_train[category], cv=5)

    
    cmRF = confusion_matrix(df_train[category], y_train_pred)
    conmatrxRF.append(cmRF)
    precRF = precision_score(df_train[category], y_train_pred)    
    precisionRF.append(precRF)
    recRF = recall_score(df_train[category], y_train_pred)
    recallRF.append(recRF)
    fRF = f1_score(df_train[category], y_train_pred)
    f1RF.append(fRF)
    
    #Make training and testing data.
    y = df_train[category].values.astype(np.int)
    
    X_train, X_test, y_train, y_test = train_test_split(Xv, y, test_size=0.3)
    
    #Fit the tuned classifier to the training data.
    clf_tunedRF.fit(X_train, y_train)

    #Calculate AUC score with the test set
    y_pred = clf_tunedRF.predict(X_test)
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    auc_rf = roc_auc_score(y_test, y_pred)
    aucRF.append(auc_rf)

    
dictRF = {'Categories':categories, 'Confusion Matrix':conmatrxRF, 'Precision_RF':precisionRF, 
           'Recall_RF':recallRF, 'F1_RF':f1RF, 'auc_RF':aucRF}
df_RF = pd.DataFrame(dictRF) 
#df_RF.to_csv('Results_RF_HBR.csv')

df_RF

Unnamed: 0,Categories,Confusion Matrix,Precision_RF,Recall_RF,F1_RF,auc_RF
0,positive,"[[4058, 781], [1151, 491]]",0.386006,0.299026,0.336994,0.570422
1,negative,"[[3801, 703], [841, 1136]]",0.617727,0.574608,0.595388,0.713677
2,neutral,"[[2475, 1144], [1136, 1726]]",0.601394,0.603075,0.602233,0.629962


## Tune & Train Hyperparameters for Gradient Boosting

In [18]:
i=1

vectors = [Xv, Xt]
for vect in vectors:

    conmatrxGRD = []
    precisionGRD = []
    recallGRD = []
    f1GRD = []

    clf_GRD = GradientBoostingClassifier(random_state=42)

    loss=['deviance', 'exponential']
    n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]


    # Create hyperparameter options
    hyperparameters = dict(loss=loss, n_estimators=n_estimators)


    # Create grid search using 5-fold cross validation
    clf_tune = GridSearchCV(clf_GRD, hyperparameters, cv=5, scoring='recall')

    for category in categories:

        # Fit grid search
        best_model = clf_tune.fit(vect, df_train[category]) 
        GRDloss = best_model.best_estimator_.get_params()['loss']
        GRDestimators = best_model.best_estimator_.get_params()['n_estimators']
                 
 
        clf_tunedGRD= GradientBoostingClassifier(random_state=42, loss=GRDloss, n_estimators=GRDestimators)
    
        y_train_pred = cross_val_predict(clf_tunedGRD, vect, df_train[category], cv=5)
    
        cmGRD = confusion_matrix(df_train[category], y_train_pred)
        conmatrxGRD.append(cmGRD)
        precGRD = precision_score(df_train[category], y_train_pred)    
        precisionGRD.append(precGRD)
        recGRD = recall_score(df_train[category], y_train_pred)
        recallGRD.append(recGRD)
        fGRD = f1_score(df_train[category], y_train_pred)
        f1GRD.append(fGRD)

    
    dictGRD = {'Categories':categories, 'Confusion Matrix':conmatrxGRD, 'Precision_GRD':precisionGRD, 
           'Recall_GRD':recallGRD, 'F1_GRD':f1GRD}
    df_GRD = pd.DataFrame(dictGRD) 
    #df_GRDt.to_csv('Results_GRD.csv')

    if i == 1:
        print('CountVectorizer\n')
        i += 1
    else:
        print('\nTfidfVectorizer')
    
    print(df_GRD)
    

CountVectorizer

  Categories             Confusion Matrix  Precision_GRD  Recall_GRD    F1_GRD
0   positive   [[4559, 280], [1329, 313]]       0.527825    0.190621  0.280089
1   negative   [[4088, 416], [1018, 959]]       0.697455    0.485078  0.572196
2    neutral  [[2321, 1298], [941, 1921]]       0.596769    0.671209  0.631804

TfidfVectorizer
  Categories             Confusion Matrix  Precision_GRD  Recall_GRD    F1_GRD
0   positive   [[4610, 229], [1328, 314]]       0.578269    0.191230  0.287414
1   negative   [[4049, 455], [1000, 977]]       0.682263    0.494183  0.573189
2    neutral  [[2246, 1373], [923, 1939]]       0.585447    0.677498  0.628118


The best recall is with TfidfVectorizer.

In [19]:
conmatrxGRD = []
precisionGRD = []
recallGRD = []
f1GRD = []
aucGRD = []

clf_GRD = GradientBoostingClassifier(random_state=42)

loss=['deviance', 'exponential']
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]


# Create hyperparameter options
hyperparameters = dict(loss=loss, n_estimators=n_estimators)


# Create grid search using 5-fold cross validation
clf_tune = GridSearchCV(clf_GRD, hyperparameters, cv=5, scoring='recall')

for category in categories:

    # Fit grid search
    best_model = clf_tune.fit(Xt, df_train[category]) 
    GRDloss = best_model.best_estimator_.get_params()['loss']
    GRDestimators = best_model.best_estimator_.get_params()['n_estimators']
                 
 
    clf_tunedGRD= GradientBoostingClassifier(random_state=42, loss=GRDloss, n_estimators=GRDestimators)
    
    y_train_pred = cross_val_predict(clf_tunedGRD, Xt, df_train[category], cv=5)
    
    cmGRD = confusion_matrix(df_train[category], y_train_pred)
    conmatrxGRD.append(cmGRD)
    precGRD = precision_score(df_train[category], y_train_pred)    
    precisionGRD.append(precGRD)
    recGRD = recall_score(df_train[category], y_train_pred)
    recallGRD.append(recGRD)
    fGRD = f1_score(df_train[category], y_train_pred)
    f1GRD.append(fGRD)

    
    #Make training and testing data.
    y = df_train[category].values.astype(np.int)
    
    X_train, X_test, y_train, y_test = train_test_split(Xt, y, test_size=0.3)
    
    #Fit the tuned classifier to the training data.
    clf_tunedGRD.fit(X_train, y_train)

    #Calculate AUC score with the test set
    y_pred = clf_tunedGRD.predict(X_test)
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    auc_grd = roc_auc_score(y_test, y_pred)
    aucGRD.append(auc_grd)

    
dictGRD = {'Categories':categories, 'Confusion Matrix':conmatrxGRD, 'Precision_GRD':precisionGRD, 
           'Recall_GRD':recallGRD, 'F1_GRD':f1GRD, 'auc_GRD':aucGRD}
df_GRD = pd.DataFrame(dictGRD) 
#df_GRD.to_csv('Results_GRD_HBR.csv')

df_GRD

Unnamed: 0,Categories,Confusion Matrix,Precision_GRD,Recall_GRD,F1_GRD,auc_GRD
0,positive,"[[4610, 229], [1328, 314]]",0.578269,0.19123,0.287414,0.571353
1,negative,"[[4049, 455], [1000, 977]]",0.682263,0.494183,0.573189,0.699363
2,neutral,"[[2246, 1373], [923, 1939]]",0.585447,0.677498,0.628118,0.653992


In [20]:
df_prec1 = pd.DataFrame(columns = ['Categories', 'Precision_NB', 'Precision_SVC', 
                                   'Precision_LR', 'Precision_RF', 'Precision_GRD', 'Max_Precision'])
df_prec1.Categories = df_NB.Categories
df_prec1.Precision_NB = df_NB.Precision_NB
df_prec1.Precision_SVC  = df_SVC.Precision_SVC
df_prec1.Precision_LR  = df_LR.Precision_LR
df_prec1.Precision_RF = df_RF.Precision_RF
df_prec1.Precision_GRD = df_GRD.Precision_GRD


df_prec1.Max_Precision = df_prec1.max(axis=1)

#df_prec1.to_csv('/Users/valery/Documents/Springboard/Capstone_2/Pretuned_Results/Pretuned_Precision.csv')

df_prec1

Unnamed: 0,Categories,Precision_NB,Precision_SVC,Precision_LR,Precision_RF,Precision_GRD,Max_Precision
0,positive,0.447917,0.331617,0.509387,0.386006,0.578269,0.578269
1,negative,0.59465,0.479836,0.66764,0.617727,0.682263,0.682263
2,neutral,0.672228,0.510021,0.661382,0.601394,0.585447,0.672228


In [21]:
df_rec = pd.DataFrame(columns = ['Categories', 'Recall_NB', 'Recall_SVC', 
                                   'Recall_LR', 'Recall_RF', 'Recall_GRD','Max_Recall'])
df_rec.Categories = df_NB.Categories
df_rec.Recall_NB = df_NB.Recall_NB
df_rec.Recall_SVC  = df_SVC.Recall_SVC
df_rec.Recall_LR  = df_LR.Recall_LR
df_rec.Recall_RF = df_RF.Recall_RF
df_rec.Recall_GRD = df_GRD.Recall_GRD
df_rec.Max_Recall = df_rec.max(axis=1)

#df_rec.to_csv('/Users/valery/Documents/Springboard/Capstone_2/Pretuned_Results/Pretuned_Recall.csv')

df_rec

Unnamed: 0,Categories,Recall_NB,Recall_SVC,Recall_LR,Recall_RF,Recall_GRD,Max_Recall
0,positive,0.340438,0.470767,0.247868,0.299026,0.19123,0.470767
1,negative,0.730905,0.649975,0.578149,0.574608,0.494183,0.730905
2,neutral,0.436408,0.675751,0.568484,0.603075,0.677498,0.677498


In [22]:
df_f1 = pd.DataFrame(columns = ['Categories', 'F1_NB', 'F1_SVC', 
                                   'F1_LR', 'F1_RF','F1_GRD','Max_F1'])
df_f1.Categories = df_NB.Categories
df_f1.F1_NB = df_NB.F1_NB
df_f1.F1_SVC  = df_SVC.F1_SVC
df_f1.F1_LR  = df_LR.F1_LR
df_f1.F1_RF  = df_RF.F1_RF
df_f1.F1_GRD  = df_GRD.F1_GRD
df_f1.Max_F1 = df_f1.max(axis=1)

#df_f1.to_csv('/Users/valery/Documents/Springboard/Capstone_2/Pretuned_Results/Pretuned_f1.csv')

df_f1

Unnamed: 0,Categories,F1_NB,F1_SVC,F1_LR,F1_RF,F1_GRD,Max_F1
0,positive,0.386851,0.389127,0.33347,0.336994,0.287414,0.389127
1,negative,0.655775,0.552095,0.61968,0.595388,0.573189,0.655775
2,neutral,0.529237,0.581304,0.611424,0.602233,0.628118,0.628118


In [23]:
df_auc = pd.DataFrame(columns = ['Categories', 'auc_NB', 'auc_SVC', 
                                   'auc_LR', 'auc_RF','auc_GRD','Max_auc'])
df_auc.Categories = df_NB.Categories
df_auc.auc_NB = df_NB.auc_NB
df_auc.auc_SVC  = df_SVC.auc_SVC
df_auc.auc_LR  = df_LR.auc_LR
df_auc.auc_RF  = df_RF.auc_RF
df_auc.auc_GRD  = df_GRD.auc_GRD
df_auc.Max_auc = df_auc.max(axis=1)

#df_auc.to_csv('/Users/valery/Documents/Springboard/Capstone_2/Pretuned_Results/Pretuned_f1.csv')

df_auc

Unnamed: 0,Categories,auc_NB,auc_SVC,auc_LR,auc_RF,auc_GRD,Max_auc
0,positive,0.617065,0.558591,0.578982,0.570422,0.571353,0.617065
1,negative,0.750151,0.659201,0.718866,0.713677,0.699363,0.750151
2,neutral,0.63556,0.616927,0.67209,0.629962,0.653992,0.67209


The best auc scores for the positive and negative categories are with the tuned MultinomialNB. The tuned LogisticRegressionClassifier has the best auc scores for the neutral category. 

## Model Selection

I am interested in classsifying the highest number of positive labels for each category (note that here we are using 'positive' to mean a classification of '1' versus '0', not the 'positive' label for race sentiment. For this, I maximized recall, sometimes called sensitivity, which is the true positive rate. Recall is the ratio of positive instances that are correctly detected by the classification algorithm.

$Recall = \frac{TP}{TP + FN}$

This is contrasted with precision, which is the accuracy of the positive predictions.

$Precision = \frac{TP}{TP + FP}$

If we were concerned only with getting the highest accuracy possible, we would tune on precision, but the cost would be that we would predict much fewer (albeit correct) positive instances. Tuning for recall will lower the accuracy, but we will lower the risk of leaving some positive instances out. The difficulty is that there is a tradeoff between recall and precision. Increasing one will reduce the other. The F1 score gives a picture of how well the classifier performs for each. It is the harmonic mean between precision and recall. This score is high for classifiers that have similar precision and recall. 

$F1 = 2\frac{P \times R}{P+R}$

Finally, the area under the ROC curve (AUC) is a way to compare classifiers using above metrics. It is the area under the curve plotted as the True Positive Rate (Recall) against the False Positive Rate (the ratio of the negative instances ('0') that are incorrectly classified as positive ('1'). A perfect classifier will have an AUC = 1, and one that is purely random will equal 0.5. So, we are looking for the highest AUC score above 0.5. 

Since our data is highy unbalanced for each category, there will be far more negative instances than positive instances, the ROC curve isn't the best metric to use. Instead, it is better to use a Precision-Recall Curve. However, this is not easily done in Sklearn with every classifier. It depends on calculating the decision_function, which isn't available for every classifier (e.g., MultinomialNB or RandomForestClassifier).

To evaluate these classifiers I need a metric that is shared among them all. Therefore I will look at the Recall, F1 and the AUC scores. Best overall score is given to the best out of 3. In the case of a tie, it will go to best recall.


|Category|Best Recall|Best F1|Best AUC|Best Overall|
|--------|-----------|-------|--------|------------|
|Positive|LinearSVC|LinearSVC|MultinomialNB|LinearSVC|
|Negative|MultinomialNB|MultinomialNB|MultinomialNB|MultinomialNB|
|Neutral|GradientBoostingClassifier|GradientBoostingClassifier|LogisticRegression|GradientBoostingClassifier|

We want to predict the highest number of tweets in each category and are willing to accept a few incorrect classifications in order to catch all the correct classifications. These metrics attend to both precision and recall so that the overall choice of classifiers for each category will minimize false negative claims.

## Final training with AUC scores

In [24]:

clf_SVC = LinearSVC(random_state=42)

loss = ['hinge', 'squared_hinge']
    
C = [1, 10, 100, 1000, 10000]

# Create hyperparameter options
hyperparameters = dict(C=C, loss=loss)

# Create grid search using 5-fold cross validation
clf_tune = GridSearchCV(clf_SVC, hyperparameters, cv=5, scoring='recall')   

category = ['positive']

for cat in category:
    # Fit grid search
    best_model = clf_tune.fit(Xv, df_train[cat])                
    
    SVCloss = best_model.best_estimator_.get_params()['loss']
    SVC_C = best_model.best_estimator_.get_params()['C']
    
    clf_tunedSVC = LinearSVC(random_state=42, loss=SVCloss, C=SVC_C)

    y = df_train[category].values.astype(np.int)



    X_train, X_test, y_train, y_test = train_test_split(Xv, y, test_size=0.3)

    # Supervised transformation
    #clf_tunedSVC = LinearSVC(random_state=42, loss='squared_hinge', C=1000)
    clf_tunedSVC.fit(X_train, y_train)
    
    y_pred = clf_tunedSVC.predict(X_test)
    fprSVC, tprSVC, _ = roc_curve(y_test, y_pred)
    aucSVC = roc_auc_score(y_test, y_pred)
    
    print(cat)
    print(aucSVC, '\n')

positive
0.5418285278843047 



In [25]:

clf_NB = MultinomialNB()

alphas = [.00001,.0001, .001, .01, .1, 1, 5, 10, 50]

# Create hyperparameter options
hyperparameters = dict(alpha=alphas)

# Create grid search using 5-fold cross validation
clf_tune = GridSearchCV(clf_NB, hyperparameters, cv=5, scoring='recall')               

category = ['negative']

for cat in category:
    # Fit grid search
    best_model = clf_tune.fit(Xv, df_train[cat])                
    alpha = best_model.best_estimator_.get_params()['alpha']
    
    clf_tunedNB = MultinomialNB(alpha=alpha)

    y = df_train[cat].values.astype(np.int)

    X_train, X_test, y_train, y_test = train_test_split(Xv, y, test_size=0.3)

    # Supervised transformation based on gradient boosted trees
    clf_tunedNB.fit(X_train, y_train)
    
    # Prediction and predicted AUC score
    y_pred = clf_tunedNB.predict(X_test)
    fprNB, tprNB, _ = roc_curve(y_test, y_pred)
    aucNB = roc_auc_score(y_test, y_pred)
    
    print(cat)
    print(aucNB, '\n')

negative
0.7392679221283962 



In [26]:
clf_GRD = GradientBoostingClassifier(random_state=42)

loss=['deviance', 'exponential']
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]

# Create hyperparameter options
hyperparameters = dict(loss=loss, n_estimators=n_estimators)

# Create grid search using 5-fold cross validation
clf_tune = GridSearchCV(clf_GRD, hyperparameters, cv=5, scoring='recall')

category = ['neutral']

for cat in category:
    # Fit grid search
    best_model = clf_tune.fit(Xt, df_train[category]) 
    GRDloss = best_model.best_estimator_.get_params()['loss']
    GRDestimators = best_model.best_estimator_.get_params()['n_estimators']
                 
 
    clf_tunedGRD= GradientBoostingClassifier(random_state=42, loss=GRDloss, n_estimators=GRDestimators)
    
    #Make training and testing data.
    y = df_train[category].values.astype(np.int)
    
    X_train, X_test, y_train, y_test = train_test_split(Xt, y, test_size=0.3)
    
    #Fit the tuned classifier to the training data.
    clf_tunedGRD.fit(X_train, y_train)

    #Calculate AUC score with the test set
    y_pred = clf_tunedGRD.predict(X_test)
    fprGRD, tprGRD, _ = roc_curve(y_test, y_pred)
    aucGRD = roc_auc_score(y_test, y_pred)
   
    print(cat)
    print(aucGRD, '\n')

neutral
0.6475611066344344 



## Prediction