# Multi-label classification of racial tweets:

In [1]:
import pandas as pd
#Read in csv of cleaned data
df = pd.read_csv('df_tweets.csv', index_col=0)

#Create training set

df_train = df[df['tweet'].notnull()]
print(df_train.shape)
print('')
print(df_train.columns)
df_train.head()

(6481, 5)

Index(['tweetidg', 'tweet', 'positive', 'negative', 'neutral'], dtype='object')


Unnamed: 0,tweetidg,tweet,positive,negative,neutral
0,588687492888551424g,i am the type of nigga tryna get rich,0.0,0.0,1.0
1,592553981601304576g,sheblasiannn smuckers,0.0,0.0,1.0
2,717371522956984320g,proteinwisdom we even make japanese cars and e...,0.0,0.0,1.0
3,590292541125328898g,i wanted to see my nigga when i went back home...,0.0,1.0,0.0
4,592522044480225282g,wth stevo is that a nigga,0.0,0.0,1.0


In [2]:
import re
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import seaborn as sns

### Create a table to examine the distribution of tags

In [3]:

df_sparse = df_train.drop(['tweetidg', 'tweet'], axis=1)

counts = []
priors = []
categories = list(df_sparse.columns.values)
for i in categories:
    numarts = df_sparse[i].sum()
    counts.append((i, numarts))

df_stats = pd.DataFrame(counts, columns=['Category', 'Number_of_Tweets'])
total = df_stats.Number_of_Tweets.sum()

df_stats['priors'] = df_stats['Number_of_Tweets']/total

print(df_stats)
print('')
print('Total number of labels:  ', total)

   Category  Number_of_Tweets    priors
0  positive            1642.0  0.253356
1  negative            1977.0  0.305046
2   neutral            2862.0  0.441599

Total number of labels:   6481.0


## Multi-label classification

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
df_train.columns

Index(['tweetidg', 'tweet', 'positive', 'negative', 'neutral'], dtype='object')

In [6]:
#Make sure all entries in df_matrix['text'] are strings
df_train['tweet'] = df_train['tweet'].astype(str)


from sklearn.feature_extraction.text import CountVectorizer

#Function for positive labels
def make_x(df, vectorizer=None):
    if vectorizer is None:
        vectorizer = CountVectorizer(stop_words='english',binary=False,max_df=50, ngram_range=(1, 1),
                                     min_df=1,strip_accents='unicode',max_features=200)
    X = vectorizer.fit_transform(df_train.tweet)
    
    #X = X.tocsc()  # some versions of sklearn return COO format
    return type(X)
#Call the function: X
make_x(df_train)



#Function for negative labels
def make_xy_neg(df, vectorizer=None):
    if vectorizer is None:
        vectorizer = CountVectorizer(stop_words='english',binary=False,max_df=25,
                                     min_df=1,strip_accents='unicode',max_features=100)
    X = vectorizer.fit_transform(df_train.tweet)
    X = X.tocsc()  # some versions of sklearn return COO format
    y = df_train['negative'].values.astype(np.int)
    return X, y



#Function for neutral labels
def make_xy_neut(df, vectorizer=None):
    if vectorizer is None:
        vectorizer = CountVectorizer(stop_words='english',binary=False,max_df=25,
                                     min_df=1,strip_accents='unicode',max_features=100)
    X = vectorizer.fit_transform(df_train.tweet)
    X = X.tocsc()  # some versions of sklearn return COO format
    y = df_train['neutral'].values.astype(np.int)
    return X


In [7]:
#Vectorize the data
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = CountVectorizer(stop_words='english',binary=False,max_df=50, ngram_range=(1, 1),
                                     min_df=1,strip_accents='unicode',max_features=200)

featurizer = TfidfVectorizer(stop_words='english',binary=False,max_df=50,min_df=1)
X = featurizer.fit_transform(df_train.tweet)

## Multinomial Naive Bayes

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

clf_NB = MultinomialNB()

#Positive scores
yp = df_train['positive'].values.astype(np.int)
X_trainp, X_testp, y_trainp, y_testp = train_test_split(X, yp, test_size=0.30, random_state=42)

clf_NB.fit(X_trainp,y_trainp)

train_accuracy_pos =clf_NB.score(X_trainp,y_trainp)
test_accuracy_pos = clf_NB.score(X_testp, y_testp)
print("Positive - training set accuracy: ", train_accuracy_pos)
print("Positive - test set accuracy: ", test_accuracy_pos, '\n')
        
#Negative scores
yn = df_train['negative'].values.astype(np.int)
X_trainn, X_testn, y_trainn, y_testn = train_test_split(X, yn, test_size=0.30, random_state=42)

clf_NB.fit(X_trainn,y_trainn)

train_accuracy_neg = clf_NB.score(X_trainn, y_trainn)
test_accuracy_neg = clf_NB.score(X_testn, y_testn)
print("Negative - training set accuracy: ", train_accuracy_neg)
print("Negative - test set accuracy: ", test_accuracy_neg, '\n')
        
#Neutral scores
yne = df_train['neutral'].values.astype(np.int)

X_train, X_test, y_train, y_test = train_test_split(X, yne, test_size=0.30, random_state=42)

clf_NB.fit(X_train,y_train)

train_accuracy_neut = clf_NB.score(X_train, y_train)
test_accuracy_neut = clf_NB.score(X_test, y_test)
print("Neutral - training set accuracy: ", train_accuracy_neut)
print("Neutral - test set accuracy: ", test_accuracy_neut)   

Positive - training set accuracy:  0.8115079365079365
Positive - test set accuracy:  0.744987146529563 

Negative - training set accuracy:  0.8968253968253969
Negative - test set accuracy:  0.7491002570694087 

Neutral - training set accuracy:  0.9126984126984127
Neutral - test set accuracy:  0.6519280205655527


This is looking good! But accuracy can be deceiving. High accuracy scores can still happen if your model detects many true negatives, but no true positives! Let's check:

In [9]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score


#Prediciton with NB
conmatrxNB = []
precisionNB = []
recallNB = []
f1NB = []


for category in categories:

    y_train_pred = cross_val_predict(clf_NB, X, df_train[category], cv=5)
    
    cmNB = confusion_matrix(df_train[category], y_train_pred)
    conmatrxNB.append(cmNB)
    precNB = precision_score(df_train[category], y_train_pred)    
    precisionNB.append(precNB)
    recNB = recall_score(df_train[category], y_train_pred)
    recallNB.append(recNB)
    fNB = f1_score(df_train[category], y_train_pred)
    f1NB.append(fNB)

    
dictNB = {'Categories':categories, 'Confusion Matrix':conmatrxNB, 'Precision_NB':precisionNB, 'Recall_NB':recallNB, 'F1_NB':f1NB}
df_NB = pd.DataFrame(dictNB) 
#df_NB.to_csv('Results_NB_tweets.csv')

df_NB_tweets = df_NB[df_NB['Precision_NB'] >= 0]
 
df_NB_tweets

Unnamed: 0,Categories,Confusion Matrix,Precision_NB,Recall_NB,F1_NB
0,positive,"[[4822, 17], [1599, 43]]",0.716667,0.026188,0.050529
1,negative,"[[4314, 190], [1384, 593]]",0.757344,0.299949,0.42971
2,neutral,"[[3207, 412], [1886, 976]]",0.70317,0.34102,0.459294


As I expected, high accuracy doesn't always mean high precision or recall. We can see that these are lower. I will run the other models first, then tune parameters on the one showing the most promise for this data. Note that for the neutral label, accuracy was lower than precision. For this study, we want to decrease the probability of a Type 2 error - a false positive where we say there is an effect but there is not. Precision is the best score to tune because it is a measure of how sure we are that a label is a true label (true positive) if our algorithm classifies it as such. After running the other two classifiers, I will tune the hyper-parameters for the best performing classifier at these default settings. 

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

clf_SVC = OneVsRestClassifier(LinearSVC(), n_jobs=1)

#Positive scores
yp = df_train['positive'].values.astype(np.int)
X_trainp, X_testp, y_trainp, y_testp = train_test_split(X, yp, test_size=0.30, random_state=42)

clf_SVC.fit(X_trainp,y_trainp)

train_accuracy_pos =clf_SVC.score(X_trainp,y_trainp)
test_accuracy_pos = clf_SVC.score(X_testp, y_testp)
print("Positive - training set accuracy: ", train_accuracy_pos)
print("Positive - test set accuracy: ", test_accuracy_pos, '\n')
        
#Negative scores
yn = df_train['negative'].values.astype(np.int)
X_trainn, X_testn, y_trainn, y_testn = train_test_split(X, yn, test_size=0.30, random_state=42)

clf_SVC.fit(X_trainn,y_trainn)

train_accuracy_neg = clf_SVC.score(X_trainn, y_trainn)
test_accuracy_neg = clf_SVC.score(X_testn, y_testn)
print("Negative - training set accuracy: ", train_accuracy_neg)
print("Negative - test set accuracy: ", test_accuracy_neg, '\n')
        
#Neutral scores
yne = df_train['neutral'].values.astype(np.int)

X_train, X_test, y_train, y_test = train_test_split(X, yne, test_size=0.30, random_state=42)

clf_SVC.fit(X_train,y_train)

train_accuracy_neut = clf_SVC.score(X_train, y_train)
test_accuracy_neut = clf_SVC.score(X_test, y_test)
print("Neutral - training set accuracy: ", train_accuracy_neut)
print("Neutral - test set accuracy: ", test_accuracy_neut)   

## LinearSVC

In [10]:
clf_SVC = LinearSVC()


#Prediciton with NB
conmatrxSVC = []
precisionSVC = []
recallSVC = []
f1SVC = []



for category in categories:

    y_train_pred = cross_val_predict(clf_SVC, X, df_train[category], cv=5)
    
    cmSVC = confusion_matrix(df_train[category], y_train_pred)
    conmatrxSVC.append(cmSVC)
    precSVC = precision_score(df_train[category], y_train_pred)    
    precisionSVC.append(precSVC)
    recSVC = recall_score(df_train[category], y_train_pred)
    recallSVC.append(recSVC)
    fSVC = f1_score(df_train[category], y_train_pred)
    f1SVC.append(fSVC)

    
dictSVC = {'Categories':categories, 'Confusion Matrix':conmatrxSVC, 'Precision_SVC':precisionSVC, 
           'Recall_SVC':recallSVC, 'F1_SVC':f1SVC}
df_SVC = pd.DataFrame(dictSVC) 
#df_NB.to_csv('Results_NB_tweets.csv')

df_SVC_tweets = df_SVC[df_SVC['Precision_SVC'] >= 0]
 
df_SVC_tweets

Unnamed: 0,Categories,Confusion Matrix,Precision_SVC,Recall_SVC,F1_SVC
0,positive,"[[4523, 316], [1313, 329]]",0.510078,0.200365,0.287713
1,negative,"[[4045, 459], [1050, 927]]",0.668831,0.468892,0.551293
2,neutral,"[[2801, 818], [1603, 1259]]",0.606163,0.439902,0.50982


## Logistic Regression

In [11]:
clf_LR = LogisticRegression()

#vectorizer = CountVectorizer(stop_words='english',binary=False,max_df=50, ngram_range=(1, 1),
                                     #min_df=1,strip_accents='unicode',max_features=200)
#featurizer = TfidfVectorizer(stop_words='english',binary=False,max_df=50,min_df=1)
#X = vectorizer.fit_transform(df_train.tweet)
import warnings
warnings.filterwarnings("ignore")

#Prediction with NB
conmatrxLR = []
precisionLR = []
recallLR = []
f1LR = []



for category in categories:

    y_train_pred = cross_val_predict(clf_LR, X, df_train[category], cv=5)
    
    cmLR = confusion_matrix(df_train[category], y_train_pred)
    conmatrxLR.append(cmLR)
    precLR = precision_score(df_train[category], y_train_pred)    
    precisionLR.append(precLR)
    recLR = recall_score(df_train[category], y_train_pred)
    recallLR.append(recLR)
    fLR = f1_score(df_train[category], y_train_pred)
    f1LR.append(fLR)

    
dictLR = {'Categories':categories, 'Confusion Matrix':conmatrxLR, 'Precision_LR':precisionLR,'Recall_LR':recallLR, 'F1_LR':f1LR}
df_LR = pd.DataFrame(dictLR) 
#df_NB.to_csv('Results_NB_tweets.csv')

df_LR_tweets = df_LR[df_LR['Precision_LR'] >= 0]
 
df_LR_tweets

Unnamed: 0,Categories,Confusion Matrix,Precision_LR,Recall_LR,F1_LR
0,positive,"[[4827, 12], [1585, 57]]",0.826087,0.034714,0.066628
1,negative,"[[4369, 135], [1482, 495]]",0.785714,0.250379,0.379747
2,neutral,"[[3226, 393], [1992, 870]]",0.688836,0.303983,0.421818


## Random forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
clf_RF = RandomForestClassifier()


#Prediciton with NB
conmatrxRF = []
precisionRF = []
recallRF = []
f1RF = []



for category in categories:

    y_train_pred = cross_val_predict(clf_RF, X, df_train[category], cv=5)
    
    cmRF = confusion_matrix(df_train[category], y_train_pred)
    conmatrxRF.append(cmRF)
    precRF = precision_score(df_train[category], y_train_pred)    
    precisionRF.append(precRF)
    recRF = recall_score(df_train[category], y_train_pred)
    recallRF.append(recRF)
    fRF = f1_score(df_train[category], y_train_pred)
    f1RF.append(fRF)

    
dictRF = {'Categories':categories, 'Confusion Matrix':conmatrxRF, 'Precision_RF':precisionRF, 
           'Recall_RF':recallRF, 'F1_RF':f1RF}
df_RF = pd.DataFrame(dictRF) 
#df_NB.to_csv('Results_NB_tweets.csv')

df_RF_tweets = df_RF[df_RF['Precision_RF'] >= 0]
 
df_RF_tweets

Unnamed: 0,Categories,Confusion Matrix,Precision_RF,Recall_RF,F1_RF
0,positive,"[[4528, 311], [1368, 274]]",0.468376,0.16687,0.246071
1,negative,"[[4058, 446], [1175, 802]]",0.642628,0.405665,0.497364
2,neutral,"[[2993, 626], [1858, 1004]]",0.615951,0.350804,0.447017


In [13]:
df_cat = df_NB.Categories
df_prec_NB  =df_NB.Precision_NB
df_prec_SVC  = df_SVC.Precision_SVC
df_prec_LR  = df_LR.Precision_LR
df_prec_RF  = df_RF.Precision_RF

df_prec1 = pd.DataFrame(columns = ['Categories', 'Precision_NB', 'Precision_SVC', 
                                   'Precision_LR', 'Precision_RF','Max_Precision'])
df_prec1.Categories = df_NB.Categories
df_prec1.Precision_NB = df_NB.Precision_NB
df_prec1.Precision_SVC  = df_SVC.Precision_SVC
df_prec1.Precision_LR  = df_LR.Precision_LR
df_prec1.Precision_RF = df_RF.Precision_RF
df_prec1.Max_Precision = df_prec1.max(axis=1)
df_prec1

Unnamed: 0,Categories,Precision_NB,Precision_SVC,Precision_LR,Precision_RF,Max_Precision
0,positive,0.716667,0.510078,0.826087,0.468376,0.826087
1,negative,0.757344,0.668831,0.785714,0.642628,0.785714
2,neutral,0.70317,0.606163,0.688836,0.615951,0.70317


In [14]:
df_cat = df_NB.Categories
df_rec_NB  =df_NB.Recall_NB
df_rec_SVC  = df_SVC.Recall_SVC
df_rec_LR  = df_LR.Recall_LR
df_rec_RF  = df_RF.Recall_RF

df_rec = pd.DataFrame(columns = ['Categories', 'Recall_NB', 'Recall_SVC', 
                                   'Recall_LR', 'Recall_RF','Max_Recall'])
df_rec.Categories = df_NB.Categories
df_rec.Recall_NB = df_NB.Recall_NB
df_rec.Recall_SVC  = df_SVC.Recall_SVC
df_rec.Recall_LR  = df_LR.Recall_LR
df_rec.Recall_RF = df_RF.Recall_RF
df_rec.Max_Recall = df_rec.max(axis=1)
df_rec

Unnamed: 0,Categories,Recall_NB,Recall_SVC,Recall_LR,Recall_RF,Max_Recall
0,positive,0.026188,0.200365,0.034714,0.16687,0.200365
1,negative,0.299949,0.468892,0.250379,0.405665,0.468892
2,neutral,0.34102,0.439902,0.303983,0.350804,0.439902


In [15]:
df_cat = df_NB.Categories
df_f1_NB  =df_NB.F1_NB
df_f1_SVC  = df_SVC.F1_SVC
df_f1_LR  = df_LR.F1_LR
df_f1_RF  = df_RF.F1_RF

df_f1 = pd.DataFrame(columns = ['Categories', 'F1_NB', 'F1_SVC', 
                                   'F1_LR', 'F1_RF','Max_F1'])
df_f1.Categories = df_NB.Categories
df_f1.F1_NB = df_NB.F1_NB
df_f1.F1_SVC  = df_SVC.F1_SVC
df_f1.F1_LR  = df_LR.F1_LR
df_f1.F1_RF  = df_RF.F1_RF
df_f1.Max_F1 = df_f1.max(axis=1)
df_f1

Unnamed: 0,Categories,F1_NB,F1_SVC,F1_LR,F1_RF,Max_F1
0,positive,0.050529,0.287713,0.066628,0.246071,0.287713
1,negative,0.42971,0.551293,0.379747,0.497364,0.551293
2,neutral,0.459294,0.50982,0.421818,0.447017,0.50982


## Tune hyperparameters for SVC

In [16]:
from sklearn.model_selection import GridSearchCV

featurizer = TfidfVectorizer(stop_words='english',binary=False,max_df=50,min_df=1)
X = featurizer.fit_transform(df_train.tweet)

clf_SVC = LinearSVC(random_state=42)


loss = ['hinge', 'squared_hinge']
#kernel = ['linear', 'rbf', 'poly']
C = [1, 10, 100, 1000, 10000]


    # Create hyperparameter options
hyperparameters = dict(C=C, loss=loss)


     # Create grid search using 5-fold cross validation
clf_tune = GridSearchCV(clf_SVC, hyperparameters, cv=5, scoring='recall')               

for category in categories:
    # Fit grid search
    best_model = clf_tune.fit(X, df_train[category])                
                
    # View best hyperparameters
    print(category)
    #print('Best kernel:', best_model.best_estimator_.get_params()['kernel'])
    print('Best loss:', best_model.best_estimator_.get_params()['loss'])
    print('Best C:', best_model.best_estimator_.get_params()['C'],'\n')         


positive
Best loss: squared_hinge
Best C: 1000 

negative
Best loss: squared_hinge
Best C: 10000 

neutral
Best loss: hinge
Best C: 10000 



## Train the tuned LinearSVC

In [29]:
featurizer = TfidfVectorizer(stop_words='english',binary=False,max_df=50,min_df=1, ngram_range=(1, 1))
X = featurizer.fit_transform(df_train.tweet)

clf_SVC = LinearSVC(random_state=42)


#Prediciton with NB
conmatrxSVC = []
precisionSVC = []
recallSVC = []
f1SVC = []


for category in categories:
    if category == 'positive':
        clf_tunedSVC = LinearSVC(random_state=42, loss='squared_hinge', C=1000)
    if category == 'negative':
        clf_tunedSVC = LinearSVC(random_state=42, loss='squared_hinge', C=10000)
    if category == 'neutral':
        clf_tunedSVC = LinearSVC(random_state=42, loss='hinge', C=10000)
    
    y_train_pred = cross_val_predict(clf_tunedSVC, X, df_train[category], cv=5)

    
    cmSVC = confusion_matrix(df_train[category], y_train_pred)
    conmatrxSVC.append(cmSVC)
    precSVC = precision_score(df_train[category], y_train_pred)    
    precisionSVC.append(precSVC)
    recSVC = recall_score(df_train[category], y_train_pred)
    recallSVC.append(recSVC)
    fSVC = f1_score(df_train[category], y_train_pred)
    f1SVC.append(fSVC)

    
dictSVC = {'Categories':categories, 'Confusion Matrix':conmatrxSVC, 'Precision_SVC':precisionSVC, 
           'Recall_SVC':recallSVC, 'F1_SVC':f1SVC}
df_SVC = pd.DataFrame(dictSVC) 
#df_NB.to_csv('Results_NB_tweets.csv')

df_tunedSVC_tweets = df_SVC[df_SVC['Precision_SVC'] >= 0]
 
df_tunedSVC_tweets

Unnamed: 0,Categories,Confusion Matrix,Precision_SVC,Recall_SVC,F1_SVC
0,positive,"[[3984, 855], [1113, 529]]",0.382225,0.322168,0.349636
1,negative,"[[3526, 978], [880, 1097]]",0.528675,0.554881,0.541461
2,neutral,"[[2263, 1356], [1297, 1565]]",0.535775,0.54682,0.541242


## tune hyperparameters for RF

In [19]:
vectorizer = CountVectorizer(stop_words='english',binary=False,max_df=50, ngram_range=(1, 1),
                                     min_df=1,strip_accents='unicode',max_features=200)

X = vectorizer.fit_transform(df_train.tweet)


clf_RF2 = RandomForestClassifier(random_state=42)


n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]
#max_depth = [1, 32, 32]

    # Create hyperparameter options
hyperparameters = dict(n_estimators=n_estimators)


     # Create grid search using 5-fold cross validation
clf_tune = GridSearchCV(clf_RF2, hyperparameters, cv=5, scoring='recall')               

for category in categories:
    # Fit grid search
    best_model = clf_tune.fit(X, df_train[category])                
                
    # View best hyperparameters
    print(category)
    #print('Best max_depth:', best_model.best_estimator_.get_params()['max_depth'])
    print('Best n_estimators:', best_model.best_estimator_.get_params()['n_estimators'],'\n')
            


positive
Best n_estimators: 16 

negative
Best n_estimators: 100 

neutral
Best n_estimators: 200 



## Train the tuned RandomForest

In [24]:
vectorizer = CountVectorizer(stop_words='english',binary=False,max_df=50, ngram_range=(1, 1),
                                     min_df=1,strip_accents='unicode',max_features=200)

X = vectorizer.fit_transform(df_train.tweet)


#Prediciton with NB
conmatrxRF = []
precisionRF = []
recallRF = []
f1RF = []


for category in categories:
    if category == 'positive':
        clf_tunedRF = RandomForestClassifier(random_state=42, n_estimators=16)
    if category == 'negative':
        clf_tunedRF = RandomForestClassifier(random_state=42, n_estimators=100)
    if category == 'neutral':
        clf_tunedRF= RandomForestClassifier(random_state=42, n_estimators=200)
    
    y_train_pred = cross_val_predict(clf_tunedRF, X, df_train[category], cv=5)

    
    cmRF = confusion_matrix(df_train[category], y_train_pred)
    conmatrxRF.append(cmRF)
    precRF = precision_score(df_train[category], y_train_pred)    
    precisionRF.append(precRF)
    recRF = recall_score(df_train[category], y_train_pred)
    recallRF.append(recRF)
    fRF = f1_score(df_train[category], y_train_pred)
    f1RF.append(fRF)

    
dictRF = {'Categories':categories, 'Confusion Matrix':conmatrxRF, 'Precision_RF':precisionRF, 
           'Recall_RF':recallRF, 'F1_RF':f1RF}
df_RF = pd.DataFrame(dictRF) 
#df_NB.to_csv('Results_NB_tweets.csv')

df_tunedRF_tweets = df_RF[df_RF['Precision_RF'] >= 0]
 
df_tunedRF_tweets

Unnamed: 0,Categories,Confusion Matrix,Precision_RF,Recall_RF,F1_RF
0,positive,"[[4458, 381], [1389, 253]]",0.399054,0.15408,0.22232
1,negative,"[[4005, 499], [1212, 765]]",0.605222,0.38695,0.472077
2,neutral,"[[3168, 451], [2054, 808]]",0.641779,0.28232,0.392138


## Tune hyperparameters for LR

The ‘newton-cg’, ‘sag’ and ‘lbfgs’ solvers support only l2 penalties. I will first leave out the solver and check for the best penalty. The I will choose the solvers that best run with that penalty.



In [25]:
from sklearn.model_selection import GridSearchCV

#silence warnings about default solver changing to 'lbfgs'
import warnings
warnings.filterwarnings("ignore")

vectorizer = CountVectorizer(stop_words='english',binary=False,max_df=50, ngram_range=(1, 1),
                                     min_df=1,strip_accents='unicode',max_features=200)

X = vectorizer.fit_transform(df_train.tweet)



clf_LR2 = LogisticRegression(random_state=42)

#The ‘newton-cg’, ‘sag’ and ‘lbfgs’ solvers support only l2 penalties.

    # Create regularization penalty space
penalty = ['l1', 'l2']

    # Create regularization hyperparameter space
C = np.logspace(-4, 4, 20)

#solver = ['liblinear', 'sag', 'newton-cg', 'saga', 'lbfgs']


    # Create hyperparameter options
hyperparameters = dict(penalty = penalty, C=C)


     # Create grid search using 5-fold cross validation
clf_tune = GridSearchCV(clf_LR2, hyperparameters, cv=5, verbose=0, scoring='f1')               

for category in categories:
    # Fit grid search
    best_model = clf_tune.fit(X, df_train[category])                
                
    # View best hyperparameters
    print(category)
    print('Best penalty:', best_model.best_estimator_.get_params()['penalty'])
    print('Best C:', best_model.best_estimator_.get_params()['C'],'\n')         


positive
Best penalty: l2
Best C: 206.913808111479 

negative
Best penalty: l1
Best C: 206.913808111479 

neutral
Best penalty: l2
Best C: 11.288378916846883 



In [26]:


vectorizer = CountVectorizer(stop_words='english',binary=False,max_df=50, ngram_range=(1, 1),
                                     min_df=1,strip_accents='unicode',max_features=200)
X = vectorizer.fit_transform(df_train.tweet)


clf_LR3 = LogisticRegression()

#The ‘newton-cg’, ‘sag’ and ‘lbfgs’ solvers support only l2 penalties.

    # Create regularization penalty space
penalty = ['l2']

    # Create regularization hyperparameter space
C = np.logspace(-4, 4, 20)

solver = ['liblinear', 'sag', 'newton-cg', 'saga', 'lbfgs']


    # Create hyperparameter options
hyperparameters = dict(solver = solver, C=C)


     # Create grid search using 5-fold cross validation
clf_tuneLR = GridSearchCV(clf_LR3, hyperparameters, cv=5, verbose=0, scoring='f1')               

for category in categories:
    # Fit grid search
    best_model = clf_tuneLR.fit(X, df_train[category])                
                
    # View best hyperparameters
    print(category)
    print('Best solver:', best_model.best_estimator_.get_params()['solver'])
    print('Best C:', best_model.best_estimator_.get_params()['C'],'\n')         


positive
Best solver: sag
Best C: 545.5594781168514 

negative
Best solver: lbfgs
Best C: 545.5594781168514 

neutral
Best solver: saga
Best C: 11.288378916846883 



OK, we have a different combination for each label. I will run each separately and examine the differences before predicting on the rately and examine the differences before predicting on the unlabeled data.

In [28]:
#hyperpos = ['C=0.23357214690901212']
#hyperneg = ['penalty=l2, C=0.08858667904100823']
#hyperneut = ["penalty=l2, C=0.03359818286283781, solver='sag'"]




#Prediciton with NB
conmatrxLR = []
precisionLR = []
recallLR = []
f1LR = []


for category in categories:
    if category == 'positive':
        clf_tuned = LogisticRegression(penalty='l2',C=545.5594781168514,  solver='sag')
    if category == 'negative':
        clf_tuned = LogisticRegression(penalty='l1', C=206.913808111479, solver='liblinear')
    if category == 'neutral':
        clf_tuned = LogisticRegression(penalty='l2', C=11.288378916846883, solver='saga')
    
    y_train_pred = cross_val_predict(clf_tuned, X, df_train[category], cv=5)
    
    cmLR = confusion_matrix(df_train[category], y_train_pred)
    conmatrxLR.append(cmLR)
    precLR = precision_score(df_train[category], y_train_pred)    
    precisionLR.append(precLR)
    recLR = recall_score(df_train[category], y_train_pred)
    recallLR.append(recLR)
    fLR = f1_score(df_train[category], y_train_pred)
    f1LR.append(fLR)

dictLR = {'Categories':categories, 'Confusion Matrix':conmatrxLR, 'Precision_LR':precisionLR, 
           'Recall':recallLR, 'F1':f1LR}
df_LR = pd.DataFrame(dictLR) 
#df_NB.to_csv('Results_NB_tweets.csv')

df_tunedLR_tweets = df_LR[df_LR['Precision_LR'] >= 0]
 

df_tunedLR_tweets


Unnamed: 0,Categories,Confusion Matrix,Precision_LR,Recall,F1
0,positive,"[[4546, 293], [1423, 219]]",0.427734,0.133374,0.203343
1,negative,"[[4154, 350], [1307, 670]]",0.656863,0.338897,0.447114
2,neutral,"[[3190, 429], [2062, 800]]",0.650936,0.279525,0.391102
