In [3]:
# import all standard libraries
from datetime import datetime, date, timedelta
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
import feather
import nltk

import warnings
# Silence every warning category for the rest of the session. This also hides
# warnings raised by the libraries used below (e.g. deprecation or convergence
# warnings), so remove this line when debugging unexpected results.
warnings.filterwarnings("ignore")

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import uniform
from statistics import mean

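Choose the dataset: run only one of the two loading cells below. Both assign to `data`, so whichever runs last wins; the train/test split further down reads the `cleaned comment` column.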

In [9]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("data/cleaned-train-balanced-sarcasm-2.csv")

In [6]:
# Alternative source file. This rebinds `data`, replacing whatever an earlier
# cell loaded into it.
# NOTE(review): the train/test split below reads data['cleaned comment'] --
# confirm this file provides that column before using it.
data = pd.read_csv("data/train-balanced-sarcasm.csv")

In [10]:
# Discard rows whose 'comment' field is missing, then preview the first rows.
data = data.dropna(subset=['comment'])
data.head(5)

Unnamed: 0,label,comment,subreddit,score,ups,downs,created_utc,parent_comment,year,month,day,hour,cleaned parent comment,cleaned comment
0,0,NC and NH.,politics,2,-1,-1,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ...",2016,10,16,23,yeah get that argument at this prefer is she l...,NC and NH
1,0,You do know west teams play against west teams...,nba,-4,-1,-1,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...,2016,11,1,0,the blazers and mavericks the wests and seed d...,you do know west teams play against west teams...
2,0,"They were underdogs earlier today, but since G...",nfl,3,3,0,2016-09-22 21:45:37,They're favored to win.,2016,9,22,21,they re favored to win,they were underdogs earlier today but since gr...
3,0,"This meme isn't funny none of the ""new york ni...",BlackPeopleTwitter,-8,-1,-1,2016-10-18 21:03:47,deadass don't kill my buzz,2016,10,18,21,deadass don kill my buzz,this meme isn funny none of the new york nigga...
4,0,I could use one of those tools.,MaddenUltimateTeam,6,-1,-1,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...,2016,12,30,17,yep can confirm saw the tool they use for that...,could use of those tools


In [11]:
# Hold out 20% of the rows for the final test; the model input is the cleaned
# comment text and the target is the 'label' column. The seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned comment'],
    data['label'],
    test_size=0.2,
    random_state=15,
)

Hyperparameter tuning (5-fold cross-validation on the training split)

In [14]:
# 5-fold splitter reused by the tuning loops below. No shuffling is requested,
# so each validation fold is a contiguous block of positions.
kf = KFold(n_splits=5)

In [15]:
# TF-IDF over unigrams and bigrams, limited to 50,000 features; terms that
# appear in fewer than 2 documents are ignored (min_df=2).
tf_idf = TfidfVectorizer(ngram_range=(1, 2), max_features=50000, min_df=2)

# Candidate values for the logistic-regression parameter C.
test_c = [0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5]

# KFold yields positional indices. Give the training split a 0..n-1 index once,
# up front, instead of rebuilding it four times inside every fold.
X_train_pos = X_train.reset_index(drop=True)
y_train_pos = y_train.reset_index(drop=True)

# Maps each candidate C to its mean accuracy across the CV folds.
tuning_dict = {}
for i in test_c:

    # Validation accuracy of each fold for the current C.
    accuracy_list = []
    print(f"c = {i}")
    for train_index, test_index in kf.split(X_train_pos):
        print("TRAIN:", train_index, "TEST:", test_index)

        # Positional selection of the training / validation rows of this fold.
        X_train_cv, X_test_cv = X_train_pos.iloc[train_index], X_train_pos.iloc[test_index]
        y_train_cv, y_test_cv = y_train_pos.iloc[train_index], y_train_pos.iloc[test_index]

        # Logistic regression with the candidate C.
        logit = LogisticRegression(C=i, n_jobs=8, solver='saga', random_state=17)

        # Fitting the pipeline re-fits the shared tf_idf on the training rows of
        # this fold only; the validation rows are merely transformed.
        tfidf_logit_pipeline = Pipeline([('tf_idf', tf_idf),
                                         ('logit', logit)])
        tfidf_logit_pipeline.fit(X_train_cv, y_train_cv)
        valid_pred_lr = tfidf_logit_pipeline.predict(X_test_cv)

        # Score once, then both record and report the same number.
        fold_accuracy = accuracy_score(y_test_cv, valid_pred_lr)
        accuracy_list.append(fold_accuracy)
        print(fold_accuracy)

    # Mean accuracy over the folds for this C.
    mean_accuracy = mean(accuracy_list)
    print(mean_accuracy)
    tuning_dict[i] = mean_accuracy


c = 0.5
TRAIN: [160858 160859 160860 ... 804285 804286 804287] TEST: [     0      1      2 ... 160855 160856 160857]


KeyboardInterrupt: 

In [None]:
# Walk the candidates in order and keep the one with the highest mean CV
# accuracy. The comparison is >=, so on a tie the later candidate wins.
best_c, max_accuracy = 0, 0
for candidate in test_c:
    score = tuning_dict[candidate]
    if score >= max_accuracy:
        best_c, max_accuracy = candidate, score

print(best_c)
print(max_accuracy)

In [None]:
# Mean CV accuracy per candidate value stored in tuning_dict.
# NOTE(review): the labels assume tuning_dict still holds the results of the
# C-tuning cell above -- confirm the cells are run in order.
fig, ax = plt.subplots()
# Materialise the dict views as lists so the x/y sequences are explicit.
ax.plot(list(tuning_dict.keys()), list(tuning_dict.values()), marker="o")
ax.set_xlabel("C")
ax.set_ylabel("Mean cross-validation accuracy")
ax.set_title("Logistic regression: accuracy vs. C");

In [None]:
# Hyperparameter tuning for multinomial naive Bayes: candidate alpha values.
test_alpha = [5, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6, 6.1, 6.2, 6.3, 6.4, 6.5]

# KFold yields positional indices. Give the training split a 0..n-1 index once,
# up front, instead of rebuilding it four times inside every fold.
X_train_pos = X_train.reset_index(drop=True)
y_train_pos = y_train.reset_index(drop=True)

# Maps each candidate alpha to its mean accuracy across the CV folds.
tuning_dict = {}
for i in test_alpha:
    # Validation accuracy of each fold for the current alpha.
    accuracy_list = []
    print(i)
    for train_index, test_index in kf.split(X_train_pos):
        print("TRAIN:", train_index, "TEST:", test_index)
        # Positional selection of the training / validation rows of this fold.
        X_train_cv, X_test_cv = X_train_pos.iloc[train_index], X_train_pos.iloc[test_index]
        y_train_cv, y_test_cv = y_train_pos.iloc[train_index], y_train_pos.iloc[test_index]

        # Multinomial naive Bayes with the candidate alpha.
        mNB = MultinomialNB(alpha=i)

        # Fitting the pipeline re-fits the shared tf_idf on the training rows of
        # this fold only; the validation rows are merely transformed.
        tfidf_mNB_pipeline = Pipeline([('tf_idf', tf_idf),
                                       ('mnb', mNB)])
        tfidf_mNB_pipeline.fit(X_train_cv, y_train_cv)
        valid_pred_mNB = tfidf_mNB_pipeline.predict(X_test_cv)

        # Score once, then both record and report the same number.
        fold_accuracy = accuracy_score(y_test_cv, valid_pred_mNB)
        accuracy_list.append(fold_accuracy)
        print(fold_accuracy)

    # Mean accuracy over the folds for this alpha.
    mean_accuracy = mean(accuracy_list)
    print(mean_accuracy)
    tuning_dict[i] = mean_accuracy

In [None]:
# Walk the candidates in order and keep the one with the highest mean CV
# accuracy. The comparison is >=, so on a tie the later candidate wins.
best_alpha, max_accuracy = 0, 0
for candidate in test_alpha:
    score = tuning_dict[candidate]
    if score >= max_accuracy:
        best_alpha, max_accuracy = candidate, score

print(best_alpha)
print(max_accuracy)

In [None]:
# Mean CV accuracy per candidate value stored in tuning_dict.
# NOTE(review): the labels assume tuning_dict still holds the results of the
# alpha-tuning cell above -- confirm the cells are run in order.
fig, ax = plt.subplots()
# Materialise the dict views as lists so the x/y sequences are explicit.
ax.plot(list(tuning_dict.keys()), list(tuning_dict.values()), marker="o")
ax.set_xlabel("alpha")
ax.set_ylabel("Mean cross-validation accuracy")
ax.set_title("Multinomial naive Bayes: accuracy vs. alpha");

In [None]:
# Random forest on TF-IDF features: fit on the full training split, then score
# on the held-out test split.

# Arguments are separated by ASCII commas; a full-width comma here is a
# SyntaxError.
ranforest = RandomForestClassifier(n_jobs=8, random_state=15, max_depth=300, n_estimators=1000)

# Fitting the pipeline re-fits the shared tf_idf on X_train.
tfidf_ranforest_pipeline = Pipeline([('tf_idf', tf_idf),
                                     ('ranforest', ranforest)])
tfidf_ranforest_pipeline.fit(X_train, y_train)
valid_pred_ranforest = tfidf_ranforest_pipeline.predict(X_test)
# The predictions are for X_test, so compare them with y_test (not with the
# labels of a leftover cross-validation fold).
print(accuracy_score(y_test, valid_pred_ranforest))