In [30]:
import sys
# NOTE(review): the appended entry looks like the path of a Python 2.7
# interpreter executable rather than a directory of importable packages, and
# it is machine-specific -- confirm whether this line is still needed.
sys.path.append('/usr/local/bin/python2.7')


import numpy as np
import os, sys, getopt, pickle, csv, sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, make_scorer, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble  import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from textblob import TextBlob
import random
import matplotlib.pyplot as plt
from sklearn import metrics
from collections import Counter
import argparse
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score    
import preprocessor as p

In [31]:
# Classifier keys accepted by get_model(); train() walks this list, in this
# order, when MODEL_TYPE is "all".
models = ['svm', 'naive', 'lr', 'random_forest']
MODEL_TYPE = "all"

# Number of splits handed to KFold in classification_model().
NO_OF_FOLDS = 10

# Read by load_data() to decide whether texts go through p.tokenize;
# get_filename() overwrites it once a dataset has been chosen.
HASH_REMOVE = None

In [32]:
def load_data(filename):
    """Load a pickled dataset and split it into texts and labels.

    The unpickled object is indexed with ``0 .. len(data) - 1`` and every
    item is read through its ``'text'`` and ``'label'`` keys.  When the
    module-level ``HASH_REMOVE`` flag is truthy each text is UTF-8 encoded
    and passed through ``p.tokenize``; otherwise it is kept unchanged.

    Args:
        filename: path of the pickle file to read.

    Returns:
        A ``(x_text, labels)`` tuple of two lists of equal length.
    """
    # SECURITY: pickle.load can execute arbitrary code stored in the file;
    # only use this on dataset files from a trusted source.
    # The context manager guarantees the handle is closed, even when
    # unpickling raises (previously the file object was never closed).
    with open(filename, 'rb') as handle:
        data = pickle.load(handle)

    x_text = []
    labels = []
    # Index-based access is kept on purpose: it works for any container that
    # supports len() and integer indexing, exactly like the original loop.
    for i in range(len(data)):
        record = data[i]
        if HASH_REMOVE:
            x_text.append(p.tokenize(record['text'].encode('utf-8')))
        else:
            x_text.append(record['text'])
        labels.append(record['label'])
    return x_text, labels

def get_filename(dataset):
    """Return the pickle path for a dataset and configure module globals.

    Side effects: sets the module-level ``N_CLASS`` to the number of label
    classes of the chosen dataset and ``HASH_REMOVE`` to ``False``.

    Args:
        dataset: one of ``"twitter"``, ``"formspring"`` or ``"wiki"``.

    Returns:
        Relative path of the dataset's pickle file.

    Raises:
        ValueError: if ``dataset`` is not a known name.  (Previously this
            case fell through to ``return filename`` and surfaced as an
            opaque ``UnboundLocalError``.)
    """
    global N_CLASS, HASH_REMOVE
    # dataset name -> (pickle path, number of classes)
    known = {
        "twitter": ("data/twitter_data.pkl", 3),
        "formspring": ("data/formspring_data.pkl", 2),
        "wiki": ("data/wiki_data.pkl", 2),
    }
    if dataset not in known:
        # Fail before touching the globals so a bad name leaves state intact.
        raise ValueError(
            "Unknown dataset %r; expected one of %s"
            % (dataset, ", ".join(sorted(known))))
    filename, N_CLASS = known[dataset]
    HASH_REMOVE = False
    return filename

In [33]:
def get_scores(y_true, y_pred):
    """Compute per-class precision, recall and F1 for one set of predictions.

    Each metric is evaluated with ``average=None`` and the three results are
    stacked, in the order precision, recall, F1, into one NumPy array (one
    row per metric).
    """
    metric_fns = (precision_score, recall_score, f1_score)
    per_metric = [fn(y_true, y_pred, average=None) for fn in metric_fns]
    return np.array(per_metric)
    
def print_scores(scores):
    """Print mean and spread of per-class precision/recall/F1 across folds.

    Args:
        scores: 2-D NumPy array with one row per fold.  Each row is laid out
            as ``N_CLASS`` precision values, then ``N_CLASS`` recall values,
            then ``N_CLASS`` F1 values (the ``np.hstack`` of what
            ``get_scores`` returns).

    Class 0 is skipped; for every other class the mean over folds and twice
    the standard deviation are printed.

    Raises:
        ValueError: if ``scores`` is not 2-D with ``3 * N_CLASS`` columns.
            Without this check a mismatch between the data and the global
            ``N_CLASS`` would silently report the wrong columns.
    """
    expected_cols = 3 * N_CLASS
    if scores.ndim != 2 or scores.shape[1] != expected_cols:
        raise ValueError(
            "scores must be 2-D with %d columns (3 metrics x %d classes), got shape %r"
            % (expected_cols, N_CLASS, scores.shape))

    # (printed label, offset of the metric's first column within a row)
    metric_offsets = (("Precision", 0), ("Recall", N_CLASS), ("F1_score", 2 * N_CLASS))
    for i in range(1, N_CLASS):
        for label, offset in metric_offsets:
            column = scores[:, offset + i]
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("%s Class %d (avg): %0.3f (+/- %0.3f)"
                  % (label, i, column.mean(), column.std() * 2))


In [28]:
# Scratch check of the column slicing used by print_scores: build a 3x3 grid
# whose rows are all ["precision", "recall", "F1"] and show that slicing
# column 1 yields the same metric from every row.
# (np.hstack on an already 1-D array was a no-op, so the rows are built
# directly; single-argument print(...) works on both Python 2 and 3.)
arr = np.array([["precision", "recall", "F1"]] * 3)
print("%s" % str(arr[:, 1]))

['recall' 'recall' 'recall']


In [34]:
def classification_model(X, Y, model_type):
    """Cross-validate one classifier and print its per-class scores.

    The samples are shuffled with a fixed seed, split into ``NO_OF_FOLDS``
    folds with ``KFold``, and a fresh model from ``get_model`` is fitted and
    evaluated on every fold.  The per-fold score matrix is printed, and the
    aggregate over all folds is reported through ``print_scores``.

    Args:
        X: feature matrix supporting row selection by an index array.
        Y: sequence of labels aligned with the rows of ``X``.
        model_type: key understood by ``get_model``.

    Raises:
        ValueError: if ``get_model`` does not recognise ``model_type``
            (previously this crashed later with an ``AttributeError`` on
            ``None``).
    """
    X, Y = shuffle(X, Y, random_state=42)
    # Converted once, outside the loop, so labels can be selected by the
    # index arrays that KFold yields.
    Y = np.asarray(Y)
    # Formatted into one string so the output is identical on Python 2 and 3.
    print("Model Type: %s" % (model_type,))
    # NOTE(review): plain KFold does not stratify; with a rare class a fold
    # may presumably lack that class, which would change the width of the
    # per-fold score row -- consider StratifiedKFold and confirm.
    kf = KFold(n_splits=NO_OF_FOLDS)
    scores = []
    for train_index, test_index in kf.split(X):
        model = get_model(model_type)
        if model is None:
            raise ValueError("Unknown model type: %r" % (model_type,))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # curr_scores has one row per metric and one column per class:
        # [
        #   [precision_c0, precision_c1, ...],
        #   [recall_c0,    recall_c1,    ...],
        #   [f1_c0,        f1_c1,        ...]
        # ]
        curr_scores = get_scores(y_test, y_pred)
        print(curr_scores)
        # Flatten to a single row for this fold:
        # [precision_c0.., recall_c0.., f1_c0..]
        scores.append(np.hstack(curr_scores))
    # Final matrix: one row per fold, 3 * (number of classes) columns.
    print_scores(np.array(scores))

In [35]:
def get_model(m_type):
    """Build a new, unfitted classifier for the given key.

    Args:
        m_type: ``'lr'`` (LogisticRegression, balanced class weights),
            ``'naive'`` (MultinomialNB), ``'random_forest'``
            (RandomForestClassifier, 100 trees, all cores) or ``'svm'``
            (LinearSVC, balanced class weights).

    Returns:
        The classifier instance, or ``None`` after printing an error message
        when ``m_type`` is not one of the keys above.
    """
    if m_type == 'lr':
        return LogisticRegression(class_weight="balanced")
    if m_type == 'naive':
        return MultinomialNB()
    if m_type == "random_forest":
        # NOTE(review): no random_state is passed, so repeated runs will
        # presumably give slightly different forest scores -- confirm
        # whether a fixed seed is wanted.
        return RandomForestClassifier(n_estimators=100, n_jobs=-1)
    if m_type == "svm":
        return LinearSVC(class_weight="balanced")
    # Single-argument print(...) works on Python 2 and 3; the message typo
    # ("correst") is fixed.
    print("ERROR: Please specify a correct model")
    return None

In [36]:
def train(x_text, labels, MODEL_TYPE):
    """Vectorise the texts with TF-IDF and cross-validate the classifiers.

    Reads the module-level ``WORD`` flag to choose the vectoriser, the
    module-level ``data`` name to decide whether labels need mapping, and
    ``models`` for the list of classifiers to run.

    Args:
        x_text: sequence of raw text documents.
        labels: sequence of labels aligned with ``x_text``; for the
            ``"twitter"`` dataset these are the strings ``'racism'``,
            ``'sexism'`` and ``'none'``, which are mapped to 0, 1 and 2.
        MODEL_TYPE: a key accepted by ``get_model``, or ``"all"`` to run
            every entry of ``models``.
    """
    if(WORD):
        print("Using word based features")
        bow_transformer = CountVectorizer(analyzer="word", max_features=10000, stop_words='english')
    else:
        # NOTE(review): the message says "char n-grams", but no analyzer is
        # passed here, so CountVectorizer presumably falls back to its
        # word-level default and produces word 1-2-grams.  Behaviour is left
        # untouched; confirm whether analyzer="char" was intended.
        print("Using char n-grams based features")
        bow_transformer = CountVectorizer(max_features=10000, ngram_range=(1, 2))

    # Shared by both branches.  fit_transform yields the same matrix as
    # fit(...) followed by transform(...) without processing the corpus twice.
    # NOTE(review): vocabulary and IDF weights are fitted on the full corpus
    # before the folds are split in classification_model, so test-fold text
    # influences the features -- confirm this is acceptable.
    comments_bow = bow_transformer.fit_transform(x_text)
    features = TfidfTransformer(norm='l2').fit_transform(comments_bow)

    if(data == "twitter"):
        dict1 = {'racism': 0, 'sexism': 1, 'none': 2}
        labels = np.array([dict1[b] for b in labels])

    # Counter comes from the notebook's import cell; the redundant
    # function-level import was dropped.
    print(Counter(labels))

    if(MODEL_TYPE != "all"):
        classification_model(features, labels, MODEL_TYPE)
    else:
        for model_type in models:
            classification_model(features, labels, model_type)

In [None]:
# Formspring run with WORD disabled, so train() takes its non-word
# feature branch.
data, WORD = "formspring", False
x_text, labels = load_data(get_filename(data))
print("Data loaded!")
train(x_text, labels, MODEL_TYPE)

Data loaded!
Using char n-grams based features
Counter({0: 11997, 1: 776})
Model Type: svm
Precision Class 1 (avg): 0.466 (+/- 0.109)
Recall Class 1 (avg): 0.503 (+/- 0.122)
F1_score Class 1 (avg): 0.483 (+/- 0.104)
Model Type: naive
Precision Class 1 (avg): 0.850 (+/- 0.640)
Recall Class 1 (avg): 0.015 (+/- 0.015)
F1_score Class 1 (avg): 0.030 (+/- 0.028)
Model Type: lr
Precision Class 1 (avg): 0.410 (+/- 0.099)
Recall Class 1 (avg): 0.626 (+/- 0.131)
F1_score Class 1 (avg): 0.495 (+/- 0.104)
Model Type: random_forest
Precision Class 1 (avg): 0.735 (+/- 0.230)
Recall Class 1 (avg): 0.161 (+/- 0.084)
F1_score Class 1 (avg): 0.261 (+/- 0.122)


In [None]:
# Formspring run with WORD enabled, so train() takes its word-feature branch.
data, WORD = "formspring", True
x_text, labels = load_data(get_filename(data))
print("Data loaded!")
train(x_text, labels, MODEL_TYPE)

Data loaded!
Using word based features
Counter({0: 11997, 1: 776})
Model Type: svm
Precision Class 1 (avg): 0.415 (+/- 0.089)
Recall Class 1 (avg): 0.525 (+/- 0.132)
F1_score Class 1 (avg): 0.463 (+/- 0.100)
Model Type: naive
Precision Class 1 (avg): 0.575 (+/- 0.950)
Recall Class 1 (avg): 0.013 (+/- 0.029)
F1_score Class 1 (avg): 0.025 (+/- 0.055)
Model Type: lr
Precision Class 1 (avg): 0.407 (+/- 0.079)
Recall Class 1 (avg): 0.617 (+/- 0.127)
F1_score Class 1 (avg): 0.489 (+/- 0.084)
Model Type: random_forest
Precision Class 1 (avg): 0.695 (+/- 0.264)
Recall Class 1 (avg): 0.162 (+/- 0.067)
F1_score Class 1 (avg): 0.261 (+/- 0.098)


In [37]:
# Twitter run with WORD disabled; train() maps the string labels to integers
# because data == "twitter".
data, WORD = "twitter", False
x_text, labels = load_data(get_filename(data))
print("Data loaded!")
train(x_text, labels, MODEL_TYPE)

Data loaded!
Using char n-grams based features


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Counter({2: 11036, 1: 3117, 0: 1937})
Model Type: svm
[[0.74883721 0.79927007 0.87857143]
 [0.77777778 0.70192308 0.90275229]
 [0.76303318 0.74744027 0.89049774]]
[[0.7092511  0.78360656 0.9090065 ]
 [0.81725888 0.79139073 0.88198198]
 [0.75943396 0.78747941 0.89529035]]
[[0.72815534 0.82042254 0.90169794]
 [0.83798883 0.7327044  0.9073741 ]
 [0.77922078 0.77408638 0.90452712]]
[[0.76530612 0.79931973 0.86327078]
 [0.75376884 0.6851312  0.90534208]
 [0.75949367 0.73783359 0.88380604]]
[[0.705      0.8172043  0.89823009]
 [0.7877095  0.73786408 0.90544157]
 [0.74406332 0.7755102  0.90182141]]
[[0.74881517 0.74914089 0.87985547]
 [0.73831776 0.72666667 0.88949772]
 [0.74352941 0.73773266 0.88465032]]
[[0.69154229 0.74223602 0.90331492]
 [0.74731183 0.78618421 0.8766756 ]
 [0.71834625 0.76357827 0.88979592]]
[[0.69856459 0.78368794 0.89534884]
 [0.80662983 0.71521036 0.8945487 ]
 [0.74871795 0.74788494 0.89494859]]
[[0.73267327 0.81045752 0.89009991]
 [0.75897436 0.7654321  0.89908257]
 [

In [None]:
# Twitter run with WORD enabled; train() maps the string labels to integers
# because data == "twitter".
data, WORD = "twitter", True
x_text, labels = load_data(get_filename(data))
print("Data loaded!")
train(x_text, labels, MODEL_TYPE)

Data loaded!
Using word based features
Counter({2: 11036, 1: 3117, 0: 1937})
Model Type: svm
Precision Class 1 (avg): 0.803 (+/- 0.044)
Recall Class 1 (avg): 0.744 (+/- 0.052)
F1_score Class 1 (avg): 0.772 (+/- 0.037)
Precision Class 2 (avg): 0.893 (+/- 0.023)
Recall Class 2 (avg): 0.901 (+/- 0.018)
F1_score Class 2 (avg): 0.897 (+/- 0.009)
Model Type: naive
Precision Class 1 (avg): 0.904 (+/- 0.035)
Recall Class 1 (avg): 0.469 (+/- 0.056)
F1_score Class 1 (avg): 0.617 (+/- 0.051)
Precision Class 2 (avg): 0.806 (+/- 0.022)
Recall Class 2 (avg): 0.963 (+/- 0.007)
F1_score Class 2 (avg): 0.877 (+/- 0.011)
Model Type: lr
Precision Class 1 (avg): 0.832 (+/- 0.039)
Recall Class 1 (avg): 0.663 (+/- 0.083)
F1_score Class 1 (avg): 0.738 (+/- 0.062)
Precision Class 2 (avg): 0.875 (+/- 0.026)
Recall Class 2 (avg): 0.916 (+/- 0.012)
F1_score Class 2 (avg): 0.895 (+/- 0.012)
Model Type: random_forest
Precision Class 1 (avg): 0.875 (+/- 0.042)
Recall Class 1 (avg): 0.643 (+/- 0.082)
F1_score Class 

In [None]:
# Wiki run with WORD disabled, so train() takes its non-word feature branch.
data, WORD = "wiki", False
x_text, labels = load_data(get_filename(data))
print("Data loaded!")
train(x_text, labels, MODEL_TYPE)

Data loaded!
Using char n-grams based features
Counter({0: 102274, 1: 13590})
Model Type: svm
Precision Class 1 (avg): 0.591 (+/- 0.025)
Recall Class 1 (avg): 0.823 (+/- 0.019)
F1_score Class 1 (avg): 0.688 (+/- 0.018)
Model Type: naive
Precision Class 1 (avg): 0.839 (+/- 0.010)
Recall Class 1 (avg): 0.554 (+/- 0.028)
F1_score Class 1 (avg): 0.667 (+/- 0.021)
Model Type: lr
Precision Class 1 (avg): 0.602 (+/- 0.024)
Recall Class 1 (avg): 0.845 (+/- 0.022)
F1_score Class 1 (avg): 0.703 (+/- 0.017)
Model Type: random_forest


KeyboardInterrupt: 

In [None]:
# Wiki run with WORD enabled, so train() takes its word-feature branch.
data, WORD = "wiki", True
x_text, labels = load_data(get_filename(data))
print("Data loaded!")
train(x_text, labels, MODEL_TYPE)