In [13]:
import sys
sys.path.append('/usr/local/bin/python2.7')


import numpy as np
import os, sys, getopt, pickle, csv, sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, make_scorer, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble  import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from textblob import TextBlob
import random
import matplotlib.pyplot as plt
from sklearn import metrics
from collections import Counter
import argparse
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score    
import preprocessor as p
import pandas as pd

In [14]:
models = [ 'svm', 'naive', 'lr', 'random_forest']
NO_OF_FOLDS = 10
MODEL_TYPE = "all"
HASH_REMOVE = None
LABEL_ENCODING_TWITTER = {'racism':0,'sexism':1,'none':2}
LABEL_ENCODING_FORMSPRING = {'none':0,'bully':1}
LABEL_ENCODING_WIKI = {'none':0,'attack':1}

In [15]:
def load_data(dataset):
    data = pickle.load(open(get_filename(dataset), 'rb'))
    x_text = []
    labels = []
    for i in range(len(data)):
        if(HASH_REMOVE):
            x_text.append(p.tokenize((data[i]['text']).encode('utf-8')))
        else:
            x_text.append(data[i]['text'])
        labels.append(data[i]['label'])
    if dataset == "wiki":
        reduced_number_of_samples = int(len(x_text) * 0.5)
        x_text, labels = shuffle(x_text, labels, 
            random_state=42, 
            n_samples=reduced_number_of_samples)
        print "WARNING: Wiki data set reduced from %d to %d number of samples!" % (len(data), reduced_number_of_samples)
    return x_text,labels

def get_filename(dataset):
    global N_CLASS, HASH_REMOVE, LABEL_ENCODING
    if(dataset=="twitter"):
        filename = "data/twitter_data.pkl"
        N_CLASS = 3
        LABEL_ENCODING = LABEL_ENCODING_TWITTER
        HASH_REMOVE = False
    elif(dataset=="formspring"):
        N_CLASS = 2
        LABEL_ENCODING = LABEL_ENCODING_FORMSPRING
        filename = "data/formspring_data.pkl"
        HASH_REMOVE = False
    elif(dataset=="wiki"):
        N_CLASS = 2
        LABEL_ENCODING = LABEL_ENCODING_WIKI
        filename = "data/wiki_data.pkl"
        HASH_REMOVE = False
    return filename

In [16]:
def get_scores(y_true, y_pred):
    #dict1 = {'racism':0,'sexism':1,'none':2}
    if(data == "twitter"):
        scores = np.array([ 
                    precision_score(y_true, y_pred, average=None, labels=[2, 0, 1]), 
                    recall_score(y_true, y_pred,  average=None, labels=[2, 0, 1]),
                    f1_score(y_true, y_pred, average=None, labels=[2, 0, 1])])
    else:
        scores = np.array([ 
                    precision_score(y_true, y_pred, average=None), 
                    recall_score(y_true, y_pred,  average=None),
                    f1_score(y_true, y_pred, average=None)])
    return scores
    
def print_scores(scores):
    for i in range(N_CLASS):
        scores_class_prec = []
        scores_class_rec = []
        scores_class_f1 = []
        for foldscore in scores:
            scores_class_prec.append(foldscore[i])
            scores_class_rec.append(foldscore[i])
            scores_class_f1.append(foldscore[i])
        scores_class_prec = np.array(scores_class_prec)
        scores_class_rec = np.array(scores_class_rec)
        scores_class_f1 = np.array(scores_class_f1)
        class_string = ""
        for key, value in LABEL_ENCODING.items():
            if(value == i):
                class_string = key
        print "Precision Class %s (avg): %0.3f (+/- %0.3f)" % (class_string,scores_class_prec.mean(), scores_class_prec.std() * 2)
        print "Recall Class %s (avg): %0.3f (+/- %0.3f)" % (class_string,scores_class_rec.mean(), scores_class_rec.std() * 2)
        print "F1_score Class %s (avg): %0.3f (+/- %0.3f)" % (class_string,scores_class_f1.mean(), scores_class_f1.std() * 2)


In [17]:
arr = []
arr.append(np.hstack(np.array(["precision","recall","F1"])))
arr.append(np.hstack(np.array(["precision","recall","F1"])))
arr.append(np.hstack(np.array(["precision","recall","F1"])))
arr = np.array(arr)
print "%s" % str(arr[:, 1])

['recall' 'recall' 'recall']


In [18]:
def classification_model(X, Y, model_type, dump_enabled=False, dump_path=""):
    print "Model Type:", model_type
    kf = KFold(n_splits=NO_OF_FOLDS, random_state=42, shuffle=True)
    scores = []
    first_fold = True
    for train_index, test_index in kf.split(X):
        Y = np.asarray(Y)
        model = get_model(model_type)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        curr_scores = get_scores(y_test, y_pred) 
        scores.append(curr_scores)
    print_scores(np.array(scores))
    if dump_enabled:
        Y = np.asarray(Y)
        model = get_model(model_type)
        X_train, X_test, y_train, y_test = train_test_split(X, 
                                                            Y, 
                                                            random_state=42, 
                                                            test_size=0.10)
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        dump_results(y_test, y_pred, (dump_path % (model_type)))
    
def dump_results(y_true, y_pred, dump_path):
    pd.DataFrame(data={
            "y_true": y_true,
            "y_pred": y_pred
        }).to_csv(dump_path)
    print("Writter results to \"" + dump_path + "\"")

In [19]:
def get_model(m_type):
    if m_type == 'lr':
        logreg = LogisticRegression(class_weight="balanced")
    elif m_type == 'naive':
        logreg =  MultinomialNB()
    elif m_type == "random_forest":
        logreg = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    elif m_type == "svm":
        logreg = LinearSVC(class_weight="balanced")
    else:
        print "ERROR: Please specify a correst model"
        return None
    return logreg

In [20]:
def train(x_text, labels, MODEL_TYPE, dump_enabled=False, dump_path=""):
    
    if(WORD):
        print("Using word based features")
        bow_transformer = CountVectorizer(analyzer="word",max_features = 10000,stop_words='english').fit(x_text)
        comments_bow = bow_transformer.transform(x_text)
        tfidf_transformer = TfidfTransformer(norm = 'l2').fit(comments_bow)
        comments_tfidf = tfidf_transformer.transform(comments_bow)
        features = comments_tfidf
    else: 
        print("Using char n-grams based features")
        bow_transformer = CountVectorizer(max_features = 10000, ngram_range = (1,2)).fit(x_text)
        comments_bow = bow_transformer.transform(x_text)
        tfidf_transformer = TfidfTransformer(norm = 'l2').fit(comments_bow)
        comments_tfidf = tfidf_transformer.transform(comments_bow)
        features = comments_tfidf
    
    if(data == "twitter"):
        labels = np.array([LABEL_ENCODING_TWITTER[b] for b in labels])
    
    from collections import Counter
    print(Counter(labels))
    
    if(MODEL_TYPE != "all"):
        classification_model(features, labels, MODEL_TYPE, dump_enabled, dump_path)
    else:
        for model_type in models:
            classification_model(features, labels, model_type, dump_enabled, dump_path)

In [9]:
data = "formspring"
WORD =  False
x_text, labels = load_data(data) 
print ("Data loaded!")
train(x_text, labels, MODEL_TYPE, True, "dumps/formspring_%s_ngrams.csv")

NameError: global name 'dataset' is not defined

In [16]:
data = "formspring"
WORD = True
x_text, labels = load_data(data) 
print ("Data loaded!")
train(x_text, labels, MODEL_TYPE, True, "dumps/formspring_%s_wordbased.csv")

Data loaded!
Using word based features
Counter({0: 11997, 1: 776})
Model Type: svm
Precision Class none (avg): 0.692 (+/- 0.557)
Recall Class none (avg): 0.692 (+/- 0.557)
F1_score Class none (avg): 0.692 (+/- 0.557)
Precision Class bully (avg): 0.739 (+/- 0.438)
Recall Class bully (avg): 0.739 (+/- 0.438)
F1_score Class bully (avg): 0.739 (+/- 0.438)
Writter results to "dumps/formspring_svm_wordbased.csv"
Model Type: naive
Precision Class none (avg): 0.757 (+/- 0.765)
Recall Class none (avg): 0.757 (+/- 0.765)
F1_score Class none (avg): 0.757 (+/- 0.765)
Precision Class bully (avg): 0.506 (+/- 0.987)
Recall Class bully (avg): 0.506 (+/- 0.987)
F1_score Class bully (avg): 0.506 (+/- 0.987)
Writter results to "dumps/formspring_naive_wordbased.csv"
Model Type: lr
Precision Class none (avg): 0.691 (+/- 0.570)
Recall Class none (avg): 0.691 (+/- 0.570)
F1_score Class none (avg): 0.691 (+/- 0.570)
Precision Class bully (avg): 0.779 (+/- 0.337)
Recall Class bully (avg): 0.779 (+/- 0.337)
F1_

In [14]:
data = "twitter"
WORD = False
x_text, labels = load_data(data) 
print ("Data loaded!")
train(x_text, labels, MODEL_TYPE, True, "dumps/twitter_%s_ngrams.csv")

Data loaded!
Using char n-grams based features
Counter({2: 11036, 1: 3117, 0: 1937})
Model Type: svm
Precision Class racism (avg): 0.803 (+/- 0.141)
Recall Class racism (avg): 0.803 (+/- 0.141)
F1_score Class racism (avg): 0.803 (+/- 0.141)
Precision Class sexism (avg): 0.805 (+/- 0.145)
Recall Class sexism (avg): 0.805 (+/- 0.145)
F1_score Class sexism (avg): 0.805 (+/- 0.145)
Precision Class none (avg): 0.803 (+/- 0.133)
Recall Class none (avg): 0.803 (+/- 0.133)
F1_score Class none (avg): 0.803 (+/- 0.133)
Writter results to "dumps/twitter_svm_ngrams.csv"
Model Type: naive
Precision Class racism (avg): 0.835 (+/- 0.119)
Recall Class racism (avg): 0.835 (+/- 0.119)
F1_score Class racism (avg): 0.835 (+/- 0.119)
Precision Class sexism (avg): 0.651 (+/- 0.450)
Recall Class sexism (avg): 0.651 (+/- 0.450)
F1_score Class sexism (avg): 0.651 (+/- 0.450)
Precision Class none (avg): 0.707 (+/- 0.248)
Recall Class none (avg): 0.707 (+/- 0.248)
F1_score Class none (avg): 0.707 (+/- 0.248)
Wri

In [15]:
data = "twitter"
WORD = True
x_text, labels = load_data(data) 
print ("Data loaded!")
train(x_text, labels, MODEL_TYPE, True, "dumps/twitter_%s_wordbased.csv")

Data loaded!
Using word based features
Counter({2: 11036, 1: 3117, 0: 1937})
Model Type: svm
Precision Class racism (avg): 0.810 (+/- 0.141)
Recall Class racism (avg): 0.810 (+/- 0.141)
F1_score Class racism (avg): 0.810 (+/- 0.141)
Precision Class sexism (avg): 0.810 (+/- 0.142)
Recall Class sexism (avg): 0.810 (+/- 0.142)
F1_score Class sexism (avg): 0.810 (+/- 0.142)
Precision Class none (avg): 0.809 (+/- 0.129)
Recall Class none (avg): 0.809 (+/- 0.129)
F1_score Class none (avg): 0.809 (+/- 0.129)
Writter results to "dumps/twitter_svm_wordbased.csv"
Model Type: naive
Precision Class racism (avg): 0.836 (+/- 0.109)
Recall Class racism (avg): 0.836 (+/- 0.109)
F1_score Class racism (avg): 0.836 (+/- 0.109)
Precision Class sexism (avg): 0.654 (+/- 0.443)
Recall Class sexism (avg): 0.654 (+/- 0.443)
F1_score Class sexism (avg): 0.654 (+/- 0.443)
Precision Class none (avg): 0.710 (+/- 0.242)
Recall Class none (avg): 0.710 (+/- 0.242)
F1_score Class none (avg): 0.710 (+/- 0.242)
Writter 

In [21]:
data = "wiki"
WORD = False
x_text, labels = load_data(data) 
print ("Data loaded!")
train(x_text, labels, MODEL_TYPE, True, "dumps/wiki_%s_ngrams.csv")

Data loaded!
Using char n-grams based features


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Counter({0: 51149, 1: 6783})
Model Type: svm
Precision Class none (avg): 0.787 (+/- 0.366)
Recall Class none (avg): 0.787 (+/- 0.366)
F1_score Class none (avg): 0.787 (+/- 0.366)
Precision Class attack (avg): 0.858 (+/- 0.151)
Recall Class attack (avg): 0.858 (+/- 0.151)
F1_score Class attack (avg): 0.858 (+/- 0.151)
Writter results to "dumps/wiki_svm_ngrams.csv"
Model Type: naive
Precision Class none (avg): 0.901 (+/- 0.078)
Recall Class none (avg): 0.901 (+/- 0.078)
F1_score Class none (avg): 0.901 (+/- 0.078)
Precision Class attack (avg): 0.752 (+/- 0.476)
Recall Class attack (avg): 0.752 (+/- 0.476)
F1_score Class attack (avg): 0.752 (+/- 0.476)
Writter results to "dumps/wiki_naive_ngrams.csv"
Model Type: lr
Precision Class none (avg): 0.789 (+/- 0.374)
Recall Class none (avg): 0.789 (+/- 0.374)
F1_score Class none (avg): 0.789 (+/- 0.374)
Precision Class attack (avg): 0.877 (+/- 0.104)
Recall Class attack (avg): 0.877 (+/- 0.104)
F1_score Class attack (avg): 0.877 (+/- 0.104)
Writ

In [22]:
data = "wiki"
WORD = True
x_text, labels = load_data(data) 
print ("Data loaded!")
train(x_text, labels, MODEL_TYPE, True, "dumps/wiki_%s_wordbased.csv")

Data loaded!
Using word based features
Counter({0: 51149, 1: 6783})
Model Type: svm
Precision Class none (avg): 0.788 (+/- 0.366)
Recall Class none (avg): 0.788 (+/- 0.366)
F1_score Class none (avg): 0.788 (+/- 0.366)
Precision Class attack (avg): 0.859 (+/- 0.148)
Recall Class attack (avg): 0.859 (+/- 0.148)
F1_score Class attack (avg): 0.859 (+/- 0.148)
Writter results to "dumps/wiki_svm_wordbased.csv"
Model Type: naive
Precision Class none (avg): 0.927 (+/- 0.022)
Recall Class none (avg): 0.927 (+/- 0.022)
F1_score Class none (avg): 0.927 (+/- 0.022)
Precision Class attack (avg): 0.738 (+/- 0.514)
Recall Class attack (avg): 0.738 (+/- 0.514)
F1_score Class attack (avg): 0.738 (+/- 0.514)
Writter results to "dumps/wiki_naive_wordbased.csv"
Model Type: lr
Precision Class none (avg): 0.798 (+/- 0.352)
Recall Class none (avg): 0.798 (+/- 0.352)
F1_score Class none (avg): 0.798 (+/- 0.352)
Precision Class attack (avg): 0.873 (+/- 0.125)
Recall Class attack (avg): 0.873 (+/- 0.125)
F1_sco