### Import stuff

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import numpy as np
import scipy.sparse as sp
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import StratifiedShuffleSplit
from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.neural_network import MLPClassifier, BernoulliRBM
import sklearn.metrics as metrics
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.scorer import make_scorer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC, SVR, LinearSVR
from sklearn import linear_model as sklean_lms
from sklearn.naive_bayes import GaussianNB



### Approach 1

In [3]:
def read_data(filename):
    return pd.read_csv(filename) 

def get_tfidf_vectorizer(sentences):
    _m = TfidfVectorizer(max_df=0.5, max_features = 5000,
                                 min_df=2, stop_words='english',
                                 use_idf=True)
    _m.fit(sentences)
    return _m

def get_range_tfidf_vectorizer(sentences):
    _m = TfidfVectorizer(max_df=0.5, ngram_range=(2,3), max_features = 5000,
                                 min_df=1, stop_words='english',
                                 use_idf=True)
    _m.fit(sentences)
    return _m

def get_vectors(m1, m2, sentences):
    x1 = m1.transform(sentences)
    x2 = m2.transform(sentences)
    return sp.hstack([x1, x2])

class ExtendedMultiOutputClassifier(MultiOutputClassifier):
    def transform(self, X):
        """
        Add a transform method to the classifier because it is mandatory for steps of a pipeline
        to provide fit and transform methods.
        """
        # for RF
        _o = self.predict_proba(X)
        return np.concatenate(_o, axis=1)
        # for SVM
        #return self.predict(X)

In [4]:
INDATA_LOCATION = 'C:/Users/sharm/Desktop/Dat5Melb/Final_Project/Datasets/train/train.csv'

# utility definitions for easier handling of the dataset column names
TEXT_COLUMN = 'comment_text'
CLASS_TOXIC, CLASS_SEVER_TOXIC, CLASS_OBSCENE, CLASS_THREAT, CLASS_INSULT, \
    CLASS_IDENTITY_HATE = ["toxic", "severe_toxic", "obscene", "threat", \
                           "insult", "identity_hate"]
CLASSES = [CLASS_TOXIC, CLASS_SEVER_TOXIC, CLASS_OBSCENE, CLASS_THREAT, CLASS_INSULT, CLASS_IDENTITY_HATE]

# read the comments and associated classification data 
dataDf = read_data(INDATA_LOCATION)
print(dataDf.shape)    

(159571, 8)


In [5]:
dataDf.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [75]:
perf = []

## shuffle and split the dataset stratified by the number of classifications of a data point
## for balancing across resulting modeling and evaluation datasets
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.8, random_state=0)
for train_index, test_index in sss.split(np.zeros(len(dataDf)), dataDf[CLASSES].sum(axis=1)):
    pass
dataDf_modeling = dataDf.iloc[train_index]
dataDf_testing = dataDf.iloc[test_index]

## lets create one doc2vec model from everything
print('\tBuilding model')
model1 = get_tfidf_vectorizer(dataDf_modeling[TEXT_COLUMN])
model2 = get_range_tfidf_vectorizer(dataDf_modeling[TEXT_COLUMN])

## lets prepare the data vectors
#X_orig = model.transform(dataDf_modeling[TEXT_COLUMN])
## lets reduce the dims
print('\tMaking vectors')
X = get_vectors(model1, model2, dataDf_modeling[TEXT_COLUMN])

	Building model
	Making vectors


In [82]:
## learn SVCs
print('\tBuilding SVRs')
moc = ExtendedMultiOutputClassifier(SVR(C=1e-6, cache_size=50, decision_function_shape='ovo', probability=True, kernel='linear', max_iter=-1, random_state=1, tol=0.001))
moc.fit(X, dataDf_modeling[CLASSES])

	Building SVRs


ExtendedMultiOutputClassifier(estimator=SVR(C=1e-06, cache_size=50, coef0=0.0, degree=3, epsilon=0.001, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
               n_jobs=1)

In [100]:
dataDf_modeling.shape

(31914, 8)

In [None]:
## learn MLP
print('\tBuilding NN')
mlp = MLPClassifier(solver='sgd', activation='logistic', learning_rate='adaptive', max_iter=1000, momentum=0.9, alpha=1e-8, hidden_layer_sizes=(100, 100), random_state=1, tol=1e-15)
mlp.fit(moc.transform(X), dataDf_modeling[CLASSES])

In [None]:
## predict on test set
print('\tPrepare test vectors')
X_test = get_vectors(model1, model2, dataDf_testing[TEXT_COLUMN])

print('\tPredicting classes')
predicted = mlp.predict_proba(moc.transform(X_test))

predictedDf = pd.DataFrame(predicted)
predictedDf.columns = CLASSES

print('\tEvaluating')
# mean auc
aucs = map(lambda klass: metrics.roc_auc_score(dataDf_testing[klass], predictedDf[klass]), CLASSES)
# MSE
d = predicted - dataDf_testing[CLASSES]
sq_difs = map(lambda x: np.dot(x, x.T), d.as_matrix())

print('\tMean AUC: %f' %np.mean(aucs))
print('MSE: %f' %(np.sum(sq_difs) * 1.0 / len(d)))

In [37]:
# if False:
TESTDATA_LOCATION = 'C:/Users/sharm/Desktop/Dat5Melb/Final_Project/Datasets/test1/test.csv'
testDf = pd.read_csv(TESTDATA_LOCATION)

X_sub = get_vectors(model1, model2, testDf[TEXT_COLUMN])
y_sub = pd.DataFrame(mlp.predict_proba(moc.transform(X_sub)))
y_sub.columns = CLASSES

subDf = pd.concat([testDf['id'], y_sub], axis=1)

subDf.head()
subDf.to_csv('C:/Users/sharm/Desktop/Dat5Melb/Final_Project/submission.csv', index=False)