### Imports

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import numpy as np
import scipy.sparse as sp
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import StratifiedShuffleSplit
from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.neural_network import MLPClassifier, BernoulliRBM
import sklearn.metrics as metrics
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.scorer import make_scorer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC, SVR, LinearSVR
from sklearn import linear_model as sklean_lms
from sklearn.naive_bayes import GaussianNB

# conda install -c glemaitre imbalanced-learn
from imblearn.over_sampling import SMOTE, ADASYN



In [2]:
# Input CSV locations: the data used for modeling and the data to score for submission.
SUBMISSION_DATA_LOCATION = '/home/asharma/data/toxic_challenge/test.csv'
MODELING_DATA_LOCATION = '/home/asharma/data/toxic_challenge/train.csv'

# Name of the column that holds the raw comment text.
TEXT_COLUMN = 'comment_text'

# Label column names; this order is reused for the per-label models and the
# submission columns.
CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# One alias per label column, unpacked in CLASSES order.
(CLASS_TOXIC,
 CLASS_SEVER_TOXIC,
 CLASS_OBSCENE,
 CLASS_THREAT,
 CLASS_INSULT,
 CLASS_IDENTITY_HATE) = CLASSES

# Load the full modeling CSV into a DataFrame; the train/test split is made from it later.
modelingDataDf = pd.read_csv(filepath_or_buffer=MODELING_DATA_LOCATION)

In [3]:
# Hold out 35% of the modeling data as a test set. The split is stratified on the
# number of positive labels per row (row-wise sum over the CLASSES columns).
sss = StratifiedShuffleSplit(test_size=0.35, n_splits=1, random_state=0)

# n_splits=1, so the generator yields exactly one (train, test) pair of positional indices.
train_index, test_index = next(
    sss.split(
        np.zeros(len(modelingDataDf)),          # dummy feature array of the right length
        modelingDataDf[CLASSES].sum(axis=1)     # stratification target
    )
)

testDataDf = modelingDataDf.iloc[test_index]
trainingDataDf = modelingDataDf.iloc[train_index]

In [4]:
# Turn the comment text into TF-IDF features over word 1- to 3-grams,
# keeping at most 5000 features.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1,3),
    min_df=2,
    max_df=0.8,
    max_features=5000,
    use_idf=True,
    smooth_idf=True
)

# The vectorizer is fitted on the training split only; the held-out split is
# transformed with that fitted vectorizer.
X_training = vectorizer.fit_transform(trainingDataDf[TEXT_COLUMN])
X_testing = vectorizer.transform(testDataDf[TEXT_COLUMN])

In [5]:
# build RFs: one binary random forest per label column, each trained on a
# SMOTE-oversampled version of the training features.
RF_RANDOM_STATE = 0  # fixed seed so resampling and forests are reproducible across runs
rf_models = []

for klass in CLASSES:
    print('Processing %s' %klass)
    rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion="entropy",
                                oob_score=True, verbose=0, random_state=RF_RANDOM_STATE)
    smote = SMOTE(random_state=RF_RANDOM_STATE)
    # imbalanced-learn renamed `fit_sample` to `fit_resample` in later releases;
    # use whichever method the installed version provides.
    resample = smote.fit_resample if hasattr(smote, 'fit_resample') else smote.fit_sample
    X_resampled, y_resampled = resample(X_training, trainingDataDf[klass])
    rf.fit(X_resampled, y_resampled)
    # NOTE: this is accuracy on the same resampled data the forest was fitted to,
    # so it is not a generalization estimate.
    print('Rf score %f' %rf.score(X_resampled, y_resampled))
    rf_models.append(rf)

Processing toxic
Rf score 0.995863
Processing severe_toxic
Rf score 0.996431
Processing obscene
Rf score 0.996293
Processing threat
Rf score 0.996881
Processing insult
Rf score 0.995984
Processing identity_hate
Rf score 0.996694


In [6]:
# build NN
def get_rf_predictions(rfs, X):
    """Stack the class-probability outputs of several fitted models column-wise.

    Every model in ``rfs`` is used, in order. The previous implementation
    zipped ``rfs`` with the global ``CLASSES`` list, which silently dropped
    any models beyond ``len(CLASSES)`` and tied this helper to module state.

    Parameters
    ----------
    rfs : iterable of fitted models
        Each model must expose ``predict_proba(X)`` returning a 2-D array
        with one row per row of ``X``.
    X : array-like or sparse matrix
        Feature matrix, passed unchanged to every model.

    Returns
    -------
    numpy.ndarray
        The ``predict_proba`` outputs concatenated along axis 1, in the
        order the models appear in ``rfs``.

    Raises
    ------
    ValueError
        If ``rfs`` contains no models.
    """
    models = list(rfs)
    if not models:
        raise ValueError('get_rf_predictions needs at least one fitted model')
    return np.concatenate([model.predict_proba(X) for model in models], axis=1)

print('Preparing NN')
# Second-stage inputs: the per-label forest probabilities for the training rows.
# NOTE(review): these are the same rows the forests were trained on (before
# oversampling), so the MLP may see over-confident inputs; confirm this is intended.
X_nn_training = get_rf_predictions(rf_models, X_training)

# Multi-label MLP with two hidden layers of 100 logistic units, trained with SGD.
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    activation='logistic',
    solver='sgd',
    learning_rate='adaptive',
    momentum=0.9,
    alpha=1e-6,
    tol=1e-15,
    random_state=1
)
mlp.fit(X_nn_training, trainingDataDf[CLASSES])



MLPClassifier(activation='logistic', alpha=1e-06, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='sgd', tol=1e-15, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [7]:
# evaluate the stacked model (forests -> MLP) on the held-out split
print('Preparing evaluation set')
predictions = mlp.predict_proba(get_rf_predictions(rf_models, X_testing))

# MSE
# `d` holds the element-wise residuals (predicted probability minus true label).
d = predictions - testDataDf[CLASSES]
# Squared error summed over the label columns, one value per row. This is
# vectorised instead of `map(...)` over `as_matrix()`: on Python 3 `map` is a
# lazy iterator that `np.sum` cannot reduce, and `as_matrix` no longer exists
# in current pandas.
sq_difs = np.sum(np.square(d.values), axis=1)
# The printed value is the per-row summed squared error averaged over rows
# (same number as before); divide by len(CLASSES) for an element-wise mean.
print('MSE: %f' %(np.sum(sq_difs) * 1.0 / len(d)))

MSE: 0.115752


In [8]:
# Score the submission dataset with the fitted vectorizer, forests and MLP,
# then write one probability column per label next to the row id.
submissionDataDf = pd.read_csv(SUBMISSION_DATA_LOCATION)

print('Getting predictions for submission dataset')
X_submission = vectorizer.transform(submissionDataDf[TEXT_COLUMN])
predictions = mlp.predict_proba(get_rf_predictions(rf_models, X_submission))

# Probability columns are labelled in CLASSES order.
y_sub = pd.DataFrame(predictions, columns=CLASSES)

# Column-wise concat pairs each id with its predictions by row index.
subDf = pd.concat([submissionDataDf['id'], y_sub], axis=1)

print(subDf.head())
print('Writing submissions file')
subDf.to_csv('/home/asharma/data/toxic_challenge/submission.csv', index=False)