### Imports for this Notebook

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import StratifiedShuffleSplit
from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.neural_network import MLPClassifier, BernoulliRBM
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC



### Read data

In [2]:
# Location of the training data file.
INDATA_LOCATION = 'C:/Users/sharm/Desktop/Dat5Melb/Final_Project/Datasets/train/train_orig.csv'
#INDATA_LOCATION = '/home/asharma/data/toxic_challenge/train.csv'

# Name of the column that holds the raw comment text.
TEXT_COLUMN = 'comment_text'

# One constant per label column, so the rest of the notebook never has to
# spell the column names out by hand.
CLASS_TOXIC = "toxic"
CLASS_SEVER_TOXIC = "severe_toxic"
CLASS_OBSCENE = "obscene"
CLASS_THREAT = "threat"
CLASS_INSULT = "insult"
CLASS_IDENTITY_HATE = "identity_hate"

# All label columns, in a fixed order.
CLASSES = [
    CLASS_TOXIC,
    CLASS_SEVER_TOXIC,
    CLASS_OBSCENE,
    CLASS_THREAT,
    CLASS_INSULT,
    CLASS_IDENTITY_HATE,
]

def read_data(filename):
    """Load the CSV at `filename` (path or file-like) and return it as a DataFrame."""
    frame = pd.read_csv(filename)
    return frame

# Load the comments with their labels and add `klass_count`: the number of
# label columns that are set for each comment.
dataDf = read_data(INDATA_LOCATION).assign(
    klass_count=lambda frame: frame[CLASSES].sum(axis=1)
)

### Basic data characteristics

In [3]:
def basic_characteristics(df):
    """
    Print the number of rows in `df`, followed by one line per label in
    CLASSES giving how many rows have that label equal to 1.
    """
    print('Number of data points: %d' % len(df))
    for klass in CLASSES:
        # A boolean mask sums to the number of matching rows.
        positives = (df[klass] == 1).sum()
        print('Number data points of type %s: %d' % (klass, positives))
# Report the row count and per-label counts for the full training set.
basic_characteristics(dataDf)

Number of data points: 159571
Number data points of type toxic: 15294
Number data points of type severe_toxic: 1595
Number data points of type obscene: 8449
Number data points of type threat: 478
Number data points of type insult: 7877
Number data points of type identity_hate: 1405


In [4]:
if False:
    # Disabled by default: bar chart of how many labels each comment carries.
    ax = plt.gca()
    labels, counts = np.unique(dataDf[CLASSES].sum(axis=1), return_counts=True)
    ax.bar(labels, counts, align='center')
    ax.set_title('Histogram of number of classes per datapoint')
    ax.set_xlabel('Number of classes per datapoint')
    ax.set_xticks(labels)
    plt.show()

### Embed text in vector space

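We embed the comment text into vector space in preparation for building classification models. The `CommentVectorizer` class below wraps simple count-based and TF-IDF vectorizers; the embedding actually used in this run is a Doc2Vec model.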

In [None]:
class CommentVectorizer:
    """
    Holds text vectorizers and hands out integer handles to them.

    Callers create a vectorizer with one of the ``get_*`` methods, receive an
    index, and pass that index to ``fit`` / ``transform``. The vectorizer
    objects themselves stay private to this class.

    A doc2vec model can be trained with ``doc2vec_fit_transform``; it is kept
    on the instance so that ``doc2vec_transform`` can embed new text later.
    """
    def __init__(self):
        self._vectorizers = []
        # Set by doc2vec_fit_transform(); required by doc2vec_transform().
        self._doc2vec_model = None

    def _register(self, vectorizer):
        """Store `vectorizer` and return the index (handle) it was stored at."""
        self._vectorizers.append(vectorizer)
        return len(self._vectorizers) - 1

    def get_count_vectorizer(self, max_features = 1000, ngram_range = (1, 2),
                             stop_words = 'english', binary = True):
        """
        Initializes a count vectorizer with parameters set by the user and
        returns an index in the internal vector array where the vectorizer
        has been placed. We don't want any external entity manipulating the
        vectorizer state directly.
        """
        return self._register(CountVectorizer(max_features = max_features,
                                              ngram_range = ngram_range,
                                              stop_words = stop_words,
                                              binary = binary))

    def get_tdidf_vectorizer(self, max_features = 5000, use_idf = True):
        """
        Initializes a TF-IDF vectorizer (1- to 3-grams, English stop words,
        max_df=0.8, min_df=2) and returns the index it has been placed at.
        """
        return self._register(TfidfVectorizer(max_df=0.8, max_features=max_features,
                                              min_df=2, stop_words='english',
                                              use_idf=use_idf, ngram_range=(1,3),
                                              smooth_idf=True))

    def doc2vec_fit_transform(self, sentences):
        """
        Method for building a doc2vec model. Unfortunately it does not follow the nice fit/transform
        pattern of the scikit models.

        Each sentence is split on whitespace and tagged with its position.
        The trained model is stored on the instance and the learned vector of
        every input sentence is returned as a list, in input order.
        """
        # Build a real list rather than a lazy map(): on Python 3 a map object
        # can be consumed only once, and it has no len().
        documents = [TaggedDocument(sentence.split(), [position])
                     for position, sentence in enumerate(sentences)]
        self._doc2vec_model = Doc2Vec(documents, size=5000, window=16, min_count=10, workers=8)
        return [self._doc2vec_model.docvecs[position] for position in range(len(documents))]

    def doc2vec_transform(self, sentences):
        """
        Infers a vector for every sentence using the model trained by
        doc2vec_fit_transform and returns the vectors as a list.

        Raises an Exception if no doc2vec model has been trained yet.
        """
        if self._doc2vec_model is None:
            raise Exception('Doc2vec model not initialized.')
        return [self._doc2vec_model.infer_vector(sentence.split()) for sentence in sentences]

    def _exists(self, vectorizer):
        """
        Checks if the vectorizer index provided points to a valid vectorizer.
        Raises an Exception when it does not; returns None otherwise.
        """
        if vectorizer < 0 or len(self._vectorizers) <= vectorizer:
            raise Exception('Vectorizer index out of bound.')

        if self._vectorizers[vectorizer] is None:
            raise Exception('Vectorizer not initialized.')

    def fit(self, comments = [], vectorizer = -1):
        """
        Fits the vectorizer stored at index `vectorizer` on `comments`.
        The default index of -1 is never valid, so a handle must be supplied.
        """
        self._exists(vectorizer)
        self._vectorizers[vectorizer].fit(comments)

    def transform(self, comments, vectorizer):
        """
        Returns `comments` transformed by the vectorizer stored at index `vectorizer`.
        """
        self._exists(vectorizer)
        return self._vectorizers[vectorizer].transform(comments)
    
    
def get_doc2vec_model(sentences):
    """
    Train and return a Doc2Vec model on `sentences`.

    `sentences` must provide `.tolist()` (e.g. a pandas Series of strings).
    Each sentence is split on whitespace and tagged with its position.
    """
    # Materialise the corpus as a list: on Python 3 map() yields a one-shot
    # iterator, and gensim needs to iterate over the corpus more than once.
    documents = [TaggedDocument(text.split(), [position])
                 for position, text in enumerate(sentences.tolist())]
    return Doc2Vec(documents, size=100, window=8, min_count=30, workers=8, dm=1, hs=0, dbow_words=0, dm_concat=1)

def get_doc2vec_vectors(model, sentences):
    """
    Infer one vector per sentence with `model.infer_vector` and return them
    as a list, in input order.

    `sentences` must provide `.tolist()` (e.g. a pandas Series of strings).
    Each sentence is split on whitespace before inference.
    """
    # Return a real list: callers wrap the result in np.array(), which cannot
    # build a matrix from the lazy map object Python 3 would produce.
    return [model.infer_vector(text.split()) for text in sentences.tolist()]

In [None]:
# Alternative embedding (disabled): TF-IDF features via CommentVectorizer.
#commentVectorizer = CommentVectorizer()
#vectorizer = commentVectorizer.get_tdidf_vectorizer()
#commentVectorizer.fit(dataDf[TEXT_COLUMN], vectorizer)

# embed comments into vector space
#commentVectors = commentVectorizer.transform(dataDf[TEXT_COLUMN], vectorizer)

# Train a Doc2Vec model on the training comments, then infer one vector per
# comment with it.
model = get_doc2vec_model(dataDf[TEXT_COLUMN])
print(model)
# commentVectors = map(lambda x: model.docvecs[x], range(len(dataDf)))
# list(...) first: np.array() over a lazy iterator does not yield a 2-D matrix.
commentVectors = np.array(list(get_doc2vec_vectors(model, dataDf[TEXT_COLUMN])))

In [None]:
if False:
    # Disabled by default: project the comment vectors onto two SVD
    # components and draw them as a scatter plot.
    reducer = TruncatedSVD(n_components=2)
    coords = reducer.fit_transform(commentVectors)
    xs, ys = coords[:, 0], coords[:, 1]
    plt.scatter(xs, ys, color='red')
    plt.title('Scatter plot of the comment vectors (reduced)')

### Create modeling and evaluation sets

In [None]:
# shuffle and split the dataset stratified by the number of classifications of a data point
# for balancing across resulting modeling and evaluation datasets
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(sss.split(np.zeros(len(dataDf)), dataDf[CLASSES].sum(axis=1)))

# The split returns row positions, so the labels are selected with .iloc
# (positional), the same way the numpy vectors are indexed. With .loc the rows
# would be looked up by index label, which only matches when the frame has its
# default 0..n-1 index.

# modeling dataset
modeling_vectors = commentVectors[train_index]
modeling_classes = dataDf[CLASSES].iloc[train_index]
print('Modeling data size: %d' %len(modeling_classes))
#basic_characteristics(modeling_classes)

# evaluation dataset
evaluation_vectors = commentVectors[test_index]
evaluation_classes = dataDf[CLASSES].iloc[test_index]
print('Evaluation data size: %d' %len(evaluation_classes))
#basic_characteristics(evaluation_classes)

### Build classification model

In [None]:
if False:
    # Disabled by default: grid search over random forest hyper-parameters
    # with 5-fold cross-validation on the modeling set.
    X = modeling_vectors
    y = modeling_classes

    rfc = RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True)

    param_grid = {
        'n_estimators': [5, 10],
        'max_features': ['auto', 'sqrt', 'log2']
    }

    CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
    CV_rfc.fit(X, y)
    # print() call rather than the Python 2 print statement: the statement
    # form is a SyntaxError on Python 3 even though this branch never runs.
    print(CV_rfc.best_params_)

In [None]:
# SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
#     decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
#     max_iter=-1, probability=False, random_state=None, shrinking=True,
#     tol=0.001, verbose=False)
# RandomForestClassifier(n_estimators = 100, class_weight = 'balanced', n_jobs=-1, criterion="entropy", oob_score=True)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier

class ExtendedMultiOutputClassifier(MultiOutputClassifier):
    """MultiOutputClassifier that can be used as a non-final pipeline step."""

    def transform(self, X):
        """
        Return the predicted probabilities for X as one matrix.

        A pipeline requires its intermediate steps to offer both fit and
        transform, so the output of predict_proba is exposed as the
        "transformed" features: the arrays it returns are joined side by
        side (along axis 1).
        """
        per_output_probas = self.predict_proba(X)
        stacked = np.concatenate(per_output_probas, axis=1)
        return stacked

# Stage 1: a linear SVC with probability estimates, wrapped for multi-output use.
moc = ExtendedMultiOutputClassifier(
    SVC(
        C=1.0,
        cache_size=50,
        class_weight='balanced',
        decision_function_shape='ovr',
        gamma='auto',
        kernel='linear',
        max_iter=-1,
        probability=True,
        random_state=1,
        shrinking=True,
        tol=0.001,
    )
)
# Stage 2: a neural network trained on the output of stage 1.
nnc = MLPClassifier(
    solver='sgd',
    activation='logistic',
    learning_rate='adaptive',
    momentum=0.9,
    alpha=1e-6,
    hidden_layer_sizes=(100, 100),
    random_state=1,
    tol=1e-15,
)
# The list order is the order in which the pipeline runs its steps.
clf = Pipeline([('moc_rf', moc), ('nnc', nnc)])
# Fit each transform in turn, transform the data with it, then fit the final
# estimator on the transformed data.
clf.fit(modeling_vectors, modeling_classes)
# Rudimentary check: predict on the data the pipeline was just fitted on.
predictions = clf.predict_proba(modeling_vectors)

# Calculate the "MSE": the squared error summed over the label columns of each
# row, averaged over the rows. The squares are taken element-wise, which avoids
# the memory error caused by an np.dot over the full matrix without falling
# back to a slow per-row Python loop.
d = predictions - modeling_classes
# .values instead of .as_matrix(): as_matrix was removed from pandas.
sq_difs = np.square(d.values).sum(axis=1)
print('MSE: %f' %(np.sum(sq_difs) * 1.0 / len(d)))

### Evaluation

In [None]:
# Score the held-out evaluation set with the fitted pipeline.
predictions = clf.predict_proba(evaluation_vectors)

# "MSE": squared error summed over the label columns of each row, averaged
# over the rows. Computed element-wise, so no per-row Python loop is needed.
d = predictions - evaluation_classes
# .values instead of .as_matrix(): as_matrix was removed from pandas.
sq_difs = np.square(d.values).sum(axis=1)
print('MSE: %f' %(np.sum(sq_difs) * 1.0 / len(d)))

### Read in test dataset

In [None]:
#testdf = pd.read_csv('/home/asharma/data/toxic_challenge/test.csv')
testdf = pd.read_csv('C:/Users/sharm/Desktop/Dat5Melb/Final_Project/Datasets/test1/test.csv')
# Keep the ids as strings. `str` exists on both Python 2 and 3, whereas
# `basestring` is undefined on Python 3. Bracket access is used because
# attribute-style assignment only works for names that do not clash with
# DataFrame attributes.
testdf['id'] = testdf['id'].astype(str)
testdf.head()

In [None]:
# Show column dtypes and non-null counts of the test set.
testdf.info()

### Embed test dataset

In [None]:
# embed comments into vector space
#testcommentVectors = commentVectorizer.transform(testdf[TEXT_COLUMN], vectorizer)
# The name must be `testcommentVectors` (lower-case c): that is the spelling
# every later use expects. list(...) makes sure np.array() receives a real
# sequence rather than a lazy iterator.
testcommentVectors = np.array(list(get_doc2vec_vectors(model, testdf[TEXT_COLUMN])))

testcommentVectors

### Predict final probabilities

In [None]:
# Predicted probabilities for the test comments, wrapped in a DataFrame whose
# columns get the default integer names.
testpredictions = clf.predict_proba(testcommentVectors)
testpdf = pd.DataFrame(testpredictions)

In [None]:
# Attach the comment ids (joined on the row index), move `id` to the front and
# give the probability columns their label names.
submissiondf = testpdf.join(testdf['id'], how='left').loc[:, ['id', 0, 1, 2, 3, 4, 5]]
submissiondf.columns = [
    "id",
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
submissiondf.head()

In [None]:
#submissiondf.to_csv('/home/asharma/data/toxic_challenge/submission.csv', index=False)

def manual_write(submissiondf,filename):
    """
    Write `submissiondf` to `filename` as comma-separated text.

    The first line holds the column names; every following line holds one
    row, with each value converted via str(). Values are not quoted or
    escaped, so they must not contain commas or newlines.
    """
    # open() works on both Python 2 and 3 (file() is Python 2 only), and the
    # with-block closes the handle even if a write fails part-way through.
    with open(filename, 'w') as of:
        of.write('%s\n' %','.join(map(str, submissiondf.columns)))
        # itertuples walks the rows once and keeps each column's own type,
        # instead of building a Series per row with .iloc.
        for row in submissiondf.itertuples(index=False, name=None):
            of.write('%s\n' %','.join(map(str, row)))

# Write the submission file (relative path, so it lands in the current working directory).
manual_write(submissiondf, 'submission2.csv')

In [None]:
# Show column dtypes and non-null counts of the submission frame.
submissiondf.info()