### Hyperparameter Optimisation for Model 

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import StratifiedShuffleSplit
from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.neural_network import MLPClassifier, BernoulliRBM
import sklearn.metrics as metrics
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.scorer import make_scorer
    
def read_data(filename):
    """Load a CSV file into a pandas DataFrame.

    ``filename`` is forwarded to ``pd.read_csv`` untouched, so whatever that
    function accepts as its first argument is accepted here as well.
    """
    frame = pd.read_csv(filename)
    return frame

class CommentVectorizer:
    """
    Owns a set of text vectorizers and hands out integer handles to them.

    Callers create a vectorizer through one of the ``get_*`` methods, keep the
    returned index, and pass that index to ``fit`` / ``transform``. The
    vectorizer objects themselves are never exposed. A single doc2vec model
    can additionally be trained and queried through the ``doc2vec_*`` methods.
    """

    def __init__(self):
        # Vectorizers in creation order; the list position is the public handle.
        self._vectorizers = []
        # Populated by doc2vec_fit_transform(); None until a model is trained.
        self._doc2vec_model = None

    def get_count_vectorizer(self, max_features=1000, ngram_range=(1, 2),
                             stop_words='english', binary=True):
        """
        Initializes a count vectorizer with parameters set by the user and
        returns the index in the internal vectorizer list where it has been
        placed. We don't want any external entity manipulating the
        vectorizer state directly.
        """
        self._vectorizers.append(CountVectorizer(max_features=max_features,
                                                 ngram_range=ngram_range,
                                                 stop_words=stop_words,
                                                 binary=binary))
        return len(self._vectorizers) - 1

    def get_tdidf_vectorizer(self, max_features=1000, use_idf=True):
        """
        Initializes a TF-IDF vectorizer (max_df=0.5, min_df=2, English stop
        words) and returns its index in the internal vectorizer list.

        The method name is misspelled ("tdidf"); it is kept as-is so that
        existing callers continue to work.
        """
        self._vectorizers.append(TfidfVectorizer(max_df=0.5, max_features=max_features,
                                                 min_df=2, stop_words='english',
                                                 use_idf=use_idf))
        return len(self._vectorizers) - 1

    def doc2vec_fit_transform(self, sentences):
        """
        Method for building a doc2vec model. Unfortunately it does not follow the nice fit/transform
        pattern of the scikit models.

        Each sentence is split on whitespace and tagged with its position in
        ``sentences``. The trained model is stored on the instance and the
        learned document vectors are returned as a list, in input order.
        """
        # Build concrete lists rather than lazy map() objects: in Python 3 a map
        # can be consumed only once, while gensim needs to walk the corpus
        # repeatedly. Iterating the values (instead of sentences[i]) also keeps
        # this working for a pandas Series whose index is not 0..n-1.
        documents = [TaggedDocument(sentence.split(), [position])
                     for position, sentence in enumerate(sentences)]
        # NOTE(review): `size` and `docvecs` are the pre-4.0 gensim spellings
        # (later renamed `vector_size` / `dv`) - confirm the installed version.
        self._doc2vec_model = Doc2Vec(documents, size=100, window=8, min_count=0, workers=8)
        return [self._doc2vec_model.docvecs[position]
                for position in range(len(documents))]

    def doc2vec_transform(self, sentences):
        """
        Infers a vector for every sentence using the model trained by
        ``doc2vec_fit_transform`` and returns them as a list in input order.
        """
        return [self._doc2vec_model.infer_vector(sentence.split())
                for sentence in sentences]

    def _exists(self, vectorizer):
        """
        Checks if the vectorizer index provided points to a valid vectorizer.

        Raises an Exception when the index is outside the internal list or
        when the slot it points to holds None.
        """
        if not 0 <= vectorizer < len(self._vectorizers):
            raise Exception('Vectorizer index out of bound.')

        if self._vectorizers[vectorizer] is None:
            raise Exception('Vectorizer not initialized.')

    def fit(self, comments=(), vectorizer=-1):
        """
        Fits the vectorizer identified by ``vectorizer`` on ``comments``.

        The default index of -1 is never valid, so a handle must always be
        supplied. The default for ``comments`` is an immutable empty tuple to
        avoid sharing a mutable default between calls.
        """
        self._exists(vectorizer)
        self._vectorizers[vectorizer].fit(comments)

    def transform(self, comments, vectorizer):
        """
        Transforms ``comments`` with the vectorizer identified by
        ``vectorizer`` and returns whatever that vectorizer's ``transform``
        returns.
        """
        self._exists(vectorizer)
        return self._vectorizers[vectorizer].transform(comments)

class ExtendedMultiOutputClassifier(MultiOutputClassifier):
    """MultiOutputClassifier that can also act as a transforming pipeline step."""

    def transform(self, X):
        """
        Turn the classifier's probability output into a single feature matrix.

        Intermediate pipeline steps must offer both fit and transform, so this
        method supplies the missing transform: the sequence returned by
        ``predict_proba`` is joined along axis 1 into one array.
        """
        per_output_probabilities = self.predict_proba(X)
        stacked = np.concatenate(per_output_probabilities, axis=1)
        return stacked
    
def multi_roc_auc(ground_truth, predictions):
    """
    Return the mean of the column-wise ROC AUC scores.

    Parameters
    ----------
    ground_truth : 2-D array-like (DataFrame or ndarray)
        True labels, one column per target.
    predictions : 2-D array-like
        Predicted scores or labels with the same column layout.

    Returns
    -------
    The arithmetic mean of ``metrics.roc_auc_score`` evaluated on each column
    pair. Any exception raised by ``roc_auc_score`` propagates to the caller.
    """
    # np.asarray replaces DataFrame.as_matrix(), which newer pandas releases
    # no longer provide, and lets plain ndarrays be passed in as well.
    truth = np.asarray(ground_truth)
    scores = np.asarray(predictions)
    column_aucs = [
        metrics.roc_auc_score(truth[:, col], scores[:, col])
        for col in range(truth.shape[1])
    ]
    return np.mean(column_aucs)


# NOTE(review): with these arguments make_scorer feeds the metric the output
# of estimator.predict(); ROC AUC is normally computed from probabilities
# (needs_proba=True in the scikit-learn releases this file targets). Confirm
# before changing - it alters every score reported by the grid search.
custom_scorer = make_scorer(multi_roc_auc, greater_is_better=True)

if __name__ == '__main__':
    # Stage switches: flip these to choose between the hyper-parameter search
    # and the generation of a submission file from fixed parameters.
    RUN_GRID_SEARCH = True
    GENERATE_SUBMISSION = False

    # location of the training CSV
    INDATA_LOCATION = 'C:/Users/sharm/Desktop/Dat5Melb/Final_Project/Datasets/train/train.csv'

    # column names of the dataset, bound to constants for convenience
    TEXT_COLUMN = 'comment_text'
    CLASS_TOXIC = "toxic"
    CLASS_SEVER_TOXIC = "severe_toxic"
    CLASS_OBSCENE = "obscene"
    CLASS_THREAT = "threat"
    CLASS_INSULT = "insult"
    CLASS_IDENTITY_HATE = "identity_hate"
    CLASSES = [
        CLASS_TOXIC,
        CLASS_SEVER_TOXIC,
        CLASS_OBSCENE,
        CLASS_THREAT,
        CLASS_INSULT,
        CLASS_IDENTITY_HATE,
    ]

    # load the comments together with their labels
    dataDf = read_data(INDATA_LOCATION)

    # Embed the comments with a TF-IDF vectorizer. The doc2vec methods of
    # CommentVectorizer are the alternative embedding for this experiment.
    commentVectorizer = CommentVectorizer()
    vectorizer = commentVectorizer.get_tdidf_vectorizer()
    commentVectorizer.fit(dataDf[TEXT_COLUMN], vectorizer)
    commentVectors = commentVectorizer.transform(dataDf[TEXT_COLUMN], vectorizer)

    # Stage 1: exhaustive search over the MLP hyper-parameters
    if RUN_GRID_SEARCH:
        print("# Tuning hyper-parameters for custom_scorer")
        print()

        moc = ExtendedMultiOutputClassifier(
            RandomForestClassifier(
                n_estimators=50,
                class_weight='balanced',
                n_jobs=-1,
                criterion="entropy",
                oob_score=True,
            )
        )
        nnc = MLPClassifier(solver='sgd', random_state=1)
        pipeline = Pipeline([('moc', moc), ('nnc', nnc)])

        # Only the 'nnc' step is searched. The forest sits inside the
        # multi-output wrapper, which makes its parameters awkward to reach
        # from here; tuning it is left for a separate run.
        param_grid = {
            'nnc__alpha': [.0001, .00001, .000001],
            'nnc__hidden_layer_sizes': range(15, 50, 10),
            'nnc__activation': ['identity', 'tanh', 'logistic', 'relu'],
            'nnc__learning_rate': ['constant', 'invscaling', 'adaptive'],
            'nnc__momentum': [0.9, 0.99, 0.999],
            'nnc__tol': [.0001, .00001, .000001],
        }

        clf = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring=custom_scorer)
        clf.fit(commentVectors, dataDf[CLASSES])

        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        cv_results = clf.cv_results_
        means = cv_results['mean_test_score']
        stds = cv_results['std_test_score']
        for mean, std, params in zip(means, stds, cv_results['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print()

    # Stage 2: train with hand-picked parameters and write test-set predictions
    if GENERATE_SUBMISSION:
        moc = ExtendedMultiOutputClassifier(
            RandomForestClassifier(n_estimators=20, class_weight='balanced')
        )
        nnc = MLPClassifier(
            solver='lbfgs',
            alpha=1e-5,
            hidden_layer_sizes=(15,),
            random_state=1,
        )
        # steps run in the listed order: each earlier step is fitted and used
        # to transform the data before the final estimator is fitted on it
        clf = Pipeline([('emc', moc), ('nnc', nnc)])
        clf.fit(commentVectors, dataDf[CLASSES])

        # load the test comments
        testdf = pd.read_csv('C:/Users/sharm/Desktop/Dat5Melb/Final_Project/Datasets/test1/test.csv')
        testdf.head()  # preview call; its result is not used here
        # project the test comments with the vectorizer fitted on the train set
        testCommentVectors = commentVectorizer.transform(testdf[TEXT_COLUMN], vectorizer)
        testpredictions = clf.predict_proba(testCommentVectors)

        # put the ids next to the predictions and label the columns
        testpdf = pd.DataFrame(data=testpredictions)
        submissiondf = testpdf.join(testdf['id'], how='left')
        submissiondf = submissiondf[['id', 0, 1, 2, 3, 4, 5]]
        submissiondf.columns = [
            "id",
            "toxic",
            "severe_toxic",
            "obscene",
            "threat",
            "insult",
            "identity_hate",
        ]
        submissiondf.head()  # preview call; its result is not used here
        submissiondf.to_csv('C:/Users/sharm/Desktop/Dat5Melb/Final_Project/submission.csv', index=False)