In [24]:
import sys
import math
 
import numpy as np
from sklearn.grid_search import GridSearchCV
 
import xgboost as xgb

import pandas as pd
import numpy as np
import itertools
import sklearn.metrics as met
import scipy as sp

In [34]:
# Read the training and test chat-based feature CSVs into DataFrames.
# Paths are relative to the notebook's working directory.
training = pd.read_csv('../../csv/chat_based_features_training.csv')
test = pd.read_csv('../../csv/chat_based_features_test.csv')

In [48]:
# Read the training and test chat-based feature CSVs into DataFrames.
training = pd.read_csv('../../csv/chat_based_features_training.csv')
test = pd.read_csv('../../csv/chat_based_features_test.csv')

# Human-readable names of the chat-based features.
# NOTE(review): this list is not used for column selection below -- the
# feature columns are picked by position; confirm both agree with the CSV.
features = ['number of conversation', 'percent of conversations started by the author', 'number of messages sent',
            'average percent of lines in conversation', 'number of characters sent by the author']

# Build float CSR matrices from every column except the first and the last.
# `.iloc` is the explicit positional indexer; it replaces the deprecated
# `.ix`, which was removed in pandas 1.0 and only behaved positionally here
# because the column labels are strings.
# NOTE(review): presumably the first column is an identifier and the last is
# the 'is sexual predator' target -- confirm against the CSV header.
training_sparse_chat_based = sp.sparse.csr_matrix(training.iloc[:, 1:-1].values, dtype=float)
test_sparse_chat_based = sp.sparse.csr_matrix(test.iloc[:, 1:-1].values, dtype=float)

In [None]:
class XGBoostClassifier(object):
    """Small scikit-learn style wrapper around ``xgb.train``.

    The booster is always trained with the ``multi:softprob`` objective, so
    ``predict_proba`` returns one probability column per class and
    ``predict`` maps the most probable column back to the original label.

    Parameters
    ----------
    num_boost_round : int
        Number of boosting rounds passed to ``xgb.train``.
    **params
        Booster parameters forwarded unchanged to ``xgb.train``
        (``objective`` is always overwritten with ``multi:softprob``).
    """

    def __init__(self, num_boost_round=10, **params):
        self.clf = None
        # Filled by fit(): maps each original label to its class index.
        self.label2num = None
        self.num_boost_round = num_boost_round
        self.params = params
        self.params.update({'objective': 'multi:softprob'})

    def fit(self, X, y, num_boost_round=None):
        """Train the booster on ``X`` with labels ``y`` and return ``self``.

        ``y`` may be a list, a 1-D array/Series or a single-column
        DataFrame; it is flattened first because iterating a DataFrame
        yields its column names rather than its values.
        """
        num_boost_round = num_boost_round or self.num_boost_round
        y = np.asarray(y).ravel().tolist()
        # Class indices follow the sorted order of the distinct labels.
        self.label2num = dict((label, i) for i, label in enumerate(sorted(set(y))))
        dtrain = xgb.DMatrix(X, label=[self.label2num[label] for label in y])
        self.clf = xgb.train(params=self.params, dtrain=dtrain, num_boost_round=num_boost_round)
        return self

    def predict(self, X):
        """Return the most probable original label for every row of ``X``."""
        num2label = dict((i, label) for label, i in self.label2num.items())
        Y = self.predict_proba(X)
        y = np.argmax(Y, axis=1)
        return np.array([num2label[i] for i in y])

    def predict_proba(self, X):
        """Return the booster's per-class probabilities for ``X``."""
        dtest = xgb.DMatrix(X)
        return self.clf.predict(dtest)

    def score(self, X, y):
        """Return ``1 / logloss`` of the predictions on ``X`` (higher is better).

        A loss of exactly zero yields ``inf`` instead of raising
        ``ZeroDivisionError``.
        """
        Y = self.predict_proba(X)
        # Flatten y for the same reason as in fit().
        loss = logloss(np.asarray(y).ravel().tolist(), Y)
        if loss == 0:
            return float('inf')
        return 1.0 / loss

    def get_params(self, deep=True):
        """Return a copy of all constructor arguments, ``num_boost_round`` included.

        Including ``num_boost_round`` lets ``XGBoostClassifier(**get_params())``
        rebuild an equivalent estimator, which is how scikit-learn clones
        estimators during a grid search. ``deep`` is accepted for API
        compatibility and is not used.
        """
        params = dict(self.params)
        params['num_boost_round'] = self.num_boost_round
        return params

    def set_params(self, **params):
        """Update parameters in place and return ``self``.

        ``num_boost_round`` is stored on the instance; ``objective`` is
        ignored so the estimator always stays on ``multi:softprob``.
        """
        if 'num_boost_round' in params:
            self.num_boost_round = params.pop('num_boost_round')
        if 'objective' in params:
            del params['objective']
        self.params.update(params)
        return self
    
    
def logloss(y_true, Y_pred, labels=None):
    """Mean negative log-likelihood of the true labels.

    Parameters
    ----------
    y_true : iterable of labels
        True label of each sample.
    Y_pred : sequence of per-class probability rows
        ``Y_pred[k][j]`` is the predicted probability that sample ``k``
        belongs to the ``j``-th class, classes being in sorted label order.
    labels : iterable of labels, optional
        The full set of classes the columns of ``Y_pred`` refer to. When
        omitted it is derived from ``y_true``, which is only correct if
        every class occurs in ``y_true``; pass it explicitly when scoring a
        subset (e.g. a CV fold) that may lack a class.

    Returns
    -------
    float
        The log loss; ``inf`` if any true class was given probability 0.
    """
    if labels is None:
        labels = set(y_true)
    label2num = dict((name, i) for i, name in enumerate(sorted(labels)))
    total = 0.0
    for probs, label in zip(Y_pred, y_true):
        p = probs[label2num[label]]
        # log(0) is undefined; treat it as -inf so the loss becomes +inf.
        total += math.log(p) if p > 0 else -np.inf
    return -1 * total / len(Y_pred)


def main():
    """Grid-search XGBoostClassifier hyper-parameters and return the best set.

    Reads the module-level ``training`` frame and the
    ``training_sparse_chat_based`` / ``test_sparse_chat_based`` matrices.
    Prints the best cross-validated score, the winning parameters and the
    predictions of the refitted best model on the test matrix.

    Returns
    -------
    dict
        The parameter combination with the highest mean validation score.
    """
    clf = XGBoostClassifier(
        eval_metric = 'auc',
        num_class = 2,
        nthread = 4,
        eta = 0.1,
        num_boost_round = 80,
        max_depth = 12,
        subsample = 0.5,
        colsample_bytree = 1.0,
        silent = 1,
        )
    # Only parameters that xgb.train actually consumes are searched.
    # 'n_estimators' and 'bootstrap' are not booster parameters (the number
    # of rounds is 'num_boost_round' here) and 'learning_rate' is an alias of
    # 'eta', so listing them only multiplied the number of fits.
    parameters = {
        'num_boost_round': [100, 250, 500],
        'eta': [0.05, 0.1, 0.17, 0.3],
        'max_depth': [6, 9, 12, 17, 22, 30],
        'subsample': [0.9, 1.0],
        'colsample_bytree': [0.9, 1.0],
        'gamma': [3, 7, 13, 17, 25],
        'scale_pos_weight': [1, 3, 5, 8, 11, 15, 18, 22],
    }
    clf = GridSearchCV(clf, parameters, n_jobs=4, cv=2)

    # Pass the target as a 1-D array: iterating a one-column DataFrame
    # yields its column name, not the labels.
    clf.fit(training_sparse_chat_based, training['is sexual predator'].values)

    # best_params_/best_score_ hold the combination with the highest mean
    # validation score and exist in both the legacy and current GridSearchCV.
    best_parameters = clf.best_params_
    score = clf.best_score_
    print(score)
    for param_name in sorted(best_parameters.keys()):
        print("%s: %r" % (param_name, best_parameters[param_name]))

    print(clf.predict(test_sparse_chat_based))
    return best_parameters

In [None]:
# Run the grid search; b_p is the best-parameter dict returned by main().
b_p = main()

In [None]:
# Show the best parameters; the call form prints identically on Python 2 and 3.
print(b_p)

In [None]:
# Hard-coded hyper-parameter set using xgb.train parameter names.
# NOTE(review): presumably copied from an earlier grid-search run; no visible
# cell reads `best_parameters` (the final model is built from `b_p`) -- confirm
# whether this cell is still needed.
best_parameters = {
    'colsample_bytree': 0.9,
    'eta': 0.3,
    'max_depth': 6,
    'num_boost_round': 100,
    'subsample': 1.0
}

In [None]:
# Translate the grid-search result (xgb.train parameter names) to the
# argument names of the scikit-learn style XGBClassifier.
final_params = dict(b_p)
if 'num_boost_round' in final_params:
    final_params['n_estimators'] = final_params.pop('num_boost_round')
if 'eta' in final_params:
    final_params['learning_rate'] = final_params.pop('eta')

# The parameters must be unpacked as keyword arguments; passed positionally
# the whole dict would land in the constructor's first argument.
# The model gets its own name so the `xgb` module is not shadowed and the
# cell can be re-run.
xgb_clf = xgb.XGBClassifier(**final_params)
# Fit on a 1-D label array rather than a one-column DataFrame.
xgb_clf.fit(training_sparse_chat_based, training['is sexual predator'].values)
prediction = xgb_clf.predict(test_sparse_chat_based)

In [None]:
# Evaluate the test-set predictions against the true labels (1-D Series).
y_test = test['is sexual predator']

# Single-argument print calls with %-formatting give the same output on
# Python 2 and Python 3 (spacing matches the original print statements).
print('Accuracy:  %s' % met.accuracy_score(y_test, prediction))
print('Precision:  %s' % met.precision_score(y_test, prediction))
print('Recall: %s' % met.recall_score(y_test, prediction))
# beta is passed by keyword; newer scikit-learn releases require that.
print('F1: %s' % met.fbeta_score(y_test, prediction, beta=1))