In [1]:
import pandas as pd
import numpy as np
#import itertools
import sklearn.metrics as met
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
import scipy as sp

import sys
sys.path.insert(0, '../../notebooks/libs/')
import FeatureExtraction as FE

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler

In [310]:
training = pd.read_csv('../../csv/chat_based_features_training.csv')
test = pd.read_csv('../../csv/chat_based_features_test.csv')

In [303]:
# Filtering chat based
#minimal_number_of_messages_treshold = 2
#training = training[training['number of messages sent'] >= minimal_number_of_messages_treshold]
#test = test[test['number of messages sent'] >= minimal_number_of_messages_treshold]

In [311]:
features = ['number of conversation', 'percent of conversations started by the author', 'number of messages sent', 'average percent of lines in conversation', 'number of characters sent by the author']

training_sparse = sp.sparse.csr_matrix(training[features].ix[:,1:].values, dtype=float)[:,:-1]
test_sparse = sp.sparse.csr_matrix(test[features].ix[:,1:].values, dtype=float)[:,:-1]

In [312]:
training_xml = '../../dataset/training/pan12-sexual-predator-identification-training-corpus-2012-05-01.xml'
test_xml = '../../dataset/test/pan12-sexual-predator-identification-test-corpus-2012-05-17.xml'

In [None]:
# Filtering TF-IDF
# Original documents_training = FE.prepare_for_tf_idf(training_xml, False)

documents_training = FE.prepare_for_tf_idf(training_xml, False)
documents_test = FE.prepare_for_tf_idf(test_xml, False)

In [None]:
tfidf=TfidfVectorizer(stop_words='english', min_df=3, max_features=2500)
matrix_training=tfidf.fit_transform(documents_training)
matrix_testing=tfidf.transform(documents_test)

In [None]:
print matrix_training.shape
print training_sparse.shape

In [None]:
training_all = sp.sparse.hstack((training_sparse, matrix_training))
test_all = sp.sparse.hstack((test_sparse, matrix_testing))

In [None]:
scaler=StandardScaler(with_mean=False)
training_all=scaler.fit_transform(training_all)
test_all=scaler.transform(test_all)

In [None]:
model = xgb.XGBClassifier(max_depth=14, n_estimators=300, learning_rate=0.2, scale_pos_weight=5, gamma=4,
                          min_child_weight=5, subsample=1)

model.fit(training_all, training[['is sexual predator']])

In [None]:
prediction = model.predict(test_all)

In [None]:
print 'Accuracy: ', met.accuracy_score(test[['is sexual predator']], prediction) 
print 'Precision: ', met.precision_score(test[['is sexual predator']], prediction)
print 'Recall:', met.recall_score(test[['is sexual predator']], prediction)
print 'F1:', met.f1_score(test[['is sexual predator']], prediction)
print 'F0.5:', met.fbeta_score(test[['is sexual predator']], prediction, 0.5)

In [None]:
column_names = training.columns.values.tolist()[1:-1]
print column_names

In [72]:
#training.tail()

In [280]:
# Remove scale_post_weight -> smaller F1 score, lower recall but increase precision
# Remove gama for bigger precision
sex_offender0 = xgb.XGBClassifier(max_depth=13, n_estimators=300, learning_rate=0.007, scale_pos_weight=5, gamma=7)
sex_offender1 = xgb.XGBClassifier(max_depth=25, n_estimators=300)
sex_offender2 = xgb.XGBClassifier(max_depth=5, n_estimators=220, learning_rate=0.007, scale_pos_weight=20, gamma=2)
sex_offender3 = xgb.XGBClassifier(max_depth=3, n_estimators=400)
sex_offender4 = xgb.XGBClassifier(max_depth=7, n_estimators=100)

In [281]:
sex_offender0.fit(training_all, training.iloc[:,[9]])
sex_offender1.fit(training_all, training.iloc[:,[9]])
sex_offender2.fit(training_all, training.iloc[:,[9]])
sex_offender3.fit(training_all, training.iloc[:,[9]])
sex_offender4.fit(training_all, training.iloc[:,[9]])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=7,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [282]:
prediction0 = sex_offender0.predict(test_all)
prediction1 = sex_offender1.predict(test_all)
prediction2 = sex_offender2.predict(test_all)
prediction3 = sex_offender3.predict(test_all)
prediction4 = sex_offender4.predict(test_all)

In [283]:
def bagging(vote_number, prediction_list):
    total_prediction = []
    for i in range(len(prediction_list[0])):
        voters = 0
        
        for prediction in prediction_list:
            if prediction[i] == 1:
                voters += 1
                
        if voters >= vote_number:
            total_prediction.append(1)
        else:
            total_prediction.append(0)
                
    return np.array(total_prediction)

In [288]:
prediction = bagging(4, [prediction0, prediction1, prediction2, prediction3, prediction4])

In [289]:
print 'Accuracy: ', met.accuracy_score(test[['is sexual predator']], prediction) 
print 'Precision: ', met.precision_score(test[['is sexual predator']], prediction)
print 'Recall:', met.recall_score(test[['is sexual predator']], prediction)
print 'F1:', met.fbeta_score(test[['is sexual predator']], prediction, 1)
print 'F0.5:', met.fbeta_score(test[['is sexual predator']], prediction, 0.5)

Accuracy:  0.999286700625
Precision:  0.855072463768
Recall: 0.464566929134
F1: 0.602040816327
F0.5: 0.732009925558


In [None]:
forest = RandomForestClassifier(n_estimators = 100)

max_f1 = 0
best_features = []

for num_features in range(2, len(column_names)-1):
    for column_name_subset in itertools.combinations(column_names, num_features):

        forest = forest.fit(training[list(column_name_subset)], np.ravel(training.iloc[:,[9]]))
        prediction = forest.predict(test[list(column_name_subset)])

        print column_name_subset
        print 'Accuracy: ', met.accuracy_score(test[['is sexual predator']], prediction) 
        print 'Precision: ', met.precision_score(test[['is sexual predator']], prediction)
        print 'Recall:', met.recall_score(test[['is sexual predator']], prediction)
        print 'F1:', met.f1_score(test[['is sexual predator']], prediction)
        print "\n\n"

        f1 = met.f1_score(test[['is sexual predator']], prediction)
        if max_f1 < f1:
            max_f1 = f1
            best_features =  column_name_subset
                                                                                                 
print max_f1
print best_features