In [None]:
import pandas as pd
import numpy as np
import itertools
import sklearn.metrics as met
import scipy as sp

import sys
sys.path.insert(0, '../../notebooks/libs/')
import FeatureExtraction as FE


from sklearn import svm
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score

import sklearn.metrics as met
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler


from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from itertools import product
from sklearn.ensemble import VotingClassifier
from sklearn.grid_search import GridSearchCV

In [None]:
# Load the pre-computed per-author chat statistics for both splits.
training = pd.read_csv('../../csv/chat_based_features_training.csv')
test = pd.read_csv('../../csv/chat_based_features_test.csv')

# Names of the chat-based feature columns (used by later cells to select them
# from the data frames by label).
features = ['number of conversation', 'percent of conversations started by the author', 'number of messages sent',
            'average percent of lines in conversation', 'number of characters sent by the author']

# Build sparse float matrices from every column except the first and the last.
# NOTE(review): the first column is presumably an author id / index and the
# last one the 'is sexual predator' label -- confirm against the CSV header.
# `.iloc` is used for the positional slice; `.ix` is deprecated and no longer
# exists in current pandas releases.
training_sparse_chat_based = sp.sparse.csr_matrix(training.iloc[:, 1:].values, dtype=float)[:, :-1]
test_sparse_chat_based = sp.sparse.csr_matrix(test.iloc[:, 1:].values, dtype=float)[:, :-1]

In [None]:
# Locations of the PAN-12 training and test corpora.
training_xml = '../../dataset/training/pan12-sexual-predator-identification-training-corpus-2012-05-01.xml'
test_xml = '../../dataset/test/pan12-sexual-predator-identification-test-corpus-2012-05-17.xml'

# Turn each corpus into the document collection consumed by the TF-IDF step,
# training corpus first, then test corpus.
# NOTE(review): the meaning of the second argument (False) is defined in
# FeatureExtraction.prepare_for_tf_idf, which is not visible here -- confirm.
documents_training, documents_test = (
    FE.prepare_for_tf_idf(corpus_xml, False)
    for corpus_xml in (training_xml, test_xml)
)

In [None]:
# TF-IDF over the documents: English stop words removed, terms must occur in
# at least 3 documents, and at most 2500 features are kept.
tfidf = TfidfVectorizer(
    max_features=2500,
    min_df=3,
    stop_words='english',
)

# The vectorizer is fitted on the training documents only; the test documents
# are then transformed with that already-fitted vectorizer.
training_sparse_tfidf = tfidf.fit_transform(documents_training)
testing_sparse_tfidf = tfidf.transform(documents_test)

In [None]:
# Fit the max-abs scaler on the training chat features only, then apply that
# same fitted scaler to both splits.
scaler = MaxAbsScaler().fit(training_sparse_chat_based)

training_sparse_chat_based_scaled = scaler.transform(training_sparse_chat_based)
test_sparse_chat_based_scaled = scaler.transform(test_sparse_chat_based)

In [None]:
# Combined feature matrices: the TF-IDF columns come first, followed by the
# scaled chat-based columns. The column order is the same for both splits.
chat_based_scaled_and_tfidf_training = sp.sparse.hstack(
    [training_sparse_tfidf, training_sparse_chat_based_scaled])

chat_based_scaled_and_tfidf_test = sp.sparse.hstack(
    [testing_sparse_tfidf, test_sparse_chat_based_scaled])

In [None]:
# Five XGBoost classifiers with different hyper-parameters, all trained on the
# combined TF-IDF + chat-based matrix. Their predictions are the inputs of the
# vote computed in a later cell.
xgb_all_1 = xgb.XGBClassifier(max_depth=13, n_estimators=300, learning_rate=0.007, scale_pos_weight=5, gamma=7)
xgb_all_2 = xgb.XGBClassifier(max_depth=25, n_estimators=300)
xgb_all_3 = xgb.XGBClassifier(max_depth=5, n_estimators=220, learning_rate=0.007, scale_pos_weight=20, gamma=2)
xgb_all_4 = xgb.XGBClassifier(max_depth=3, n_estimators=400)
xgb_all_5 = xgb.XGBClassifier(max_depth=7, n_estimators=100)

# Flatten the one-column label frame to a 1-D vector: estimators expect y of
# shape (n_samples,), and this matches how the later cells pass the labels.
xgb_all_labels = np.ravel(training[['is sexual predator']])

for xgb_all_model in (xgb_all_1, xgb_all_2, xgb_all_3, xgb_all_4, xgb_all_5):
    xgb_all_model.fit(chat_based_scaled_and_tfidf_training, xgb_all_labels)

# Test-set predictions, one vector per model (prediction0 belongs to
# xgb_all_1, ..., prediction4 to xgb_all_5).
prediction0 = xgb_all_1.predict(chat_based_scaled_and_tfidf_test)
prediction1 = xgb_all_2.predict(chat_based_scaled_and_tfidf_test)
prediction2 = xgb_all_3.predict(chat_based_scaled_and_tfidf_test)
prediction3 = xgb_all_4.predict(chat_based_scaled_and_tfidf_test)
prediction4 = xgb_all_5.predict(chat_based_scaled_and_tfidf_test)

In [None]:
def bagging(vote_number, prediction_list):
    """Combine several binary prediction vectors by counting positive votes.

    Despite the name, no bootstrap resampling happens here: the function is a
    threshold vote over predictions that were already computed.

    Parameters
    ----------
    vote_number : int
        Minimum number of predictors that must output 1 for a sample to be
        labelled 1 in the result.
    prediction_list : sequence of array-like
        One prediction vector per model, all of the same length. Only entries
        equal to 1 count as a vote; any other value counts as no vote.

    Returns
    -------
    numpy.ndarray
        1-D integer array holding 1 where at least ``vote_number`` models
        voted 1 and 0 elsewhere. Empty when ``prediction_list`` is empty.

    Raises
    ------
    ValueError
        If the prediction vectors do not all have the same length.
    """
    # Nothing to vote on: return an empty result instead of failing on
    # prediction_list[0].
    if len(prediction_list) == 0:
        return np.array([], dtype=int)

    # Flatten every vector so column vectors of shape (n, 1) are accepted too.
    flat_predictions = [np.ravel(prediction) for prediction in prediction_list]

    # Misaligned vectors would make the vote meaningless, so reject them
    # explicitly rather than truncating or failing deep inside the loop.
    expected_length = len(flat_predictions[0])
    for position, flat_prediction in enumerate(flat_predictions):
        if len(flat_prediction) != expected_length:
            raise ValueError(
                'prediction %d has length %d, expected %d'
                % (position, len(flat_prediction), expected_length))

    # Rows are models, columns are samples; summing the boolean matrix along
    # axis 0 yields the number of positive votes per sample.
    positive_votes = np.sum(np.vstack(flat_predictions) == 1, axis=0)
    return (positive_votes >= vote_number).astype(int)

In [None]:
# Strict vote over the five XGBoost models: a sample is labelled 1 only when
# at least 4 of the 5 models predict 1.
xgb_all_prediction_bagged = bagging(4, [prediction0, prediction1, prediction2, prediction3, prediction4])

# Flatten the one-column label frame once so every metric gets a 1-D target.
xgb_bagged_truth = np.ravel(test[['is sexual predator']])

# `beta` is passed by keyword because recent scikit-learn releases no longer
# accept it positionally; the keyword form also works on older releases.
xgb_bagged_scores = [
    ('Accuracy: ', met.accuracy_score(xgb_bagged_truth, xgb_all_prediction_bagged)),
    ('Precision: ', met.precision_score(xgb_bagged_truth, xgb_all_prediction_bagged)),
    ('Recall:', met.recall_score(xgb_bagged_truth, xgb_all_prediction_bagged)),
    ('F1:', met.fbeta_score(xgb_bagged_truth, xgb_all_prediction_bagged, beta=1)),
    ('F0.5:', met.fbeta_score(xgb_bagged_truth, xgb_all_prediction_bagged, beta=0.5)),
]

# A single pre-formatted string is handed to print so the output text is the
# same under the Python 2 print statement and the Python 3 print function.
for metric_label, metric_value in xgb_bagged_scores:
    print('%s %s' % (metric_label, metric_value))

In [None]:
# Label vector shared by every model fitted in this cell.
predator_labels = np.ravel(training[['is sexual predator']])

# Random forest. Despite the variable name it is trained on the combined
# TF-IDF + chat-based matrix, not on the chat-based features alone.
rf_chat_based = RandomForestClassifier(
    n_estimators=230, n_jobs=8, bootstrap=False,
).fit(chat_based_scaled_and_tfidf_training, predator_labels)

# XGBoost on the combined TF-IDF + chat-based matrix.
xgb_all = xgb.XGBClassifier(
    max_depth=14,
    n_estimators=300,
    learning_rate=0.2,
    scale_pos_weight=5,
    gamma=4,
    min_child_weight=5,
    subsample=1,
)
xgb_all.fit(chat_based_scaled_and_tfidf_training, predator_labels)

# XGBoost on the scaled chat-based features only.
xgb_chat_based = xgb.XGBClassifier(
    max_depth=25,
    n_estimators=300,
    learning_rate=0.007,
    scale_pos_weight=5,
    gamma=7,
    objective='binary:logistic',
)
xgb_chat_based.fit(training_sparse_chat_based_scaled, predator_labels)

# Linear SVM on the TF-IDF features only; the solver is capped at 100 iterations.
svm_tfidf = svm.SVC(C=0.35, kernel='linear', max_iter=100)
svm_tfidf.fit(training_sparse_tfidf, predator_labels)

In [None]:
# Test-set predictions of the individually trained models. Each model is fed
# the test matrix that matches the feature set it was fitted on:
#   - rf_chat_based and xgb_all: combined TF-IDF + chat-based matrix
#   - xgb_chat_based: scaled chat-based features only
#   - svm_tfidf: TF-IDF features only
rf_chat_based_predicition = rf_chat_based.predict(chat_based_scaled_and_tfidf_test)
xgb_all_prediction = xgb_all.predict(chat_based_scaled_and_tfidf_test)
xgb_chat_based_prediction = xgb_chat_based.predict(test_sparse_chat_based_scaled)
svm_tfidf_predicition  = svm_tfidf.predict(testing_sparse_tfidf)

In [None]:
# Final ensemble: a sample is labelled 1 when at least 2 of these 3 prediction
# vectors say 1 -- the voted XGBoost predictions, the TF-IDF SVM and the single
# XGBoost model trained on all features.
ensamble_prediction = bagging(2, [xgb_all_prediction_bagged, svm_tfidf_predicition, xgb_all_prediction])

In [None]:
# Evaluate the final ensemble vote against the test labels.
# Flatten the one-column label frame once so every metric gets a 1-D target.
ensamble_truth = np.ravel(test[['is sexual predator']])

# `beta` is passed by keyword because recent scikit-learn releases no longer
# accept it positionally; the keyword form also works on older releases.
ensamble_scores = [
    ('Accuracy: ', met.accuracy_score(ensamble_truth, ensamble_prediction)),
    ('Precision: ', met.precision_score(ensamble_truth, ensamble_prediction)),
    ('Recall:', met.recall_score(ensamble_truth, ensamble_prediction)),
    ('F1:', met.fbeta_score(ensamble_truth, ensamble_prediction, beta=1)),
    ('F0.5:', met.fbeta_score(ensamble_truth, ensamble_prediction, beta=0.5)),
]

# A single pre-formatted string is handed to print so the output text is the
# same under the Python 2 print statement and the Python 3 print function.
for metric_label, metric_value in ensamble_scores:
    print('%s %s' % (metric_label, metric_value))

In [None]:
#data = {'p1':svm_tfidf_predicition, 'p2':rf_chat_based_predicition, 'p3':xgb_all_prediction_bagged,
#        'p4':xgb_all_prediction, 'p5':xgb_chat_based_prediction}
#s = pd.DataFrame(data)

#s.head()

In [None]:
# Voting ensemble of XGBoost, a linear SVM and a random forest, trained on the
# five chat-based feature columns. No `voting` argument is given, so
# scikit-learn's default voting mode applies; all three members weigh equally.
clf1 = xgb.XGBClassifier(max_depth=14, n_estimators=300, learning_rate=0.2, scale_pos_weight=5, gamma=4,
                          min_child_weight=5, subsample=1)
clf2 = svm.SVC(C=0.35,kernel='linear',max_iter=100)
clf3 = RandomForestClassifier(n_estimators=230, n_jobs=8, bootstrap=False)

eclf = VotingClassifier(estimators=[('xgb', clf1), ('svm', clf2), ('rf', clf3)],
                        weights=[1, 1, 1])

features = ['number of conversation', 'percent of conversations started by the author', 'number of messages sent',
            'average percent of lines in conversation', 'number of characters sent by the author']

# The base estimators are not fitted on their own: VotingClassifier.fit trains
# its own clones of them, so fitting clf1..clf3 separately would only cost
# time without influencing eclf or its predictions.
voting_labels = np.ravel(training[['is sexual predator']])
eclf = eclf.fit(training[features], voting_labels)

eclf_prediction = eclf.predict(test[features])

# Flatten the one-column label frame once so every metric gets a 1-D target.
voting_truth = np.ravel(test[['is sexual predator']])

# `beta` is passed by keyword because recent scikit-learn releases no longer
# accept it positionally; the keyword form also works on older releases.
voting_scores = [
    ('Accuracy: ', met.accuracy_score(voting_truth, eclf_prediction)),
    ('Precision: ', met.precision_score(voting_truth, eclf_prediction)),
    ('Recall:', met.recall_score(voting_truth, eclf_prediction)),
    ('F1:', met.fbeta_score(voting_truth, eclf_prediction, beta=1)),
    ('F0.5:', met.fbeta_score(voting_truth, eclf_prediction, beta=0.5)),
]

# A single pre-formatted string is handed to print so the output text is the
# same under the Python 2 print statement and the Python 3 print function.
for metric_label, metric_value in voting_scores:
    print('%s %s' % (metric_label, metric_value))

In [None]:
# Soft-voting ensemble of six classifiers trained on the five chat-based
# feature columns. The decision tree and the RBF SVM get weight 2, the other
# members weight 1.
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
clf4 = DecisionTreeClassifier(max_depth=4)
clf5 = KNeighborsClassifier(n_neighbors=7)
# probability=True is set so the SVM can supply class probabilities, which
# soft voting averages.
clf6 = SVC(kernel='rbf', probability=True)

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gn', clf3),
        ('dt', clf4), ('knn', clf5), ('svc', clf6)], voting='soft', weights=[1,1,1,2,1,2])

features = ['number of conversation', 'percent of conversations started by the author', 'number of messages sent',
            'average percent of lines in conversation', 'number of characters sent by the author']

# The base estimators are not fitted on their own: VotingClassifier.fit trains
# its own clones of them, so fitting clf1..clf6 separately would only cost
# time without influencing eclf or its predictions.
soft_voting_labels = np.ravel(training[['is sexual predator']])
eclf = eclf.fit(training[features], soft_voting_labels)
eclf_prediction = eclf.predict(test[features])

# Flatten the one-column label frame once so every metric gets a 1-D target.
soft_voting_truth = np.ravel(test[['is sexual predator']])

# `beta` is passed by keyword because recent scikit-learn releases no longer
# accept it positionally; the keyword form also works on older releases.
soft_voting_scores = [
    ('Accuracy: ', met.accuracy_score(soft_voting_truth, eclf_prediction)),
    ('Precision: ', met.precision_score(soft_voting_truth, eclf_prediction)),
    ('Recall:', met.recall_score(soft_voting_truth, eclf_prediction)),
    ('F1:', met.fbeta_score(soft_voting_truth, eclf_prediction, beta=1)),
]

# A single pre-formatted string is handed to print so the output text is the
# same under the Python 2 print statement and the Python 3 print function.
for metric_label, metric_value in soft_voting_scores:
    print('%s %s' % (metric_label, metric_value))

In [None]:
# Grid search over the voting ensemble: the keys address the 'lr' (logistic
# regression) and 'rf' (random forest) members of eclf via the
# <estimator name>__<parameter> convention. 5-fold cross-validation.
params = {'lr__C': [1.0, 100.0], 'rf__n_estimators': [20, 200],}
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)

# Search on this notebook's own training data (chat-based feature columns and
# the predator label). The previous call referenced `iris`, which is never
# defined in this notebook and raised a NameError.
grid = grid.fit(training[features], np.ravel(training[['is sexual predator']]))