In [109]:
import sys
import numpy as np
import scipy as sp
import pandas as pd
from sklearn import svm
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.cross_validation import train_test_split
sys.path.insert(0, '../../notebooks/libs/')
import FeatureExtraction as FE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [10]:
import sys
sys.path.insert(0, '../libs/')
%matplotlib inline

import FeatureExtraction
from lxml import etree

# Corpus XML files (file names identify them as the PAN12 sexual-predator-
# identification training and test corpora). All paths are relative, so they
# resolve against the kernel's current working directory.
training_xml = '../../dataset/training/pan12-sexual-predator-identification-training-corpus-2012-05-01.xml'
test_xml = '../../dataset/test/pan12-sexual-predator-identification-test-corpus-2012-05-17.xml'

# Text file shipped alongside the training corpus; by its name it lists the
# predator ids. It is not referenced by any of the cells visible in this notebook.
sexual_predator_ids_file = '../../dataset/training/pan12-sexual-predator-identification-training-corpus-predators-2012-05-01.txt'

# CSV files with the chat-based features for the training and test splits.
# They are read with pandas further down; how they were produced is not shown here.
chat_based_features_csv_train='../../csv/chat_based_features_training.csv'
chat_based_features_csv_test='../../csv/chat_based_features_test.csv'

In [11]:
def write_statistics(y_test,predicted_test):
    """Print classification metrics for a set of predictions.

    Prints a "test statistics" header followed by accuracy, recall,
    precision, F1 and F0.5, each computed by the corresponding
    scikit-learn metric function with its default settings.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    predicted_test : array-like
        Predicted labels, in the same order as ``y_test``.

    Returns
    -------
    None
        The metrics are only printed, not returned.
    """
    print("\ntest statistics")
    print('acc',accuracy_score(y_test,predicted_test))
    print('rec',recall_score(y_test,predicted_test))
    print('prec',precision_score(y_test,predicted_test))
    print('f1',f1_score(y_test,predicted_test))
    # `beta` is passed by keyword: current scikit-learn declares it
    # keyword-only, so the positional form raises TypeError there, while the
    # keyword form is accepted by old and new releases alike.
    print('f0.5',fbeta_score(y_test,predicted_test,beta=0.5))

In [4]:
# Cut-off on the number of messages: it is passed to FE.prepare_for_tf_idf when
# building the TF-IDF documents and is used to keep only the feature-CSV rows
# whose 'number of messages sent' is >= this value.
# The name (spelled "treshold") is referenced by other cells, so it is kept as is.
minimal_number_of_messages_treshold=5

In [5]:
# Build the text inputs that are later fed to the TF-IDF vectorizer, one call
# for the training corpus XML and one for the test corpus XML.
# NOTE(review): the meaning of the positional False/True flags is defined in
# FeatureExtraction.prepare_for_tf_idf, which is not visible here - confirm.
# The last argument is the minimum-messages cut-off; presumably it applies the
# same filter as the one used on the feature CSV frames - verify, because the
# labels used for training are taken from those frames.
documents_training=FE.prepare_for_tf_idf(training_xml,False,True,minimal_number_of_messages_treshold)
document_testing=FE.prepare_for_tf_idf(test_xml,False,True,minimal_number_of_messages_treshold)

In [7]:
def _load_chat_based_split(csv_path, min_messages):
    """Load one chat-based feature CSV and split it into features and labels.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    min_messages : int
        Rows whose 'number of messages sent' value is below this are dropped.

    Returns
    -------
    frame : pandas.DataFrame
        The filtered frame with all of its columns.
    features : pandas.DataFrame
        Every column of ``frame`` except the first and the last one.
    labels : numpy.ndarray
        The last column of ``frame`` as a 1-D array.
    """
    # read_csv already returns a DataFrame, so no extra wrapping is needed.
    frame = pd.read_csv(csv_path)
    frame = frame[frame['number of messages sent'] >= min_messages]
    # Purely positional selection with .iloc: `.ix` has been removed from
    # pandas, and `frame[[-1]]` only worked through a positional fallback
    # that now raises KeyError because -1 is not a column label.
    features = frame.iloc[:, 1:-1]
    labels = np.ravel(frame.iloc[:, -1])
    return frame, features, labels


train_frame, X_train_chat_based, y_train = _load_chat_based_split(
    chat_based_features_csv_train, minimal_number_of_messages_treshold)
test_frame, X_test_chat_based, y_test = _load_chat_based_split(
    chat_based_features_csv_test, minimal_number_of_messages_treshold)

# Number of rows left in each split after the minimum-messages filter.
print(len(train_frame))
print(len(test_frame))

22063
47458


In [8]:
# TF-IDF vectorizer: English stop words are removed, terms occurring in fewer
# than 3 documents are dropped (min_df=3) and the vocabulary is capped at the
# 3500 most frequent terms (max_features=3500).
tfidf=TfidfVectorizer(stop_words='english',min_df=3,max_features=3500)
# The vocabulary and IDF weights are learned from the training documents only...
matrix_training=tfidf.fit_transform(documents_training)
# ...and then applied unchanged to the test documents.
matrix_testing=tfidf.transform(document_testing)

In [101]:
# Baseline: linear-kernel SVC trained on the TF-IDF matrix.
# max_iter=100 is a hard cap on solver iterations, so the solver may stop
# before converging. NOTE(review): the origin of C=0.7454 is not shown in this
# notebook - confirm where it came from.
classifier=svm.SVC(C=0.7454,kernel='linear',max_iter=100)
# y_train comes from the chat-based feature CSV, while matrix_training comes
# from the corpus XML; this assumes both have the same rows in the same order
# - TODO confirm.
classifier.fit(matrix_training,y_train)
predicted_test=classifier.predict(matrix_testing)
write_statistics(y_test,predicted_test)


test statistics
('acc', 0.9969868093893548)
('rec', 0.45833333333333331)
('prec', 0.89430894308943087)
('f1', 0.60606060606060608)
('f0.5', 0.75136612021857929)


In [133]:
# Candidate values for the SVC regularisation parameter C: 100 evenly spaced
# points from 0.5 to 1.5 (both ends included), in the {param_name: values}
# form that GridSearchCV expects when it searches the bare SVC.
Cs = {'C': list(np.linspace(0.5, 1.5, 100))}

# The same candidates keyed for a search over the StandardScaler + SVC
# pipeline. Parameters of a pipeline step are addressed as '<step>__<param>',
# and make_pipeline names the SVC step 'svc'. The previous {'svc': [Cs]} form
# would have replaced the whole step with a dict instead of tuning C.
dic_for_pipe = {'svc__C': list(Cs['C'])}

print(Cs)

{'C': [0.5, 0.51010101010101006, 0.52020202020202022, 0.53030303030303028, 0.54040404040404044, 0.5505050505050505, 0.56060606060606055, 0.57070707070707072, 0.58080808080808077, 0.59090909090909094, 0.60101010101010099, 0.61111111111111116, 0.62121212121212122, 0.63131313131313127, 0.64141414141414144, 0.65151515151515149, 0.66161616161616166, 0.67171717171717171, 0.68181818181818188, 0.69191919191919193, 0.70202020202020199, 0.71212121212121215, 0.72222222222222221, 0.73232323232323238, 0.74242424242424243, 0.7525252525252526, 0.76262626262626265, 0.77272727272727271, 0.78282828282828287, 0.79292929292929293, 0.80303030303030298, 0.81313131313131315, 0.82323232323232332, 0.83333333333333337, 0.84343434343434343, 0.85353535353535359, 0.86363636363636365, 0.8737373737373737, 0.88383838383838387, 0.89393939393939403, 0.90404040404040409, 0.91414141414141414, 0.92424242424242431, 0.93434343434343436, 0.94444444444444442, 0.95454545454545459, 0.96464646464646475, 0.97474747474747481, 0.98

In [134]:
# Linear-kernel SVC that serves as the estimator of the grid search; the
# solver is capped at 100 iterations and the random state is fixed.
linsvm = svm.SVC(
    kernel='linear',
    max_iter=100,
    random_state=5,
)

# Scaler + SVC pipeline. with_mean=False means the scaler does not centre the
# data. Both `est` and `clf` are bound to this one pipeline object. The
# pipeline is only printed below; the active grid search does not use it.
est = make_pipeline(StandardScaler(with_mean=False), linsvm)
clf = est

# Show the automatically generated step names of the pipeline.
print('koraci',est.steps)

# Disabled alternative that would search over the pipeline instead:
#grid_search=GridSearchCV(estimator=est,param_grid=dic_for_pipe,n_jobs=8)

# Active search: tunes C of the bare SVC over the candidates in Cs, using
# 8 parallel jobs.
grid_search = GridSearchCV(
    estimator=linsvm,
    param_grid=Cs,
    n_jobs=8,
)
grid_search

('koraci', [('standardscaler', StandardScaler(copy=True, with_mean=False, with_std=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=100, probability=False, random_state=5, shrinking=True,
  tol=0.001, verbose=False))])


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=100, probability=False, random_state=5, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=8,
       param_grid={'C': [0.5, 0.51010101010101006, 0.52020202020202022, 0.53030303030303028, 0.54040404040404044, 0.5505050505050505, 0.56060606060606055, 0.57070707070707072, 0.58080808080808077, 0.59090909090909094, 0.60101010101010099, 0.61111111111111116, 0.62121212121212122, 0.63131313131313127, 0.641...4949494949496, 1.4595959595959598, 1.4696969696969697, 1.4797979797979799, 1.4898989898989901, 1.5]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [None]:
# Run the search over every candidate C on the TF-IDF training matrix.
# NOTE(review): this cell has no execution count, so it had not been run when
# the notebook was saved.
grid_search.fit(matrix_training,y_train)
# The search was built with refit=True (see its repr in the previous output),
# so predict() uses the best parameter setting refitted on the training data.
pred=grid_search.predict(matrix_testing)
write_statistics(y_test,pred)

In [None]:
# Display the estimator selected by the search; this attribute only exists
# after grid_search.fit(...) has been run.
grid_search.best_estimator_