In [41]:
import sys
import numpy as np
import scipy as sp
import pandas as pd
from sklearn import svm
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.cross_validation import train_test_split
sys.path.insert(0, '../../notebooks/libs/')
import FeatureExtraction as FE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

In [42]:
import sys
sys.path.insert(0, '../libs/')
%matplotlib inline

import FeatureExtraction
from lxml import etree

# PAN12 sexual predator identification corpus: training and test conversations (XML).
training_xml = '../../dataset/training/pan12-sexual-predator-identification-training-corpus-2012-05-01.xml'
test_xml = '../../dataset/test/pan12-sexual-predator-identification-test-corpus-2012-05-17.xml'

# Text file stored next to the training corpus; judging by its name it lists the
# predator author ids — TODO confirm the exact format.
sexual_predator_ids_file = '../../dataset/training/pan12-sexual-predator-identification-training-corpus-predators-2012-05-01.txt'

# Precomputed chat-based feature tables, one per split; loaded with pd.read_csv further down.
chat_based_features_csv_train='../../csv/chat_based_features_training.csv'
chat_based_features_csv_test='../../csv/chat_based_features_test.csv'

In [141]:
#tree_test=etree.parse(test_xml)
#silent_author_ids_test=FE.silent_author_ids(tree_test)
# Message-count cut-off shared by the notebook: it is passed to FE.prepare_for_tf_idf
# and used to filter the chat-based feature frames on 'number of messages sent'
# (rows below the threshold are dropped).
minimal_number_of_messages_treshold=3

In [3]:
# Build the training documents that are later fed to the TF-IDF vectorizer.
# NOTE(review): the meaning of the two positional booleans (False, True) is defined in
# FeatureExtraction.prepare_for_tf_idf, which is not visible here — confirm before changing them.
documents_training=FE.prepare_for_tf_idf(training_xml,False,True,minimal_number_of_messages_treshold)


In [5]:
# Same preparation as for the training corpus, applied to the test XML with identical arguments.
# Note the singular name (document_testing) versus documents_training; later cells rely on this spelling.
document_testing=FE.prepare_for_tf_idf(test_xml,False,True,minimal_number_of_messages_treshold)

In [6]:
# Sanity check: number of documents produced for the training and test splits.
print(len(documents_training))
print(len(document_testing))

22063
47458


In [148]:
# TF-IDF over unigrams and bigrams. The vocabulary is fitted on the training documents only
# and then re-used to transform the test documents.
tfidf=TfidfVectorizer(stop_words='english',min_df=3,max_features=5500,ngram_range=(1,2))
matrix_training=tfidf.fit_transform(documents_training)
matrix_testing=tfidf.transform(document_testing)

# NOTE(review): the recorded output of this cell reports 5700 columns although max_features=5500,
# so the stored output appears to come from an earlier configuration — re-run to refresh it.
print(matrix_training.shape)
# NOTE(review): max_iter=100 caps the solver's iterations, so the fit may stop before
# converging — confirm this limit is intended.
classifier=svm.SVC(C=0.35,kernel='linear',max_iter=100)
# NOTE(review): y_train, y_test and write_statistics are defined in cells further down the
# notebook, so this cell fails on a fresh kernel unless those cells are run first.
classifier.fit(matrix_training,y_train)
predicted_test=classifier.predict(matrix_testing)

write_statistics(y_test,predicted_test)

(22063, 5700)

test statistics
('acc', 0.99686038181128578)
('rec', 0.41666666666666669)
('prec', 0.91743119266055051)
('f1', 0.57306590257879664)
('f0.5', 0.73964497041420119)


In [143]:
# Load the precomputed chat-based feature tables. pd.read_csv already returns a
# DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
train_frame=pd.read_csv(chat_based_features_csv_train)
test_frame=pd.read_csv(chat_based_features_csv_test)


In [144]:

# Drop authors who sent fewer messages than the shared threshold, then report
# how many rows remain in each split.
train_frame=train_frame.loc[
    train_frame['number of messages sent']>=minimal_number_of_messages_treshold
]
test_frame=test_frame.loc[
    test_frame['number of messages sent']>=minimal_number_of_messages_treshold
]
print(train_frame.shape[0])
print(test_frame.shape[0])

39763
87206


In [36]:
# Names of the chat-based feature columns; only referenced by the commented-out
# label-based selection below.
features=['number of conversation', 'percent of conversations started by the author', 'number of messages sent', 'average percent of lines in conversation', 'number of characters sent by the author']
# Split each frame positionally: every column except the first and the last is a
# feature, and the last column is the label.
# NOTE(review): presumably the first column is the author id — confirm against the CSV header.
# .iloc replaces the removed .ix indexer and the frame[[-1]] positional fallback,
# neither of which works on current pandas.
#X_train_chat_based=train_frame.loc[:,features]
X_train_chat_based=train_frame.iloc[:,1:-1]
y_train=train_frame.iloc[:,-1].values
#X_test_chat_based=test_frame.loc[:,features]
X_test_chat_based=test_frame.iloc[:,1:-1]
y_test=test_frame.iloc[:,-1].values

In [None]:
# Concatenate the chat-based features with the TF-IDF matrix column-wise, giving one
# combined design matrix per split.
# NOTE(review): hstack needs the same number of rows on both sides. The recorded outputs
# earlier in the notebook show 22063 training documents but 39763 filtered feature rows
# (47458 vs 87206 for test) — confirm the two sources are row-aligned before running this.
X_train = sp.sparse.hstack((X_train_chat_based, matrix_training))
X_test = sp.sparse.hstack((X_test_chat_based, matrix_testing))
print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)

In [None]:

# Scale the combined matrix without centering (with_mean=False); X_train comes from
# sp.sparse.hstack, and presumably centering is skipped to keep it sparse — TODO confirm.
# The scaler is fitted on the training split only and re-used for the test split.
# X_train / X_test are overwritten, so re-running this cell rescales already-scaled data.
scaler=StandardScaler(with_mean=False)
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)


In [None]:

# Candidate classifiers for the combined feature matrix.
# NOTE(review): neither svmc nor lrc is fitted anywhere in the visible part of the notebook.
svmc=svm.SVC(kernel='linear',max_iter=100)
lrc=linear_model.LogisticRegression(n_jobs=8,max_iter=100000,penalty='l2')




In [None]:
#scaler=StandardScaler(with_mean=False)
#matrix_training=scaler.fit_transform(matrix_training)
#matrix_testing=scaler.transform(matrix_testing)

In [114]:
#linearsvm=svm.LinearSVC(max_iter=1000)
# this one, on TF-IDF features alone, gives an F-score of 59 with the TF-IDF built by TfidfVectorizer(stop_words='english',min_df=3,max_features=3500)
# C=0.35 for the filtered 1-grams


(22063, 5100)


In [115]:
# Show the entries predicted as class 1, then check that the prediction and label
# vectors have the same shape before scoring.
# predicted_test comes from the TF-IDF classifier cell; write_statistics is defined
# in a cell further down, which must have been run first.
print(predicted_test[predicted_test==1])
print(predicted_test.shape)
print(y_test.shape)

write_statistics(y_test,predicted_test)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1]
(47458L,)
(47458L,)

test statistics
('acc', 0.9969868093893548)
('rec', 0.45000000000000001)
('prec', 0.90756302521008403)
('f1', 0.60167130919220047)
('f0.5', 0.75418994413407814)


In [39]:
#print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)
def write_statistics(y_test,predicted_test):
    """Print accuracy, recall, precision, F1 and F0.5 for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    predicted_test : array-like
        Predicted labels, aligned element-wise with ``y_test``.

    Returns
    -------
    None
        The scores are only printed, under a "test statistics" header.
    """
    print("\ntest statistics")
    print('acc',accuracy_score(y_test,predicted_test))
    print('rec',recall_score(y_test,predicted_test))
    print('prec',precision_score(y_test,predicted_test))
    print('f1',f1_score(y_test,predicted_test))
    # beta is passed by keyword: current scikit-learn declares it keyword-only,
    # while older releases accept the keyword form as well.
    print('f0.5',fbeta_score(y_test,predicted_test,beta=0.5))

In [16]:

from sklearn.metrics import zero_one_loss
def grid_search(model, X_train, X_validate, y_train, y_validate, crange, grange, error_surface=False):
    """Exhaustively search ``C`` and ``gamma`` over power-of-two grids.

    For every pair ``(2**i, 2**j)`` with ``i`` running over ``crange`` and ``j``
    over ``grange`` (both ends inclusive, exponent step 1), the model is refitted
    on the training split and scored with the zero-one loss on the validation split.

    Parameters
    ----------
    model : object
        Estimator exposing settable ``C`` and ``gamma`` attributes plus ``fit`` and
        ``predict``. It is mutated in place and is left fitted with the last grid
        point tried, not with the best one.
    X_train, y_train
        Data the model is fitted on.
    X_validate, y_validate
        Data the validation error is computed on.
    crange, grange : pair
        ``(low, high)`` exponent bounds for ``C`` and ``gamma`` respectively.
    error_surface : bool, optional
        When true, the training error is recorded for every grid point as well and
        both error matrices are returned.

    Returns
    -------
    tuple
        ``(C, gamma)`` with the lowest validation error (ties resolve to the first
        minimum in row-major order), or ``((C, gamma), train_errors, valid_errors)``
        when ``error_surface`` is true. Error matrices are indexed ``[C, gamma]``.

    Raises
    ------
    ValueError
        If either range is empty (``low > high``).
    """
    def powers_of_two(bounds):
        # Inclusive on both ends; the exponent advances by one each step.
        values=[]
        exponent=bounds[0]
        while exponent<=bounds[1]:
            values.append(2**exponent)
            exponent+=1
        return values

    cs=powers_of_two(crange)
    gs=powers_of_two(grange)
    if not cs or not gs:
        # np.argmin would raise ValueError on an empty matrix anyway; fail early
        # with a message that names the actual problem.
        raise ValueError("crange and grange must each contain at least one exponent")

    matrica_valid=np.zeros((len(cs),len(gs)))
    matrica_train=np.zeros((len(cs),len(gs))) if error_surface else None
    for i,c in enumerate(cs):
        for j,g in enumerate(gs):
            model.C=c
            model.gamma=g
            model.fit(X_train,y_train)
            matrica_valid[i,j]=zero_one_loss(y_validate,model.predict(X_validate))
            if error_surface:
                matrica_train[i,j]=zero_one_loss(y_train,model.predict(X_train))

    # unravel_index lives in numpy; calling it unqualified raised a NameError.
    indices=np.unravel_index(np.argmin(matrica_valid), matrica_valid.shape)
    kaomin=matrica_valid[indices]
    print("argmin",indices,"vrij",kaomin)
    print("Pogreska validacije za optimalne parametre err=",matrica_valid.min())
    best=(cs[indices[0]],gs[indices[1]])
    if error_surface:
        return (best,matrica_train,matrica_valid)
    return best
    
# Inclusive base-2 exponent bounds handed to grid_search:
# C ranges over 2**-5 .. 2**15 and gamma over 2**-15 .. 2**3.
crange=(-5,15)
grange=(-15,3)


In [46]:
# Standardise the chat-based features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
# The X_*_chat_based names are overwritten here, so any cell that reads them
# sees scaled or unscaled values depending on execution order.
chat_scaler=StandardScaler()
X_train_chat_based=chat_scaler.fit_transform(X_train_chat_based)
X_test_chat_based=chat_scaler.transform(X_test_chat_based)

In [None]:
# Hold out half of the training data (fixed seed) as a validation set and grid-search
# C and gamma for a default svm.SVC() on the chat-based features.
# NOTE(review): train_test_split is imported from sklearn.cross_validation at the top of the
# notebook; newer scikit-learn releases provide it from sklearn.model_selection instead —
# update the import when upgrading.
X2_train, X2_validate, y2_train, y2_validate = train_test_split(X_train_chat_based,y_train,train_size=0.5,random_state=55)
C_opt,g_opt=grid_search(svm.SVC(),X2_train, X2_validate, y2_train, y2_validate,crange,grange)

print(C_opt,g_opt)

In [48]:
# RBF-kernel SVM trained on the chat-based features alone, evaluated on the test split.
# NOTE(review): C is hard-coded to 1 and gamma is left at its default; the C_opt / g_opt
# values computed by the grid-search cell are not used here.
# NOTE(review): max_iter=100 caps the solver's iterations, which may explain the very low
# recorded precision (about 0.006) — confirm whether the fit converged.
# This rebinds `classifier`, replacing the TF-IDF model assigned earlier in the notebook.
classifier=svm.SVC(C=1,kernel='rbf',max_iter=100)
classifier.fit(X_train_chat_based,y_train)
predicted_test_chat=classifier.predict(X_test_chat_based)
write_statistics(y_test,predicted_test_chat)


test statistics
('acc', 0.50954528214421169)
('rec', 0.55000000000000004)
('prec', 0.005665236051502146)
('f1', 0.011214953271028037)
('f0.5', 0.007063356164383563)


NameError: name 'argmin' is not defined