In [66]:
import re
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV, ShuffleSplit

%matplotlib inline

# Import Data

In [67]:
# Load the training table. `id` is dropped without mutating in place, so the
# frame bound to `data` is built in a single expression.
data = pd.read_csv('Desktop/ML/data/Task5.2/train.csv').drop('id', axis=1)

# Split features and target by column name rather than by position, so the
# split stays correct even if `label` is not the first column of the CSV.
X_train = np.asarray(data.drop('label', axis=1), dtype=np.double)
Y_train = np.asarray(data['label'], dtype=np.double)  # 1-D target vector
data.head()

Unnamed: 0,label,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,...,feat207,feat208,feat209,feat210,feat211,feat212,feat213,feat214,feat215,feat216
0,1,7,0,3,0,2,3,0,6,0,...,3,4,2,2,0,13,0,11,1,3
1,1,0,11,0,0,10,1,0,0,4,...,0,2,0,0,2,8,1,13,0,4
2,0,9,0,3,0,1,3,0,4,0,...,48,11,2,0,0,4,0,2,0,0
3,0,0,9,3,2,25,0,4,0,0,...,1,14,1,0,0,0,3,0,17,1
4,0,0,0,0,0,2,5,0,0,0,...,3,12,0,3,0,4,0,24,4,0


In [68]:
# Non-null count of every feature column per class label (shows class balance).
data.groupby(by='label').count()

Unnamed: 0_level_0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,feat10,...,feat207,feat208,feat209,feat210,feat211,feat212,feat213,feat214,feat215,feat216
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,327,327,327,327,327,327,327,327,327,327,...,327,327,327,327,327,327,327,327,327,327
1,573,573,573,573,573,573,573,573,573,573,...,573,573,573,573,573,573,573,573,573,573


# Compare and tune candidate classifiers

In [69]:
# Candidate classifiers to compare before any tuning.
estimators = [
    # Seeded because SGD shuffles the training data between epochs; without a
    # fixed random_state the reported score changes from run to run.
    SGDClassifier(max_iter=5000, tol=0.1, penalty='l1', alpha=0.1, random_state=33),
    SVC(kernel='rbf', C=0.1, gamma=2),
    LinearSVC(penalty='l2', dual=False),
    # The L1 penalty needs a solver that supports it. The default solver became
    # 'lbfgs' (L2 only) in scikit-learn 0.22, so 'liblinear' is named explicitly.
    LogisticRegression(penalty='l1', C=0.2, solver='liblinear'),
]

In [70]:
cv_strategy = ShuffleSplit(n_splits=10, test_size=0.2, random_state=33)
for estimator in estimators:
    %time scoring = cross_val_score(estimator, X_train, Y_train, scoring='roc_auc', cv=cv_strategy)
    print ('%s %.3lf' % (str(estimator).replace('(',' ').split()[0], scoring.mean()), '%.4lf' % scoring.std())

CPU times: user 438 ms, sys: 3.17 ms, total: 441 ms
Wall time: 439 ms
SGDClassifier 0.858 0.0260
CPU times: user 1.54 s, sys: 6.32 ms, total: 1.55 s
Wall time: 1.55 s
SVC 0.877 0.0171
CPU times: user 4.55 s, sys: 16.4 ms, total: 4.57 s
Wall time: 4.58 s
LinearSVC 0.913 0.0149
CPU times: user 887 ms, sys: 16.2 ms, total: 904 ms
Wall time: 899 ms
LogisticRegression 0.923 0.0189


In [71]:
# Hyper-parameter grid for L2-regularised logistic regression:
# C in 0.2 .. 0.8 (step 0.2, up to float rounding), max_iter in 40 .. 180 (step 20).
C_var = [0.2 * i for i in range(1, 5)]
max_iter_var = range(40, 200, 20)

# One unfitted estimator per (C, max_iter) pair, iterating C in the outer loop.
# Only the arguments that matter are passed: the remaining keyword arguments of
# the original call were library defaults, and `multi_class` is deprecated in
# recent scikit-learn releases (it has no effect on a binary problem anyway).
estimators = [
    LogisticRegression(penalty='l2', solver='liblinear', C=c, max_iter=m)
    for c in C_var
    for m in max_iter_var
]

In [73]:
# Cross-validate every grid candidate on 5 random 70/30 splits (default scorer).
cv_strategy = ShuffleSplit(n_splits=5, test_size=0.3, random_state=33)
fold_scores = [
    cross_val_score(candidate, X_train, Y_train, cv=cv_strategy)
    for candidate in estimators
]

# Parallel lists, aligned with `estimators` by position.
answer1 = [scores.mean() for scores in fold_scores]   # mean score per candidate
answer2 = [scores.std() for scores in fold_scores]    # score spread per candidate
ans_estimator = list(estimators)

In [74]:
# Best mean cross-validation score over the whole grid.
print(max(answer1))

0.888888888888889


In [77]:
# Among the candidates whose mean CV score exceeds 0.8, pick the one with the
# smallest standard deviation across splits (the most stable one).
indexes = [i for i, mean_score in enumerate(answer1) if mean_score > 0.8]

if indexes:
    # min() returns the first index on ties, the same choice a strict "<" scan makes.
    ind_min = min(indexes, key=lambda i: answer2[i])
    minimum = answer2[ind_min]
    print(ind_min)
    print(answer2[ind_min])
else:
    # No candidate cleared the threshold: say so instead of failing on answer2[None].
    ind_min = None
    minimum = 1
    print('No configuration has a mean score above 0.8')

8
0.022148024278112678


# Get final classifier

In [78]:
# Final model: L2-regularised logistic regression fitted on the full training set.
# NOTE(review): C=1.0 / max_iter=100 do not come from the grid evaluated above
# (C_var only spans 0.2-0.8) - confirm these values are intentional.
# Only the arguments that matter are passed; the rest of the original call were
# library defaults, and `multi_class` is deprecated in recent scikit-learn.
estimator = LogisticRegression(penalty='l2', solver='liblinear', C=1.0, max_iter=100)
estimator.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Predict on the test data

In [79]:
# Read the test table and keep every column except the first one as features.
test = pd.read_csv('Desktop/ML/data/Task5.2/test.csv')
X_test = np.asarray(test.iloc[:, 1:])

In [80]:
# Display the test feature matrix (NumPy abbreviates large arrays in the repr).
X_test

array([[ 4,  2,  2, ...,  6,  2,  3],
       [ 3,  3,  0, ..., 27,  2,  0],
       [ 0,  0,  0, ...,  5,  1,  1],
       ...,
       [ 1,  1,  0, ..., 36,  0,  2],
       [ 1,  3,  2, ..., 35,  1,  0],
       [ 0,  2,  3, ..., 15,  0,  0]])

In [81]:
# Predicted class label for every row of the test feature matrix.
test_predict = estimator.predict(X_test)

In [82]:
def submit(preds,
           sample_path="Desktop/ML/data/Task5.2/sample_submission.csv",
           out_path="Desktop/ML/data/Task5.2/submission4.csv"):
    """Write predictions into a copy of the sample submission file.

    Parameters
    ----------
    preds : array-like
        Predicted labels, one per row of the sample submission, in row order.
    sample_path : str, optional
        CSV template to read; its ``label`` column is overwritten (or added)
        with ``preds``. Defaults to the original hard-coded template path.
    out_path : str, optional
        Destination CSV, written without the DataFrame index. Defaults to the
        original hard-coded output path.
    """
    submission = pd.read_csv(sample_path)
    submission['label'] = preds
    submission.to_csv(out_path, index=False)

In [83]:
# Write the test-set predictions to the submission file.
submit(preds=test_predict)

In [84]:
# Sanity check: show the first lines of the submission file that was just written.
!head Desktop/ML/data/Task5.2/submission4.csv

id,label
1,1.0
2,1.0
3,0.0
4,1.0
5,1.0
6,1.0
7,1.0
8,1.0
9,1.0
