# Обучаем первые классификаторы в sklearn

### Данные


По данным характеристикам молекулы требуется определить, будет ли дан биологический ответ (biological response).

Для демонстрации используется обучающая выборка из исходных данных bioresponse.csv, файл с данными прилагается.

### Возьмём данные с семинара

In [1]:
import pandas as pd

# Read the seminar's bioresponse table; the first CSV row supplies the column names.
bioresponse = pd.read_csv(
    '../seminar01/bioresponse.csv',
    sep=',',
    header=0,
)

In [2]:
# Print the (rows, columns) shape, then let the notebook render the first five rows.
table_shape = bioresponse.shape
print(table_shape)
bioresponse.head(n=5)

(3751, 1777)


Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Target vector: the values of the Activity column.
y = bioresponse['Activity'].values

In [4]:
# Feature matrix: every column of the table except the first one.
feature_columns = bioresponse.columns[1:]
X = bioresponse.loc[:, feature_columns]

## Вспомним разные модели с семинара

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.model_selection import cross_val_score

In [6]:
# Baseline classifiers to compare; every model keeps its default settings
# except the two ensembles, whose n_estimators is set to 100 explicitly.
models = [
    LogisticRegression(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    LinearSVC(),
    RandomForestClassifier(n_estimators=100), 
    GradientBoostingClassifier(n_estimators=100)
]

# For each model, print it together with its mean score over 5-fold
# cross-validation on (X, y). The IPython %time magic reports how long the
# whole statement took; the bare print() adds a blank separator line.
for model in models:
    %time print(model, '\nAccuracy:', cross_val_score(model, X, y, cv=5).mean())
    print()

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) 
Accuracy: 0.7518013365357094
CPU times: user 2.92 s, sys: 2.12 s, total: 5.05 s
Wall time: 2.4 s

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform') 
Accuracy: 0.7480751086372301
CPU times: user 33.3 s, sys: 698 ms, total: 34 s
Wall time: 33.4 s

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best') 
Accuracy: 0.72380450572

# Задание

Используя класс для перебора параметров по сетке [GridSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html), напишите перебор параметров LogisticRegression:
- Параметр C — степени числа 10 с показателями от -5 до +5 включительно (то есть 11 значений: 1e-5, 1e-4, ..., 1e+5)
- Параметр max_iter от 100 до 500 включительно с шагом 100
- Параметр penalty либо 'l1', либо 'l2'
- Параметр random_state обязательно 42
- Параметр fit_intercept либо False, либо True


In [5]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [11]:
# Hyperparameter grid required by the assignment:
#   C             - powers of ten from 1e-5 to 1e+5 (11 values)
#   max_iter      - 100, 200, ..., 500
#   penalty       - L1 or L2 regularisation
#   random_state  - fixed to 42
#   fit_intercept - without and with the intercept term
# 11 * 5 * 2 * 1 * 2 = 220 candidate combinations in total.
param_grid = {
    'C': np.logspace(-5, 5, 11),
    'max_iter': np.arange(100, 501, 100),
    'penalty': ['l1', 'l2'],
    'random_state': [42],
    'fit_intercept': [False, True],
}

# The solver is pinned to 'liblinear' because the grid contains penalty='l1'.
# The library's default solver depends on the scikit-learn version, and the
# newer default ('lbfgs') does not support L1, which would make half of the
# candidates fail. 'liblinear' supports both penalties.
grid_search = GridSearchCV(
    LogisticRegression(solver='liblinear'),
    param_grid,
    verbose=2,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X, y)

# Best mean cross-validated score over the grid, rounded to 4 decimal places.
print(round(grid_search.best_score_, 4))

Fitting 5 folds for each of 220 candidates, totalling 1100 fits
[CV] C=1e-05, fit_intercept=False, max_iter=100, penalty=l1, random_state=42 
[CV] C=1e-05, fit_intercept=False, max_iter=100, penalty=l1, random_state=42 
[CV] C=1e-05, fit_intercept=False, max_iter=100, penalty=l1, random_state=42 
[CV]  C=1e-05, fit_intercept=False, max_iter=100, penalty=l1, random_state=42, total=   0.2s
[CV]  C=1e-05, fit_intercept=False, max_iter=100, penalty=l1, random_state=42, total=   0.2s
[CV] C=1e-05, fit_intercept=False, max_iter=100, penalty=l1, random_state=42 
[CV] C=1e-05, fit_intercept=False, max_iter=100, penalty=l1, random_state=42 
[CV]  C=1e-05, fit_intercept=False, max_iter=100, penalty=l1, random_state=42, total=   0.3s
[CV] C=1e-05, fit_intercept=False, max_iter=100, penalty=l2, random_state=42 
[CV]  C=1e-05, fit_intercept=False, max_iter=100, penalty=l1, random_state=42, total=   0.2s
[CV]  C=1e-05, fit_intercept=False, max_iter=100, penalty=l1, random_state=42, total=   0.2s
[CV

[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.6s


[CV]  C=1e-05, fit_intercept=False, max_iter=300, penalty=l2, random_state=42, total=   0.2s
[CV] C=1e-05, fit_intercept=False, max_iter=300, penalty=l2, random_state=42 
[CV] C=1e-05, fit_intercept=False, max_iter=300, penalty=l2, random_state=42 
[CV]  C=1e-05, fit_intercept=False, max_iter=300, penalty=l2, random_state=42, total=   0.3s
[CV]  C=1e-05, fit_intercept=False, max_iter=300, penalty=l2, random_state=42, total=   0.2s
[CV] C=1e-05, fit_intercept=False, max_iter=400, penalty=l1, random_state=42 
[CV] C=1e-05, fit_intercept=False, max_iter=400, penalty=l1, random_state=42 
[CV] C=1e-05, fit_intercept=False, max_iter=400, penalty=l1, random_state=42 
[CV]  C=1e-05, fit_intercept=False, max_iter=300, penalty=l2, random_state=42, total=   0.3s
[CV]  C=1e-05, fit_intercept=False, max_iter=400, penalty=l1, random_state=42, total=   0.2s
[CV] C=1e-05, fit_intercept=False, max_iter=400, penalty=l1, random_state=42 
[CV]  C=1e-05, fit_intercept=False, max_iter=400, penalty=l1, rando

[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   18.9s


[CV]  C=0.0001, fit_intercept=False, max_iter=500, penalty=l2, random_state=42, total=   0.3s
[CV] C=0.0001, fit_intercept=True, max_iter=100, penalty=l1, random_state=42 
[CV]  C=0.0001, fit_intercept=False, max_iter=500, penalty=l2, random_state=42, total=   0.2s
[CV] C=0.0001, fit_intercept=True, max_iter=100, penalty=l1, random_state=42 
[CV]  C=0.0001, fit_intercept=False, max_iter=500, penalty=l2, random_state=42, total=   0.3s
[CV]  C=0.0001, fit_intercept=True, max_iter=100, penalty=l1, random_state=42, total=   0.2s
[CV] C=0.0001, fit_intercept=True, max_iter=100, penalty=l1, random_state=42 
[CV]  C=0.0001, fit_intercept=True, max_iter=100, penalty=l1, random_state=42, total=   0.2s
[CV] C=0.0001, fit_intercept=True, max_iter=100, penalty=l1, random_state=42 
[CV] C=0.0001, fit_intercept=True, max_iter=100, penalty=l1, random_state=42 
[CV]  C=0.0001, fit_intercept=True, max_iter=100, penalty=l1, random_state=42, total=   0.2s
[CV] C=0.0001, fit_intercept=True, max_iter=100, 

[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   45.9s


[CV]  C=0.01, fit_intercept=True, max_iter=100, penalty=l1, random_state=42, total=   0.5s
[CV] C=0.01, fit_intercept=True, max_iter=100, penalty=l2, random_state=42 
[CV]  C=0.01, fit_intercept=True, max_iter=100, penalty=l1, random_state=42, total=   0.5s
[CV]  C=0.01, fit_intercept=False, max_iter=500, penalty=l2, random_state=42, total=   0.9s
[CV]  C=0.01, fit_intercept=True, max_iter=100, penalty=l1, random_state=42, total=   0.4s
[CV] C=0.01, fit_intercept=True, max_iter=100, penalty=l2, random_state=42 
[CV]  C=0.01, fit_intercept=True, max_iter=100, penalty=l1, random_state=42, total=   0.5s
[CV] C=0.01, fit_intercept=True, max_iter=100, penalty=l2, random_state=42 
[CV] C=0.01, fit_intercept=True, max_iter=100, penalty=l2, random_state=42 
[CV]  C=0.01, fit_intercept=True, max_iter=100, penalty=l2, random_state=42, total=   0.5s
[CV] C=0.01, fit_intercept=True, max_iter=100, penalty=l2, random_state=42 
[CV] C=0.01, fit_intercept=True, max_iter=200, penalty=l1, random_state=4

[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  2.5min


[CV]  C=10.0, fit_intercept=False, max_iter=400, penalty=l2, random_state=42, total=   4.2s
[CV] C=10.0, fit_intercept=False, max_iter=500, penalty=l1, random_state=42 
[CV]  C=10.0, fit_intercept=False, max_iter=400, penalty=l2, random_state=42, total=   3.8s
[CV] C=10.0, fit_intercept=False, max_iter=500, penalty=l1, random_state=42 
[CV]  C=10.0, fit_intercept=False, max_iter=400, penalty=l1, random_state=42, total=  15.8s
[CV] C=10.0, fit_intercept=False, max_iter=500, penalty=l1, random_state=42 
[CV]  C=10.0, fit_intercept=False, max_iter=400, penalty=l1, random_state=42, total=  12.8s
[CV] C=10.0, fit_intercept=False, max_iter=500, penalty=l1, random_state=42 
[CV]  C=10.0, fit_intercept=False, max_iter=400, penalty=l2, random_state=42, total=   5.2s
[CV] C=10.0, fit_intercept=False, max_iter=500, penalty=l1, random_state=42 
[CV]  C=10.0, fit_intercept=False, max_iter=400, penalty=l1, random_state=42, total=  17.1s
[CV] C=10.0, fit_intercept=False, max_iter=500, penalty=l2, ran

[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 311.0min


[CV]  C=100000.0, fit_intercept=False, max_iter=100, penalty=l1, random_state=42, total= 7.3min
[CV] C=100000.0, fit_intercept=False, max_iter=100, penalty=l2, random_state=42 
[CV]  C=100000.0, fit_intercept=False, max_iter=100, penalty=l2, random_state=42, total=  41.7s
[CV] C=100000.0, fit_intercept=False, max_iter=100, penalty=l2, random_state=42 
[CV]  C=100000.0, fit_intercept=False, max_iter=100, penalty=l2, random_state=42, total=  37.4s
[CV] C=100000.0, fit_intercept=False, max_iter=100, penalty=l2, random_state=42 
[CV]  C=100000.0, fit_intercept=False, max_iter=100, penalty=l2, random_state=42, total=  31.3s
[CV] C=100000.0, fit_intercept=False, max_iter=100, penalty=l2, random_state=42 
[CV]  C=100000.0, fit_intercept=False, max_iter=100, penalty=l2, random_state=42, total=  51.4s
[CV] C=100000.0, fit_intercept=False, max_iter=100, penalty=l2, random_state=42 
[CV]  C=100000.0, fit_intercept=False, max_iter=100, penalty=l2, random_state=42, total=  34.8s
[CV] C=100000.0, fi

[Parallel(n_jobs=-1)]: Done 1100 out of 1100 | elapsed: 478.5min finished


0.7627


Полученное число введите в форму для ответов

In [None]:
# NOTE(review): unexecuted scratch cell. Unlike the search above, this call
# passes no estimator or parameter grid, so it looks like a leftover
# placeholder — confirm whether it should be filled in or removed.
GridSearchCV()