In [41]:
import numpy as np
import pandas as pd
import random
from sklearn.datasets.samples_generator import make_regression 
import pylab
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [42]:
data = pd.read_csv( 'adult.csv' )
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
age                48842 non-null int64
workclass          48842 non-null object
fnlwgt             48842 non-null int64
education          48842 non-null object
educational-num    48842 non-null int64
marital-status     48842 non-null object
occupation         48842 non-null object
relationship       48842 non-null object
race               48842 non-null object
gender             48842 non-null object
capital-gain       48842 non-null int64
capital-loss       48842 non-null int64
hours-per-week     48842 non-null int64
native-country     48842 non-null object
income             48842 non-null object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [44]:
data = data[data.occupation != "?"]

In [45]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K


In [46]:
selected_columns = data[ ['education', 'capital-gain', 'capital-loss'] ]

In [47]:
X = pd.get_dummies(selected_columns, columns = ['education'])
X.head()

Unnamed: 0,capital-gain,capital-loss,education_10th,education_11th,education_12th,education_1st-4th,education_5th-6th,education_7th-8th,education_9th,education_Assoc-acdm,education_Assoc-voc,education_Bachelors,education_Doctorate,education_HS-grad,education_Masters,education_Preschool,education_Prof-school,education_Some-college
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,7688,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [48]:
X.head()

Unnamed: 0,capital-gain,capital-loss,education_10th,education_11th,education_12th,education_1st-4th,education_5th-6th,education_7th-8th,education_9th,education_Assoc-acdm,education_Assoc-voc,education_Bachelors,education_Doctorate,education_HS-grad,education_Masters,education_Preschool,education_Prof-school,education_Some-college
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,7688,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [49]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [50]:
le.fit( data['income'] )

LabelEncoder()

In [51]:
le.classes_

array(['<=50K', '>50K'], dtype=object)

In [52]:
le.transform( [ '<=50K', '>50K', '<=50K' ] )

array([0, 1, 0])

In [53]:
y = pd.Series( data = le.transform( data['income'] ) )
y.head()

0    0
1    0
2    1
3    1
4    0
dtype: int64

In [54]:
model1 = LogisticRegression()

In [55]:
X_train_fin, X_val, y_train_fin, y_val = train_test_split(X, y, test_size = 0.9, random_state = 0)

In [56]:
model1.fit( X_train_fin, y_train_fin )


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Значение точности простой логистической регрессией

In [57]:
model1.score( X_val, y_val )

0.8001930967897659

In [58]:
cs = 10**np.linspace(-3,1,5)
cs

array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01])

In [59]:
grid = { 'penalty': ['l1','l2'], 'C': [0.001,0.01,0.1,1,10,100,1000] }#, 'max_features': features_num}
gridsearch = GridSearchCV(LogisticRegression(), grid, scoring='accuracy', cv=5)

In [60]:
gridsearch.fit(X_train_fin, y_train_fin)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [61]:
gridsearch.best_params_

{'C': 1, 'penalty': 'l1'}

In [62]:
sorted(gridsearch.grid_scores_, key = lambda x: -x.mean_validation_score)



[mean: 0.80252, std: 0.00875, params: {'C': 1, 'penalty': 'l1'},
 mean: 0.80252, std: 0.00845, params: {'C': 10, 'penalty': 'l1'},
 mean: 0.80252, std: 0.00845, params: {'C': 100, 'penalty': 'l1'},
 mean: 0.80252, std: 0.00845, params: {'C': 1000, 'penalty': 'l1'},
 mean: 0.80035, std: 0.00996, params: {'C': 10, 'penalty': 'l2'},
 mean: 0.80035, std: 0.00996, params: {'C': 1000, 'penalty': 'l2'},
 mean: 0.80013, std: 0.00993, params: {'C': 100, 'penalty': 'l2'},
 mean: 0.79818, std: 0.01023, params: {'C': 0.1, 'penalty': 'l2'},
 mean: 0.79818, std: 0.01071, params: {'C': 1, 'penalty': 'l2'},
 mean: 0.79796, std: 0.00902, params: {'C': 0.1, 'penalty': 'l1'},
 mean: 0.79535, std: 0.01333, params: {'C': 0.01, 'penalty': 'l1'},
 mean: 0.79492, std: 0.01335, params: {'C': 0.01, 'penalty': 'l2'},
 mean: 0.78775, std: 0.01439, params: {'C': 0.001, 'penalty': 'l2'},
 mean: 0.77667, std: 0.01050, params: {'C': 0.001, 'penalty': 'l1'}]

In [63]:
best_C = gridsearch.best_params_["C"]
best_penalty = gridsearch.best_params_["penalty"]

In [64]:
clf = LogisticRegression(C=best_C, penalty=best_penalty)

In [65]:
clf.fit(X_train_fin, y_train_fin)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [66]:
y_val_pred = clf.predict(X_val)

# Значение точности с гридсерчем

In [68]:
clf.score(X_val, y_val)

0.8059860004827419

In [70]:
from sklearn.tree import DecisionTreeClassifier

In [80]:
params = [ {'max_depth': list( range(1, 26) )} ]
gs = GridSearchCV(DecisionTreeClassifier(), param_grid = params, cv = 5, scoring = 'accuracy')
gs.fit( X_train_fin, y_train_fin )
best_depth = gs.best_params_['max_depth']

In [81]:
model_gs = DecisionTreeClassifier(max_depth=best_depth)

In [82]:
model_gs.fit(X_train_fin, y_train_fin)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=18,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## Конечное значение точности

In [85]:
model_gs.score(X_val, y_val)

0.8247646632874729