# Домашнее задание по теме "Оценка точности модели, переобучение, регуляризация"

#### Владимир Никифоров

Дана статистика пользователей adult.csv.

Получите значения AUC для различных моделей и их параметров.

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# turn off all warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
# set constants
RANDOM_STATE = 777
N_FOLDS = 5

In [4]:
# load csv
raw_df = pd.read_csv('../data/adult.csv')

In [5]:
# view file
raw_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [6]:
# view statistic
raw_df.describe()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [7]:
# split to numeric and categorical columns
num_cols = ['age','fnlwgt','educational-num','capital-gain','capital-loss','hours-per-week']
cat_cols = raw_df.drop(num_cols+['income'], axis=1).columns
print(cat_cols)

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'gender', 'native-country'],
      dtype='object')


In [8]:
# create dummies (one-hot-encoding) for categorical features
df = pd.get_dummies(raw_df, columns = cat_cols)

In [9]:
# check new dummy-columns
df.columns

Index(['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'income', 'workclass_?', 'workclass_Federal-gov',
       'workclass_Local-gov',
       ...
       'native-country_Portugal', 'native-country_Puerto-Rico',
       'native-country_Scotland', 'native-country_South',
       'native-country_Taiwan', 'native-country_Thailand',
       'native-country_Trinadad&Tobago', 'native-country_United-States',
       'native-country_Vietnam', 'native-country_Yugoslavia'],
      dtype='object', length=109)

In [10]:
# check unique values in target-column
df['income'].unique()

array(['<=50K', '>50K'], dtype=object)

In [11]:
# transform target to binary column
df['income'] = df['income'].apply(lambda x: 0 if x == '<=50K' else 1)

In [12]:
# split dataframe to dependent and independent variables
X, Y = df.drop(['income'], axis=1), df['income']
# split to train and test frames
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = RANDOM_STATE)
# check their shapes
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(34189, 108) (14653, 108) (34189,) (14653,)


In [13]:
# train and apply standartscaler to train and test frames
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
def get_auc(X_train, y_train, X_test, y_test, model, grid_params, n_folds = N_FOLDS):
    ''' Function to train input model with grid_params '''
    # define grid with cross-validation
    gridsearch = GridSearchCV(model, grid_params, scoring='neg_log_loss', cv=n_folds)
    # fit grid
    gridsearch.fit(X_train, y_train)
    # get best model from grid
    best_model = gridsearch.best_estimator_
    # get prediction of best model
    y_pred = best_model.predict_proba(X_test)[:, 1]
    # get roc_curve of prediction and y_test
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    # get auc of prediction
    l_auc = auc(fpr, tpr)
    print(best_model,'\nAUC=',l_auc)
    print('*'*50)
    return l_auc

In [15]:
test_auc = get_auc(X_train, y_train, X_test, y_test, LogisticRegression(n_jobs=-1,random_state=RANDOM_STATE), {'penalty': ['l1','l2'], 'C': [0.01, 0.05]})

LogisticRegression(C=0.05, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l1',
                   random_state=777, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False) 
AUC= 0.9067854075920921
**************************************************


In [16]:
models = [(LogisticRegression(n_jobs=-1,random_state=RANDOM_STATE), {'penalty': ['l1','l2'], 'C': [0.01, 0.05]}),
          (DecisionTreeClassifier(random_state=RANDOM_STATE), {'max_depth': [5,7,10,12,15]}),
          (RandomForestClassifier(n_jobs=-1,random_state=RANDOM_STATE), {'max_depth': [5,7,10,12,15], 'n_estimators': [10,20,50,100], 'max_features': [0.2,0.5,0.7,0.8]})
         ]

In [17]:
models_aucs = {}
for mdl, params in models:
    %time models_aucs[mdl] = get_auc(X_train, y_train, X_test, y_test, mdl, params)

LogisticRegression(C=0.05, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l1',
                   random_state=777, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False) 
AUC= 0.9067854075920921
**************************************************
CPU times: user 29.4 s, sys: 8.32 s, total: 37.8 s
Wall time: 22.5 s
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=777, splitter='best') 
AUC= 0.8852109455711977
**************************************************
CPU times: user 6.59 s, sys: 356 ms, total: 6.95 s
Wall

### Final AUC of best models

In [18]:
models_aucs

{LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=-1, penalty='l2',
                    random_state=777, solver='warn', tol=0.0001, verbose=0,
                    warm_start=False): 0.9067854075920921,
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                        max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=777, splitter='best'): 0.8852109455711977,
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=None, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurit