# Домашнее задание к занятию "Алгоритмы классификации: линейные методы, логистическая регрессия и SVM"

#### Владимир Никифоров

Имеются данные adult.csv (см. в материалах к занятию)

Целевой переменной является уровень дохода income (крайний правый столбец).

Описание признаков можно найти по ссылке http://www.cs.toronto.edu/~delve/data/adult/adultDetail.html

Вам необходимо построить модель логистической регрессии, которая предсказывает уровень дохода человека. При возможности попробуйте улучшить точность предсказаний (метод score) с помощью перебора признаков.

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

In [2]:
# turn off all warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
# set constants
RANDOM_STATE = 777
N_FOLDS = 5

In [4]:
# load csv
raw_df = pd.read_csv('../data/adult.csv')

In [5]:
# view file
raw_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [6]:
# view statistic
raw_df.describe()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [7]:
# split to numeric and categorical columns
num_cols = ['age','fnlwgt','educational-num','capital-gain','capital-loss','hours-per-week']
cat_cols = raw_df.drop(num_cols+['income'], axis=1).columns
print(cat_cols)

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'gender', 'native-country'],
      dtype='object')


In [8]:
# create dummies (one-hot-encoding) for categorical features
df = pd.get_dummies(raw_df, columns = cat_cols)

In [9]:
# check new dummy-columns
df.columns

Index(['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'income', 'workclass_?', 'workclass_Federal-gov',
       'workclass_Local-gov',
       ...
       'native-country_Portugal', 'native-country_Puerto-Rico',
       'native-country_Scotland', 'native-country_South',
       'native-country_Taiwan', 'native-country_Thailand',
       'native-country_Trinadad&Tobago', 'native-country_United-States',
       'native-country_Vietnam', 'native-country_Yugoslavia'],
      dtype='object', length=109)

In [10]:
# check unique values in target-column
df['income'].unique()

array(['<=50K', '>50K'], dtype=object)

In [11]:
# transform target to binary column
df['income'] = df['income'].apply(lambda x: 0 if x == '<=50K' else 1)

In [12]:
# split dataframe to dependent and independent variables
X, Y = df.drop(['income'], axis=1), df['income']
# split to train and test frames
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = RANDOM_STATE)
# check their shapes
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(34189, 108) (14653, 108) (34189,) (14653,)


In [13]:
# train and apply standartscaler to train and test frames
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
gridsearch = GridSearchCV(LogisticRegression(n_jobs=-1,random_state=RANDOM_STATE), {'penalty': ['l1','l2'], 'C': [0.01, 0.05]}, scoring='neg_log_loss', cv=N_FOLDS)
gridsearch.fit(X_train, y_train)
best_model = gridsearch.best_estimator_
y_pred = best_model.predict(X_test)

In [15]:
print(f"LogisticRegression score={best_model.score(X_test, y_test)}")

LogisticRegression score=0.8519074592233672


In [16]:
print (classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.93      0.91     11152
           1       0.73      0.60      0.66      3501

    accuracy                           0.85     14653
   macro avg       0.81      0.77      0.78     14653
weighted avg       0.85      0.85      0.85     14653

