# Linear Models for Classification

# Exercise
Load and preprocess the adult data as before.
Include dummy encoding and scaling.
Learn a logistic regression model and visualize the coefficients.
Then grid-search the regularization parameter C.
Compare the L1 penalty to the L2 penalty. How are the coefficients different?
Which are the most important features?

In [6]:
import pandas as pd
# Read the adult dataset from disk, using the first CSV column as the row index.
adults = pd.read_csv(filepath_or_buffer="data/adult.csv", index_col=0)

In [None]:
# %load solutions/adult_classification.py

# Exercise - solution

In [45]:
# imports
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
# Preview the first five rows of the raw data.
adults.head(n=5)

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [7]:
# Separate the target column from the predictors:
# `data_features` keeps every column except 'income', `income` holds the labels.
data_features = adults.drop(columns='income')
income = adults['income']

In [8]:
# Dummy-encode the feature frame: each categorical column is expanded into
# one indicator column per category value.
data_dummies = pd.get_dummies(data=data_features)

# Preview the first five encoded rows.
data_dummies.head(n=5)

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,39,13,2174,0,40,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,13,0,0,13,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,9,0,0,40,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,53,7,0,0,40,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,28,13,0,0,40,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Feature matrix (X) and target labels (y).
# Both names are plain aliases of the existing objects, not copies.
X, y = data_dummies, income

In [13]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [14]:
# Standardize the features. The scaler is fit on the training split only and
# then applied to both splits, so the test rows do not influence the scaling
# parameters.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [31]:
# 5-fold cross-validation of a logistic regression with default parameters,
# run once on the scaled training split and once on the scaled test split.
# NOTE(review): the second call refits the model on folds of the *test* split,
# so `test_scores` is a CV estimate computed inside the test data, not the
# held-out score of a model trained on the training data.
# NOTE(review): the scaler was fit on all of X_train before cross-validating,
# so each validation fold contributed to the scaling statistics; wrapping the
# scaler and the model in a Pipeline would avoid that.
train_scores = cross_val_score(
    estimator=LogisticRegression(solver='lbfgs'),
    X=X_train_scaled,
    y=y_train,
    cv=5,
)
test_scores = cross_val_score(
    estimator=LogisticRegression(solver='lbfgs'),
    X=X_test_scaled,
    y=y_test,
    cv=5,
)

# Report the mean score across the five folds for each split.
print('Train cross validation score with default parameters: {}'.format(train_scores.mean()))
print('Test cross validation score with default parameters: {}'.format(test_scores.mean()))

Train cross validation score with default parameters: 0.8493332525483253
Test cross validation score with default parameters: 0.8508555293025617




In [32]:
# Grid search over the regularization parameter C:
# seven log-spaced candidates from 10**-3 to 10**3, each scored with 5-fold CV.
param_grid = {'C': np.logspace(start=-3, stop=3, num=7)}
grid = GridSearchCV(
    estimator=LogisticRegression(solver='lbfgs'),
    param_grid=param_grid,
    cv=5,
)

In [34]:
# Run the grid search on the *unscaled* training features, then show the
# selected C and its mean cross-validated score (one value per line).
grid.fit(X=X_train, y=y_train)
print(grid.best_params_, grid.best_score_, sep='\n')



{'C': 0.01}
0.8345033345033345




In [35]:
# Re-run the same grid search on the *scaled* training features. This refits
# `grid` in place, replacing the results of any earlier fit. Then show the
# selected C and its mean cross-validated score (one value per line).
grid.fit(X=X_train_scaled, y=y_train)
print(grid.best_params_, grid.best_score_, sep='\n')

{'C': 100.0}
0.8495524745524745


In [46]:
# Accuracy of the grid search's best model on the held-out, scaled test split.
grid.score(X=X_test_scaled, y=y_test)

0.8533114955471389

In [49]:
# Predicted income labels for the scaled test split, using the grid search's
# best model.
predictions = grid.predict(X=X_test_scaled)

In [51]:
# Evaluation on the test split.
# Confusion matrix: rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=predictions))
# Per-class precision, recall, F1 and support.
print(classification_report(y_true=y_test, y_pred=predictions))

[[6914  522]
 [ 911 1422]]
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.91      7436
        >50K       0.73      0.61      0.66      2333

    accuracy                           0.85      9769
   macro avg       0.81      0.77      0.79      9769
weighted avg       0.85      0.85      0.85      9769

