In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from IPython.display import display

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_auc_score
import pickle

In [2]:
# Header names supplied to read_csv, since both files are read without a header row.
columns = (
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
)

df_full_train = pd.read_csv("adult.data", names=columns)
# The first line of adult.test is skipped rather than parsed as a record.
df_test = pd.read_csv("adult.test", names=columns, skiprows=1)

# Normalise the column labels of both frames to lower case with underscores.
for frame in (df_full_train, df_test):
    frame.columns = frame.columns.str.lower().str.replace('-', '_')

# Object-dtype (string) columns, detected on the training frame after renaming.
categorical_columns = [
    name for name, dtype in df_full_train.dtypes.items() if dtype == 'object'
]

# Lower-case, trim and de-hyphenate every string value in both frames.
for frame in (df_full_train, df_test):
    for name in categorical_columns:
        frame[name] = frame[name].str.lower().str.strip().str.replace('-', '_')

# '?' is treated as a missing value; rows containing it are dropped from the
# training frame only (df_test is left unfiltered).
df_full_train = df_full_train.replace('?', np.nan).dropna()

# Binary target column. The test labels are compared against '>50k.'
# (with a trailing period), the training labels against '>50k'.
df_full_train['income>50k'] = df_full_train['income'].eq('>50k').astype(int)
df_test['income>50k'] = df_test['income'].eq('>50k.').astype(int)

In [3]:
# String-valued feature columns fed to the vectorizer.
categorical = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
]

# Numeric feature columns fed to the vectorizer.
numerical = [
    'age',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
]

In [4]:
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic regression on the training frame.

    Parameters
    ----------
    df_train : DataFrame containing the `categorical` and `numerical` columns.
    y_train : target values aligned with the rows of `df_train`.
    C : value forwarded to `LogisticRegression(C=...)`.

    Returns
    -------
    (dv, model) : the fitted DictVectorizer and the fitted LogisticRegression.
    """
    feature_columns = categorical + numerical
    records = df_train[feature_columns].to_dict(orient='records')

    # Dense output (sparse=False) so the feature matrix is a plain array.
    dv = DictVectorizer(sparse=False)
    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(dv.fit_transform(records), y_train)

    return dv, model

In [5]:
def predict(df, dv, model):
    """Score the rows of `df` with an already fitted vectorizer and model.

    Parameters
    ----------
    df : DataFrame containing the `categorical` and `numerical` columns.
    dv : fitted vectorizer exposing `transform`.
    model : fitted classifier exposing `predict_proba`.

    Returns
    -------
    One value per row: column index 1 of `model.predict_proba`.
    """
    records = df[categorical + numerical].to_dict(orient='records')
    features = dv.transform(records)
    return model.predict_proba(features)[:, 1]

In [6]:
# Value passed as `C` to train(), which forwards it to LogisticRegression.
C = 1.0
# Number of folds handed to KFold for cross-validation.
n_splits = 5

In [7]:
# K-fold cross-validation over the training frame: fit on each training
# split, score the held-out split with ROC AUC, and report mean and spread.
Kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

scores = []

for train_idx, val_idx in Kfold.split(df_full_train):
    # split() yields positional indices, hence iloc.
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]

    y_train = df_train['income>50k'].values
    y_val = df_val['income>50k'].values

    dv, model = train(df_train, y_train, C=C)
    y_pred = predict(df_val, dv, model)

    # Stored as fold_auc (not `auc`) so the `auc` function imported from
    # sklearn.metrics is not rebound to a float.
    fold_auc = roc_auc_score(y_val, y_pred)
    scores.append(fold_auc)

print('C=%s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))

C=1.0 0.903 +- 0.001


In [8]:
# Per-fold validation ROC AUC values collected by the cross-validation loop.
scores

[0.9051609305404646,
 0.9033004256530406,
 0.9009145170552344,
 0.9022591439696912,
 0.9040335346808751]

In [9]:
# Retrain on the whole training frame. Uses the configured C (rather than a
# literal 1.0) so the fitted model always matches the C embedded in the
# saved file name.
dv, model = train(df_full_train, df_full_train['income>50k'].values, C=C)
y_pred = predict(df_test, dv, model)

# ROC AUC on the held-out test frame. Stored as test_auc (not `auc`) so the
# `auc` function imported from sklearn.metrics is not rebound to a float.
y_test = df_test['income>50k'].values
test_auc = roc_auc_score(y_test, y_pred)
test_auc

0.901816424084386

### Save the model

In [10]:
# File name for the pickled (dv, model) pair; it embeds the current value of C.
output_file = f'model_C={C}.bin'
output_file

'model_C=1.0.bin'

In [11]:
# Serialise the vectorizer and model together so they are always loaded as a pair.
artifacts = (dv, model)
with open(output_file, 'wb') as model_file:
    pickle.dump(artifacts, model_file)

### Load the model

In [12]:
# Hard-coded file name; it matches output_file only while C == 1.0.
input_file = 'model_C=1.0.bin'

In [13]:
# NOTE: pickle.load can execute arbitrary code; only load files you created or trust.
with open(input_file, 'rb') as model_file:
    dv, model = pickle.load(model_file)

In [14]:
# Display the unpickled vectorizer and model to confirm they loaded.
dv, model

(DictVectorizer(sparse=False), LogisticRegression(max_iter=1000))

In [15]:
# Preview the first ten test rows as feature dicts, the shape dv.transform expects.
df_test[categorical + numerical].head(10).to_dict(orient='records')

[{'workclass': 'private',
  'education': '11th',
  'marital_status': 'never_married',
  'occupation': 'machine_op_inspct',
  'relationship': 'own_child',
  'race': 'black',
  'sex': 'male',
  'native_country': 'united_states',
  'age': 25,
  'capital_gain': 0,
  'capital_loss': 0,
  'hours_per_week': 40},
 {'workclass': 'private',
  'education': 'hs_grad',
  'marital_status': 'married_civ_spouse',
  'occupation': 'farming_fishing',
  'relationship': 'husband',
  'race': 'white',
  'sex': 'male',
  'native_country': 'united_states',
  'age': 38,
  'capital_gain': 0,
  'capital_loss': 0,
  'hours_per_week': 50},
 {'workclass': 'local_gov',
  'education': 'assoc_acdm',
  'marital_status': 'married_civ_spouse',
  'occupation': 'protective_serv',
  'relationship': 'husband',
  'race': 'white',
  'sex': 'male',
  'native_country': 'united_states',
  'age': 28,
  'capital_gain': 0,
  'capital_loss': 0,
  'hours_per_week': 40},
 {'workclass': 'private',
  'education': 'some_college',
  'marita

In [16]:
# A single hand-written record with the same keys as the feature dicts above.
person = {
    'workclass': 'private',
    'education': '11th',
    'marital_status': 'never_married',
    'occupation': 'machine_op_inspct',
    'relationship': 'own_child',
    'race': 'black',
    'sex': 'male',
    'native_country': 'united_states',
    'age': 25,
    'capital_gain': 0,
    'capital_loss': 0,
    'hours_per_week': 40,
}

In [17]:
# Encode the single record with the loaded vectorizer; transform() takes a list of dicts.
X = dv.transform([person])

In [18]:
# Row 0 is the single record; column 1 is the probability for the class listed second in model.classes_.
model.predict_proba(X)[0, 1]

0.0026219965599794184