# Introduction: Logistic Regression

The purpose of this notebook is to gain an understanding of the simple method of logistic regression. Logistic regression is generally considered the simplest method of classification, but can still be a powerful and explainable algorithm.

In [1]:
# Standard Data Science Helpers
import numpy as np
import pandas as pd
import scipy

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

import cufflinks as cf
cf.set_config_file(world_readable=True, theme="pearl")
cf.go_offline(connected=True)

# Extra options
pd.options.display.max_rows = 10
pd.options.display.max_columns = 25
# Show all code cells outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [None]:
data = pd.read_csv('data/adult.csv')


In [46]:
data = pd.read_csv('data/adult_income.csv')


def process_data(data):
    """
    Process the adult income dataset
    """
    data = data.copy()
    # Replace missing values
    data = data.replace({' ?': np.nan})
    
    # Code gender
    data['female'] = data['sex'].replace({' Male': 0, ' Female': 1})
    # Code target
    data['target'] = data['target'].replace({' >50K': 1, ' <=50K': 0})
    # Create single column for capital wealth
    data['capital'] = data['capital_gain'] - data['capital_loss']
    to_drop = ['country', 'education', 'sex', 
           'capital_gain', 'capital_loss', 
           'working_class',
          'race', 'occupation']
    # Remove excess columns
    data = data.drop(columns=to_drop)
    data = pd.get_dummies(data)
    return data

data = process_data(data)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 20 columns):
age                                      32561 non-null int64
weighting                                32561 non-null int64
education_num                            32561 non-null int64
hours_per_week_work                      32561 non-null int64
target                                   32561 non-null int64
female                                   32561 non-null int64
capital                                  32561 non-null int64
marital_status_ Divorced                 32561 non-null uint8
marital_status_ Married-AF-spouse        32561 non-null uint8
marital_status_ Married-civ-spouse       32561 non-null uint8
marital_status_ Married-spouse-absent    32561 non-null uint8
marital_status_ Never-married            32561 non-null uint8
marital_status_ Separated                32561 non-null uint8
marital_status_ Widowed                  32561 non-null uint8
relationship_ Husband  

In [47]:
corrs = data.corr()
corrs['target']

age                                      0.234037
weighting                               -0.009463
education_num                            0.335154
hours_per_week_work                      0.229689
target                                   1.000000
female                                  -0.215980
capital                                  0.214428
marital_status_ Divorced                -0.126995
marital_status_ Married-AF-spouse        0.012061
marital_status_ Married-civ-spouse       0.444696
marital_status_ Married-spouse-absent   -0.042532
marital_status_ Never-married           -0.318440
marital_status_ Separated               -0.074386
marital_status_ Widowed                 -0.064381
relationship_ Husband                    0.401035
relationship_ Not-in-family             -0.188497
relationship_ Other-relative            -0.083716
relationship_ Own-child                 -0.228532
relationship_ Unmarried                 -0.142857
relationship_ Wife                       0.123264


In [48]:
import plotly.figure_factory as ff
from plotly.offline import iplot

# Correlation Heatmap
iplot(ff.create_annotated_heatmap(corrs.round(3).values, x=list(corrs.columns), 
                                  y=list(corrs.index), annotation_text=corrs.round(3).values))


In [58]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

# Features and target
X = data.copy()
y = X.pop('target')

# Split into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Create the model
model = LogisticRegressionCV(Cs=10, cv = 3, scoring='roc_auc', n_jobs=-1, verbose=1, random_state=42)
model.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.4s finished


LogisticRegressionCV(Cs=10, class_weight=None, cv=3, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=-1, penalty='l2', random_state=42,
           refit=True, scoring='roc_auc', solver='lbfgs', tol=0.0001,
           verbose=1)

## Metrics

In [59]:
from sklearn.metrics import f1_score, roc_auc_score

def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    probabilities = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, probabilities)
    f1_value = f1_score(y_test, predictions)
    accuracy = np.mean(predictions == y_test)
    base_accuracy = np.mean(y_test == 0)
    print(f'ROC AUC: {roc_auc:.4f}')
    print(f'F1 Score: {f1_value:.4f}')
    print(f'Accuracy: {100 * accuracy:.2f}%')
    print(f'Baseline Accuracy: {100 * base_accuracy:.2f}%')
    
evaluate(model, X_test, y_test)

ROC AUC: 0.5500
F1 Score: 0.3103
Accuracy: 79.59%
Baseline Accuracy: 75.97%


# Model Outputs

In [60]:
probability = model.predict_proba(X_test)[:, 1]
log_odds = model.decision_function(X_test)
classes = model.predict(X_test)

In [63]:
yhat = pd.DataFrame(dict(probability=probability, log_odds=log_odds, classes=classes))
yhat.describe()

Unnamed: 0,probability,log_odds,classes
count,13025.0,13025.0,13025.0
mean,0.278616,-0.891544,0.055585
std,0.148823,2.021135,0.229128
min,0.002541,-5.972546,0.0
25%,0.201376,-1.377718,0.0
50%,0.252283,-1.086475,0.0
75%,0.317065,-0.767294,0.0
max,1.0,24.621157,1.0


In [64]:
log_odds = yhat['log_odds']

odds_ratio = np.exp(log_odds)

probability = odds_ratio / (1 + odds_ratio)

np.allclose(probability, yhat['probability'].values)

True

In [65]:
log_odds = yhat['log_odds']

probability = 1 / (1 + np.exp(-log_odds))

np.allclose(probability, yhat['probability'].values)

True

In [66]:
yhat['odds_ratio'] = odds_ratio
yhat['classes'] = yhat['classes'].map({0: '< 50K', 1: '>= 50K'})

In [68]:
yhat.iplot(x='log_odds', y='probability', 
           yTitle='Probability', xrange=(-5, 5),
           title='Probability vs Log Odds', categories='classes')

In [71]:
yhat.iplot(x='log_odds', y='odds_ratio', 
           yTitle='odds_ratio', xrange=(-5, 5),
           title='Probability and Odds= Ratio vs Log Odds', categories='classes')

In [198]:
from sklearn.metrics import roc_auc_score, f1_score

roc_auc_score(y_test, yhat['probability'])
f1_score(y_test, yhat['probability'] > 0.5)

0.5131312206142761

0.28319327731092436

In [141]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train, y_train)
p = model.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, p)
f1_score(y_test, p > 0.5)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

0.8319439915212783

0.5811669555170422