# Logisitc Regression Model using SK-Learn

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

from numpy.random import randn
from numpy.matlib import repmat

from scipy.stats import norm
from scipy.optimize import fmin
from scipy.special import erf

from patsy import dmatrices

from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MultiLabelBinarizer

In [2]:
# Utility for displaying our DataFrames
from IPython.display import display_html
def display_side_by_side(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
    display_html(html_str.replace('table','table style="display:inline"'),raw=True)

### First let's import the CV exercise to train on...

In [3]:
trainX = pd.read_csv('../match-data/trainX.csv')
trainy = pd.read_csv('../match-data/trainy.csv')

clean = pd.read_csv('../match-data/clean_all.csv')

display_side_by_side(trainy, trainX)

Unnamed: 0,Al - Rawi Company for the manufacture of wires and cables Jordan
0,0
1,0
2,1
3,1
4,1
5,1
6,0
7,0
8,1
9,0

Unnamed: 0,nationality,gender,age,education,night_shift
0,syrian,female,48,secondary,0
1,syrian,female,40,diploma,0
2,syrian,female,25,bachelors,0
3,syrian,male,35,secondary,0
4,syrian,male,38,diploma,1
5,syrian,female,21,bachelors,1
6,syrian,female,44,diploma,0
7,syrian,female,53,none,0
8,syrian,female,20,secondary,0
9,syrian,male,44,primary,1


Next up we create some utilities that we will use later to one-hot encode our categorical features such as nationality, gender, etc.

In [4]:
def array_vector(col):
    return np.array(str(col))

arrayerize = np.vectorize(array_vector)

def one_hot_encode(df, column):
    # This is gross but since strings are iterable, we have to wrap them in a list
    # in order for the binarizer to parse the labels as strings and not chars
    labels = arrayerize(pd.DataFrame(df[column]))
    terms = arrayerize(pd.DataFrame(list(set(df[column]))))

    mlb = MultiLabelBinarizer()
    mlb.fit(terms)
    mlb.transform(labels)
    columns = [ f'{column}-{classname}' for classname in  mlb.classes_]

    encoded = pd.DataFrame(mlb.transform(labels), columns=columns, index=df[column].index)

    df.drop(column, axis=1, inplace=True)
    return df.join(encoded)

### Pre Process our training data

In [5]:
trainX = one_hot_encode(trainX, 'education')
trainX = one_hot_encode(trainX, 'gender')
trainX = one_hot_encode(trainX, 'nationality')
trainX['nationality-jordanian'] = 0
trainX['nationality-unknown'] = 0
trainX['education-college'] = 0
trainX['education-other'] = 0

In [6]:
trainX

Unnamed: 0,age,night_shift,education-bachelors,education-diploma,education-masters,education-none,education-primary,education-secondary,gender-female,gender-male,nationality-syrian,nationality-jordanian,nationality-unknown,education-college,education-other
0,48,0,0,0,0,0,0,1,1,0,1,0,0,0,0
1,40,0,0,1,0,0,0,0,1,0,1,0,0,0,0
2,25,0,1,0,0,0,0,0,1,0,1,0,0,0,0
3,35,0,0,0,0,0,0,1,0,1,1,0,0,0,0
4,38,1,0,1,0,0,0,0,0,1,1,0,0,0,0
5,21,1,1,0,0,0,0,0,1,0,1,0,0,0,0
6,44,0,0,1,0,0,0,0,1,0,1,0,0,0,0
7,53,0,0,0,0,1,0,0,1,0,1,0,0,0,0
8,20,0,0,0,0,0,0,1,1,0,1,0,0,0,0
9,44,1,0,0,0,0,1,0,0,1,1,0,0,0,0


In [7]:
X = trainX
y = trainy.T.values[0]

model2 = LogisticRegression(max_iter=1000)
model2.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Pre Process our training data

In [8]:
clean = clean.drop(['Unnamed: 0', 'lat', 'lng'], axis=1)
clean = clean[clean['age'] != 0]
clean.dropna(inplace=True)

testX = clean.copy(deep=True)
testX = one_hot_encode(testX, 'education')
testX = one_hot_encode(testX, 'gender')
testX = one_hot_encode(testX, 'nationality')

In [9]:
testX

Unnamed: 0,age,night-shift,education-bachelors,education-college,education-diploma,education-masters,education-none,education-other,education-primary,education-secondary,gender-female,gender-male,nationality-jordanian,nationality-syrian,nationality-unknown
0,48,0.0,0,0,0,0,0,0,0,1,1,0,0,1,0
2,44,1.0,0,0,0,0,0,0,1,0,0,1,0,1,0
3,28,0.0,0,0,0,0,0,0,0,1,1,0,0,1,0
4,40,0.0,0,0,1,0,0,0,0,0,1,0,0,1,0
6,53,0.0,0,0,0,0,1,0,0,0,1,0,0,1,0
7,20,0.0,0,0,0,0,0,0,0,1,1,0,0,1,0
8,19,0.0,0,0,0,0,0,0,0,1,1,0,0,1,0
9,25,0.0,1,0,0,0,0,0,0,0,1,0,0,1,0
10,45,1.0,0,0,0,0,0,0,0,1,0,1,0,1,0
11,18,0.0,0,0,0,0,0,0,0,1,1,0,0,1,0


### And then lets take our set of real candidates and predict their probabilities

In [10]:

predicted = model2.predict(testX)
probs = model2.predict_proba(testX)

### Comparison of our predictions vs the true generated probabilities

In [11]:
display_side_by_side(pd.DataFrame(probs.T[1], columns=['candidate fit']), 
                     pd.DataFrame(clean))

Unnamed: 0,candidate fit
0,0.297379
1,0.167812
2,0.710096
3,0.295151
4,0.048137
5,0.831755
6,0.843684
7,0.736218
8,0.258193
9,0.854915

Unnamed: 0,nationality,gender,education,age,night-shift
0,syrian,female,secondary,48,0.0
2,syrian,male,primary,44,1.0
3,syrian,female,secondary,28,0.0
4,syrian,female,diploma,40,0.0
6,syrian,female,none,53,0.0
7,syrian,female,secondary,20,0.0
8,syrian,female,secondary,19,0.0
9,syrian,female,bachelors,25,0.0
10,syrian,male,secondary,45,1.0
11,syrian,female,secondary,18,0.0
