# Logisitc Regression Model using SK-Learn

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

from numpy.random import randn
from numpy.matlib import repmat

from scipy.stats import norm
from scipy.optimize import fmin
from scipy.special import erf

from patsy import dmatrices

from sklearn import metrics
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MultiLabelBinarizer

In [2]:
# Utility for displaying our DataFrames
from IPython.display import display_html
def display_side_by_side(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
    display_html(html_str.replace('table','table style="display:inline"'),raw=True)

In [3]:
from datetime import date, datetime

def calculate_age(born):
    today = date.today()
    if born in ['---', '']:
        return ''
    born = datetime.strptime(born, '%Y-%m-%d')
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

np_age = np.vectorize(calculate_age)

In [4]:
def age_concat(col1, col2):
    prohibited = ['', '---', 'nan']
    col1 = 0 if col1 in prohibited else float(col1)
    col2 = 0 if col2 in prohibited else float(col2)
    col1 = 0 if np.isnan(col1) else col1
    col2 = 0 if np.isnan(col2) else col2
    return col1 if col1 != 0 else col2

np_age_concat = np.vectorize(age_concat)

### First let's import the CV exercise to train on...

In [5]:
firms = pd.read_csv('../match-data/match-v1/firms.csv')
jobs = pd.read_csv('../match-data/match-v1/jobs.csv')
job_seekers = pd.read_csv('../match-data/match-v1/job-seekers.csv')

In [6]:
# Clean up nationality column
job_seekers['nationality_calc'] = job_seekers['nationality_calc'].replace(['---'], '')
job_seekers['nationality'] = job_seekers['nationality'].replace(['---'], '')
job_seekers['nationality'] = job_seekers['nationality'].map(str) + job_seekers['nationality_calc']
job_seekers['nationality'] = job_seekers['nationality'].replace([''], 'unknown')

In [7]:
# Clean up age column
job_seekers['age'] = job_seekers['age'].replace(['---'], '')
job_seekers['dob'] = np_age(job_seekers['dob'])
job_seekers['age'] = np_age_concat(job_seekers['age'], job_seekers['dob'])

In [8]:
job_seekers['will_work_qiz'] = job_seekers['will_work_qiz'].replace(['---'], 0)

In [9]:
job = jobs.loc[jobs['caseid'] == '7cbb757a-7b7e-42aa-adbe-513368f32c62']
firm = firms.loc[firms['caseid'] == jobs['indices.firm'].iloc[0]]
cvs = [
    firm['cv1_17'].iloc[0],
    firm['cv2_17'].iloc[0],
    firm['cv3_17'].iloc[0],
    firm['cv4_17'].iloc[0],
    firm['cv5_17'].iloc[0],
    firm['cv6_17'].iloc[0],
    firm['cv7_17'].iloc[0],
    firm['cv8_17'].iloc[0],
    firm['cv9_17'].iloc[0],
    firm['cv10_17'].iloc[0],
    firm['cv11_17'].iloc[0],
    firm['cv12_17'].iloc[0],
    firm['cv13_17'].iloc[0],
    firm['cv14_17'].iloc[0],
    firm['cv15_17'].iloc[0],
    firm['cv16_17'].iloc[0],
    firm['cv17_17'].iloc[0],
    firm['cv18_17'].iloc[0],
    firm['cv19_17'].iloc[0],
    firm['cv20_17'].iloc[0],
    firm['cv21_17'].iloc[0],
    firm['cv22_17'].iloc[0],
    firm['cv23_17'].iloc[0],
    firm['cv24_17'].iloc[0],
    firm['cv25_17'].iloc[0],
    firm['cv26_17'].iloc[0],
    firm['cv27_17'].iloc[0],
    firm['cv28_17'].iloc[0],
    firm['cv29_17'].iloc[0],
    firm['cv30_17'].iloc[0],
]

cvs = [ 0 if x == 'no' else 1 for x in cvs ]

trainX = pd.read_csv('../match-data/trainX.csv')
trainy = pd.DataFrame(cvs)

display_side_by_side(trainy, trainX)

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
5,1
6,0
7,0
8,1
9,1

Unnamed: 0,nationality,gender,age,education,night_shift
0,syrian,female,48,secondary,0
1,syrian,female,40,diploma,0
2,syrian,female,25,bachelors,0
3,syrian,male,35,secondary,0
4,syrian,male,38,diploma,1
5,syrian,female,21,bachelors,1
6,syrian,female,44,diploma,0
7,syrian,female,53,none,0
8,syrian,female,20,secondary,0
9,syrian,male,44,primary,1


Next up we create some utilities that we will use later to one-hot encode our categorical features such as nationality, gender, etc.

In [10]:
def array_vector(col):
    return np.array(str(col))

arrayerize = np.vectorize(array_vector)

def one_hot_encode(df, column, labels_column=None, whitelist=[]):
    # This is gross but since strings are iterable, we have to wrap them in a list
    # in order for the binarizer to parse the labels as strings and not chars
    labels = arrayerize(pd.DataFrame(df[column]))
    terms = arrayerize(pd.DataFrame(list(set(df[column]))))

    mlb = MultiLabelBinarizer()
    mlb.fit(terms)
    mlb.transform(labels)
    columns = [ f'{column}-{classname}' for classname in  mlb.classes_]

    encoded = pd.DataFrame(mlb.transform(labels), columns=columns, index=df[column].index)

    df.drop(column, axis=1, inplace=True)
    return df.join(encoded)

In [11]:
l = arrayerize(list(set(trainX['education'])))

### Pre Process our training data

In [12]:
trainX = one_hot_encode(trainX, 'education')
trainX = one_hot_encode(trainX, 'gender')
trainX = one_hot_encode(trainX, 'nationality')
trainX['nationality-jordanian'] = 0
trainX['education-other'] = 0

In [13]:
trainX

Unnamed: 0,age,night_shift,education-bachelors,education-diploma,education-masters,education-none,education-primary,education-secondary,gender-female,gender-male,nationality-syrian,nationality-jordanian,education-other
0,48,0,0,0,0,0,0,1,1,0,1,0,0
1,40,0,0,1,0,0,0,0,1,0,1,0,0
2,25,0,1,0,0,0,0,0,1,0,1,0,0
3,35,0,0,0,0,0,0,1,0,1,1,0,0
4,38,1,0,1,0,0,0,0,0,1,1,0,0
5,21,1,1,0,0,0,0,0,1,0,1,0,0
6,44,0,0,1,0,0,0,0,1,0,1,0,0
7,53,0,0,0,0,1,0,0,1,0,1,0,0
8,20,0,0,0,0,0,0,1,1,0,1,0,0
9,44,1,0,0,0,0,1,0,0,1,1,0,0


In [14]:
sez = 1 if firm['sez_firm'].iloc[0] == 'yes' else 0
trainX['first_preference'] = job['job_description'].iloc[0]
trainX = one_hot_encode(trainX, 'first_preference')

In [15]:
normal = 0.5
kill = 1.0
weights = {
    'age': normal,
    'night_shift': kill,
    'education-bachelors': normal,
    'education-diploma': normal,
    'education-masters': normal,
    'education-none': normal,
    'education-primary': normal,
    'education-secondary': normal,
    'gender-female': normal,
    'gender-male': normal,
    'nationality-syrian': normal,
    'nationality-jordanian': normal,
    'nationality-unknown': normal,
    'education-college': normal,
    'education-other': normal,
    'will_work_qiz': kill,
    'first_preference-production': normal,
}

In [16]:
X = trainX
y = trainy.T.values[0]

model2 = LogisticRegression(max_iter=1000)
model2.fit(X, y)
model2.coef_[0][1] = 1.0
model2.coef_[0][-1] = 1.0
model2.coef_[0][-2] = 1.0
model2.coef_

model2.fit(X, y)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
prefs =  [ 1 if x == job['job_description'].iloc[0] else 0 for x in job_seekers['first_job_field_preference']]

In [18]:
night_shift = firm['night_shifts_not_required'].iloc[0] == 'no'
night_shift

True

### Pre Process our training data

In [19]:
clean = pd.DataFrame()
clean['education'] = job_seekers['highest_edu_level']
clean['age'] = job_seekers['age']
clean['nationality'] = job_seekers['nationality']
clean['will_work_qiz'] = job_seekers['will_work_qiz']
clean['night_shift'] = job_seekers['will_work_night_shift']
clean['gender'] = job_seekers['gender']
clean[f'first_preference-' + job['job_description'].iloc[0]] = prefs

clean['education']
clean.loc[~clean["education"].isin(l), "education"] = "other"
if firm['sez_firm'].iloc[0] == 'yes':
    clean = clean[ clean['will_work_qiz'] == '1' ]
    
clean.dropna(inplace=True)

indices = list(clean.query("nationality not in ['syrian', 'jordanian']").index)
clean = clean[~clean.index.isin(indices)]

testX = clean.copy(deep=True)
testX = one_hot_encode(testX, 'education')
testX = one_hot_encode(testX, 'gender')
testX = one_hot_encode(testX, 'nationality')

In [20]:
# Get missing columns in the training test
missing_cols = set( trainX.columns ) - set( testX.columns )
# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    testX[c] = 0
    
testX = testX[trainX.columns]

In [21]:
display(testX.columns)
display(trainX.columns)

Index(['age', 'night_shift', 'education-bachelors', 'education-diploma',
       'education-masters', 'education-none', 'education-primary',
       'education-secondary', 'gender-female', 'gender-male',
       'nationality-syrian', 'nationality-jordanian', 'education-other',
       'first_preference-production'],
      dtype='object')

Index(['age', 'night_shift', 'education-bachelors', 'education-diploma',
       'education-masters', 'education-none', 'education-primary',
       'education-secondary', 'gender-female', 'gender-male',
       'nationality-syrian', 'nationality-jordanian', 'education-other',
       'first_preference-production'],
      dtype='object')

### And then lets take our set of real candidates and predict their probabilities

In [22]:
predicted = model2.predict(testX)
probs = model2.predict_proba(testX)

### Comparison of our predictions vs the true generated probabilities

In [23]:
display_side_by_side(pd.DataFrame(probs.T[1], columns=['candidate fit']), 
                     pd.DataFrame(clean))

Unnamed: 0,candidate fit
0,0.789647
1,0.537852
2,0.91628
3,0.882185
4,0.611697
5,0.68758
6,0.538441
7,0.90784
8,0.843192
9,0.777083

Unnamed: 0,education,age,nationality,will_work_qiz,night_shift,gender,first_preference-production
55,secondary,19.0,jordanian,1,0.0,male,0
57,bachelors,50.0,jordanian,1,1.0,male,0
59,secondary,22.0,jordanian,1,1.0,male,1
60,secondary,36.0,syrian,1,0.0,male,1
62,primary,40.0,syrian,1,0.0,male,0
66,diploma,35.0,syrian,1,0.0,male,0
67,primary,50.0,syrian,1,0.0,female,1
68,secondary,30.0,syrian,1,0.0,male,1
69,other,20.0,syrian,1,1.0,male,0
70,bachelors,26.0,jordanian,1,1.0,male,0


In [24]:
clean['candidate_fit'] = pd.DataFrame(probs.T[1], columns=['candidate fit'])
clean.sort_values(by=['candidate_fit'], ascending=False)

Unnamed: 0,education,age,nationality,will_work_qiz,night_shift,gender,first_preference-production,candidate_fit
59,secondary,22.0,jordanian,1,1.0,male,1,0.922040
68,secondary,30.0,syrian,1,0.0,male,1,0.903944
57,bachelors,50.0,jordanian,1,1.0,male,0,0.892799
82,primary,21.0,jordanian,1,1.0,male,1,0.889675
74,secondary,20.0,jordanian,1,0.0,female,1,0.879093
83,secondary,20.0,jordanian,1,0.0,male,0,0.844362
70,bachelors,26.0,jordanian,1,1.0,male,0,0.823686
60,secondary,36.0,syrian,1,0.0,male,1,0.821178
102,secondary,35.0,syrian,1,0.0,male,0,0.799933
98,secondary,36.0,syrian,1,0.0,female,0,0.799933
