In [33]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

from numpy.random import randn
from numpy.matlib import repmat

from scipy.stats import norm
from scipy.optimize import fmin
from scipy.special import erf

from patsy import dmatrices

from sklearn import metrics
from sklearn.linear_model import LogisticRegressionCV, LinearRegression, LassoCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MultiLabelBinarizer
from sklearn.feature_selection import chi2
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
a = pd.DataFrame([[np.nan, 2, 3], [4, np.nan, 6]])

a[0] = a[0].fillna(0)

In [3]:
def array_vector(col):
    return np.array(str(col))

arrayerize = np.vectorize(array_vector)

def one_hot_encode(df, column, labels_column=None, whitelist=[]):
    # This is gross but since strings are iterable, we have to wrap them in a list
    # in order for the binarizer to parse the labels as strings and not chars
    labels = arrayerize(pd.DataFrame(df[column]))
    terms = arrayerize(pd.DataFrame(list(set(df[column]))))

    mlb = MultiLabelBinarizer()
    mlb.fit(terms)
    mlb.transform(labels)
    columns = [ f'{column}-{classname}' for classname in  mlb.classes_]

    encoded = pd.DataFrame(mlb.transform(labels), columns=columns, index=df[column].index)

    df.drop(column, axis=1, inplace=True)
    return df.join(encoded)

In [4]:
firms = pd.read_csv('../match-data/match-v1/firms.csv')
jobs = pd.read_csv('../match-data/match-v1/job-openings.csv')
jobs_parent = pd.read_csv('../match-data/match-v1/job-openings-parent.csv')
job_seekers = pd.read_csv('../match-data/match-v1/job-seekers.csv')
matches = pd.read_csv('../match-data/match-v1/matches.csv')
matches_parent = pd.read_csv('../match-data/match-v1/matches-parent.csv')


matches_merged = pd.merge(matches, matches_parent, on='number')
jobs_merged = pd.merge(jobs, jobs_parent, on='number')

In [5]:
categorical_columns = ["gender", "highest_edu_level", "nationality", "gendermix_not_allowed", "benefit1", "benefit2", "city", "english_proficiency", "impairments", "major", "opposite_gender_coworkers", "opposite_gender_manager", "first_job_field_preference", "second_job_field_preference", "bus_covered", "childcare_subsidy_offered", "dorm_covered", "driving_ability_required", "education_required", "english_proficiency_required", "female_requied", "free_meals_at_wok", "health_insurance_offered", "hearing_disability_accepted", "housing_subsidy_offered", "it_proficiency_required", "job_category", "job_description", "job_production", "jordanian_experience_required", "literacy_required", "male_required", "meal_subsidy_offered", "night_shifts_required", "noncognitive_skill_preference1", "noncognitive_skill_preference2", "numeracy_requied", "physical_disability_accepted", "physical_work_abilities_required", "problem_solving_required", "school_subsidy_offered", "specialization_required", "speech_disability_accepted", "syrian_considered", "transport_subsidy_offered", "visual_disability_accepted", "work_permit_offered",]
all_columns = ["age", "gender", "highest_edu_level", "will_work_night_shift", "nationality", "gendermix_not_allowed", "will_work_qiz", "arab_coworkers", "benefit1", "benefit2", "city", "daily_hours_willing_to_work", "days_willing_train_unpaid", "distance_willing_to_travel", "english_proficiency", "experience_clerical_work", "experience_factory", "experience_management_work", "experience_manual_labor", "experience_professional_work", "follow_up_agreement", "has_job", "impairments", "major", "nonarab_coworkers", "opposite_gender_coworkers", "opposite_gender_manager", "weekly_days_willing_to_work", "will_live_in_dorm", "will_train_unpaid", "years_education", "years_exp", "first_job_field_preference", "rwage1", "second_job_field_preference", "hh_income", "hired_yes_no", "quit", "fired", "interest_applying", "num_children", "personal_income", "bus_covered", "childcare_subsidy_offered", "dorm_covered", "driving_ability_required", "education_required", "english_proficiency_required", "female_requied", "free_meals_at_wok", "health_insurance_offered", "hearing_disability_accepted", "housing_subsidy_offered", "it_proficiency_required", "job_category", "job_description", "job_production", "jordanian_experience_required", "literacy_required", "male_required", "meal_subsidy_offered", "night_shifts_required", "noncognitive_skill_preference1", "noncognitive_skill_preference2", "num_vacancies", "numeracy_requied", "physical_disability_accepted", "physical_work_abilities_required", "problem_solving_required", "school_subsidy_offered", "specialization_required", "speech_disability_accepted", "syrian_considered", "transport_subsidy_offered", "visual_disability_accepted", "wage_offered", "work_permit_offered", "years_experience_required",]
scalar_columns = ["age", "daily_hours_willing_to_work", "days_willing_train_unpaid", "distance_willing_to_travel", "years_education", "years_exp", "rwage1", "hh_income", "num_children", "personal_income", "num_vacancies", "wage_offered"]

In [6]:
job_seekers['parent_case_id'] = job_seekers['caseid']

In [7]:
merged = pd.merge(job_seekers, matches_merged, on='parent_case_id')
merged = pd.merge(merged, jobs_merged, on='job_id')

merged.to_csv('../match-data/match-v1/merged.csv')

In [8]:
formatted = pd.DataFrame()
for col in all_columns:
    formatted[col] = merged[col]

dvs = ['hired_yes_no', 'quit', 'fired']

for col in all_columns:
    if col not in scalar_columns and col not in dvs:
        formatted = one_hot_encode(formatted, col)

for col in formatted.columns:
    # TODO: Change this
    formatted[col] = formatted[col].replace(['---'], 0)
    
formatted.to_csv('../match-data/match-v1/formatted.csv')

In [9]:
formatted['hired_yes_no'] = formatted['hired_yes_no'].fillna(0)
formatted['hired_yes_no'] = formatted['hired_yes_no'].replace(['---'], 0)

formatted['quit'] = formatted['quit'].fillna(0)
formatted['quit'] = formatted['quit'].replace(['---'], 0)
formatted['quit'] = formatted['quit'].replace(['no'], 0)
formatted['quit'] = formatted['quit'].replace(['yes'], 1)

formatted['fired'] = formatted['fired'].fillna(0)
formatted['fired'] = formatted['fired'].replace(['---'], 0)
formatted['fired'] = formatted['fired'].replace(['no'], 0)
formatted['fired'] = formatted['fired'].replace(['yes'], 1)

formatted['hired_yes_no'] = formatted['hired_yes_no'].astype(bool)
formatted['quit'] = formatted['quit'].astype(bool)
formatted['fired'] = formatted['fired'].astype(bool)

y = pd.DataFrame()
y['retained'] = formatted['hired_yes_no'] & ~(formatted['quit'] | formatted['fired'])
#y['hired'] = formatted['hired_yes_no']

In [10]:
formatted = formatted.drop(dvs, axis=1)

In [11]:
to_drop = []
for col in formatted.columns:
    if col in scalar_columns:
        formatted[col] = formatted[col].astype(float)
        mean = formatted[col].mean()
        formatted[col] = formatted[col].replace(['---'], mean)
        formatted[col] = formatted[col].fillna(mean)
    else:
        formatted[col] = formatted[col].fillna(0)
        formatted[col] = formatted[col].astype(int)
        formatted[col] = formatted[col].replace(['---'], 0)
    
    if col.endswith('---'):
        to_drop.append(col)

formatted = formatted.drop(columns=to_drop)

In [12]:
y=y.astype('int')
y.to_csv('../match-data/match-v1/y.csv')

In [13]:
formatted.to_csv('../match-data/match-v1/X.csv')
X = formatted

In [14]:
print(X.shape, y.shape)

(112, 245) (112, 1)


## Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier
scores = []
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    regr = RandomForestClassifier(max_depth=10, random_state=0, n_estimators=100)
    regr.fit(X_train, y_train)
    s = regr.score(X_test, y_test)
    scores.append(s)
    
sum(scores) / len(scores)

0.7654054054054051

# GBRT

In [36]:
scores = []
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    model = GradientBoostingClassifier()
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))

sum(scores) / len(scores)

0.7218918918918917

## LASSO CV

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
reg = LassoCV(cv=3, normalize=True).fit(X_train, y_train)
reg.score(X_test, y_test)

0.18148973075401953

In [17]:
importances = {}
for i, importance in enumerate(regr.feature_importances_):
    importances[X.columns[i]] = importance
    
importances

c = {}
for k, v in importances.items():
    c[k] = [v]
    

importance_frame = pd.DataFrame.from_dict(c)
importance_frame.to_csv('../match-data/match-v1/random-forest-importance.csv')

# K Best Feature Elimination

In [18]:
from sklearn.feature_selection import SelectKBest

# Create and fit selector
selector = SelectKBest(chi2, k=10)
selector.fit(X, y)
# Get idxs of columns to keep
idxs_selected = selector.get_support(indices=True)
cols = []
for i, c in enumerate(X.columns):
    if i in idxs_selected:
        cols.append(c)

### Accuracy using top 10 k-best features

In [38]:
model = LogisticRegressionCV(max_iter=1000)

scores = []
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    XX_train = pd.DataFrame()
    XX_test = pd.DataFrame()
    for c in cols:
        XX_train[c] = X_train[c]
        XX_test[c] = X_test[c]

    model.fit(XX_train, y_train)
    scores.append(model.score(XX_test, y_test))

sum(scores) / len(scores)

0.7216216216216215

### Export Coefs & p-values to csv

In [20]:
coef_dict = {}
for coef, feat in zip(model.coef_[0], formatted.columns):
    coef_dict[feat] = coef

In [21]:
c = {}
for k, v in coef_dict.items():
    c[k] = [v]
    

coef_frame = pd.DataFrame.from_dict(c)
sorted_frame = coef_frame.columns[coef_frame.ix[coef_frame.last_valid_index()].argsort()]
coef_frame.to_csv('../match-data/match-v1/coefs.csv')

odds_ratios = np.exp(coef_frame)

In [22]:
scores, pvalues = chi2(formatted, y)

In [23]:
p_dict = {}
for pvalue, feat in zip(pvalues, formatted.columns):
    p_dict[feat] = pvalue
    
c = {}
for k, v in p_dict.items():
    c[k] = [v]
    

pval_frame = pd.DataFrame.from_dict(c)
sorted_frame = pval_frame.columns[pval_frame.ix[pval_frame.last_valid_index()].argsort()]
pval_frame.to_csv('../match-data/match-v1/pvalues.csv')

# Recursive Feature Elimination

In [24]:
model = LogisticRegressionCV(max_iter=1000, multi_class='multinomial', solver='lbfgs')
from sklearn.feature_selection import RFE
selector = RFE(model, 10, step=1)
selector.fit(X, y)

RFE(estimator=LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=1000,
           multi_class='multinomial', n_jobs=1, penalty='l2',
           random_state=None, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0),
  n_features_to_select=10, step=1, verbose=0)

In [25]:
len(selector.support_)
features = []
for i, v in enumerate(selector.support_):
    if v:
        features.append(X.columns[i])

features

['nationality-jordanian',
 'benefit2-work_meals',
 'experience_management_work-nan',
 'second_job_field_preference-food',
 'interest_applying-1',
 'health_insurance_offered-no',
 'noncognitive_skill_preference2-agreeableness',
 'physical_work_abilities_required-no',
 'physical_work_abilities_required-yes',
 'work_permit_offered-no']

### Accuracy using top 10 RFE features

In [26]:
model = LogisticRegressionCV(max_iter=1000, multi_class='multinomial', solver='lbfgs')
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
XX = pd.DataFrame()
for c in features:
    XX[c] = X[c]

model.fit(XX, y)
model.score(XX, y)

0.6428571428571429

In [27]:
display(coef_frame)
display(pval_frame)
display(odds_ratios)

Unnamed: 0,age,daily_hours_willing_to_work,days_willing_train_unpaid,distance_willing_to_travel,years_education,years_exp,rwage1,hh_income,num_children,personal_income
0,-0.022099,0.002932,0.004344,0.04088,0.035615,1.982494,0.36799,0.304175,3.373916,0.378986


Unnamed: 0,age,daily_hours_willing_to_work,days_willing_train_unpaid,distance_willing_to_travel,years_education,years_exp,rwage1,hh_income,num_children,personal_income,...,syrian_considered-yes,transport_subsidy_offered-no,transport_subsidy_offered-yes,visual_disability_accepted-no,work_permit_offered-no,work_permit_offered-yes,years_experience_required-0.0,years_experience_required-1.0,years_experience_required-2.0,years_experience_required-nan
0,0.00011,0.194796,4.206728e-11,0.060689,0.464084,0.001722,0.812247,0.0,0.057247,3.7934420000000005e-222,...,0.95998,0.770509,0.22016,0.01539,0.606397,0.599926,0.757844,0.563093,0.395136,0.395136


Unnamed: 0,age,daily_hours_willing_to_work,days_willing_train_unpaid,distance_willing_to_travel,years_education,years_exp,rwage1,hh_income,num_children,personal_income
0,0.978143,1.002937,1.004353,1.041727,1.036257,7.260829,1.444827,1.355506,29.192623,1.460802


In [28]:
combined = coef_frame.append(pval_frame).append(np.exp(odds_ratios))
combined.to_csv('../match-data/match-v1/combined.csv')