In [1]:
# Data exploration and manipulation
import pandas as pd
import numpy as np
import missingno as msno
from scipy import stats

# Data Transformation
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Graphics
from matplotlib import pyplot as plt
import plotly.express as px
import seaborn as sns

# Options
pd.set_option("display.max_columns",200)
%matplotlib inline

# Other librarys
import joblib
import pickle
import sys
from sklearn.pipeline import make_pipeline

# Model performance related libraries
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [2]:
sys.path.insert(1, '../../usr_lib/')
import utilities

In [3]:
data_path = '../../data/modulo2/examen/DataExam1/'

In [4]:
df = pd.read_csv(data_path+'HR_train.csv', sep=',', encoding='utf8')

In [5]:
df.head()

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,area,salary,left
0,10557,0.81,0.75,5,101,5,0,0,sales,medium,0
1,12910,0.79,0.49,3,273,3,0,0,sales,medium,0
2,9657,0.58,0.49,2,107,3,0,0,technical,medium,0
3,2477,0.81,0.83,3,196,2,0,0,technical,low,0
4,2699,0.3,0.86,3,276,5,1,0,accounting,low,0


# Etiquetado de variables

In [6]:
var_c = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours',
        'time_spend_company', 'Work_accident', 'promotion_last_5years']
var_v = ['area', 'salary']

In [7]:
df = utilities.rename_variables(df, var_c, 'c_')
df = utilities.rename_variables(df, var_v, 'v_')
df.rename(columns = {'left':'tgt_left'}, inplace=True)

In [8]:
var = list(df.columns)
var.pop(var.index('tgt_left'))
var_v_n = list(df.filter(like='v_'))
var_c_n = list(df.filter(like='c_'))
tar = 'tgt_left'

In [9]:
from sklearn.model_selection import train_test_split
X = df[var].copy()
y = df[tar].copy()
Xt, Xv, yt, yv = train_test_split(X, y, test_size=0.2, random_state=234)

In [10]:
def normalize_categorical_variable(df, column, threshold = 0.05, label='cat', other_label='others', new_col=True):
    aux = pd.DataFrame(df[column].value_counts(1, dropna=False))
    aux[label] = aux.index
    aux[label] = aux[label].map(lambda x:x if aux.loc[x, column] > threshold else other_label)
    aux_dict = dict(zip(aux.index, aux[label]))
    if new_col:
        df[column + '_norm'] = df[column].map(aux_dict)
    else:
        df[column] = df[column].map(aux_dict)
    return(df, aux_dict)

def woe(df, column, tar, label='_woe'):
    df[column].fillna('Missings', inplace=True)
    aux = df[[tar, column]].pivot_table(index=column, columns=tar, aggfunc='size')
    woe = aux.apply(lambda x:x/sum(x)).apply(lambda x:np.log(x[1]/x[0]), axis=1)
    aux['WoE'] = woe
    aux_dict = dict(zip(aux.index, aux['WoE']))
    df[column + label] = df[column].map(aux_dict)
    return(df, aux_dict)

In [11]:
aux = Xt.copy()
aux['tgt_left'] = yt.copy()
woes = dict()
norms = dict()
for c in list(df.filter(like='v_')):
    aux, norms[c] = normalize_categorical_variable(aux, c)
    aux, woes[c] = woe(aux, c + '_norm', tar)

In [12]:
aux.head()

Unnamed: 0,id,c_satisfaction_level,c_last_evaluation,c_number_project,c_average_montly_hours,c_time_spend_company,c_work_accident,c_promotion_last_5years,v_area,v_salary,tgt_left,v_area_norm,v_area_norm_woe,v_salary_norm,v_salary_norm_woe
5695,10116,0.99,0.55,3,97,6,1,0,technical,medium,0,technical,0.137832,medium,-0.168074
8380,5844,0.49,0.92,4,229,2,0,0,technical,low,0,technical,0.137832,low,0.278603
3480,9418,0.74,0.51,5,198,3,0,0,hr,low,0,others,-0.126034,low,0.278603
10464,4592,0.89,0.9,3,240,3,0,0,sales,high,0,sales,0.032835,high,-1.429043
5134,11647,0.31,0.63,4,104,7,1,0,sales,medium,0,sales,0.032835,medium,-0.168074


In [13]:
def woe_norms(df, disc_cols, woe_cols, disc_dict, woe_dict, disc_label='_norm', woe_label='_woe'):
    for c in disc_cols:
        df[c+disc_label] = df[c].map(lambda x:disc_dict.get(c).get(x, 'Others'))
    for c in woe_cols:
        df[c+woe_label] = df[c].map(lambda x:woe_dict.get(c).get(x, 0))
    return(df.fillna(0))

In [14]:
woe_norms(Xt, var_v_n, var_v_n, norms, woes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,id,c_satisfaction_level,c_last_evaluation,c_number_project,c_average_montly_hours,c_time_spend_company,c_work_accident,c_promotion_last_5years,v_area,v_salary,v_area_norm,v_salary_norm,v_area_woe,v_salary_woe
5695,10116,0.99,0.55,3,97,6,1,0,technical,medium,technical,medium,0.137832,-0.168074
8380,5844,0.49,0.92,4,229,2,0,0,technical,low,technical,low,0.137832,0.278603
3480,9418,0.74,0.51,5,198,3,0,0,hr,low,others,low,0.000000,0.278603
10464,4592,0.89,0.90,3,240,3,0,0,sales,high,sales,high,0.032835,-1.429043
5134,11647,0.31,0.63,4,104,7,1,0,sales,medium,sales,medium,0.032835,-0.168074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5635,254,0.60,0.92,2,258,5,0,0,sales,low,sales,low,0.032835,0.278603
4959,1612,0.43,0.55,4,134,3,0,0,marketing,low,marketing,low,0.015902,0.278603
6841,11513,0.98,0.49,3,199,10,0,0,technical,medium,technical,medium,0.137832,-0.168074
8516,4034,0.26,0.81,5,139,4,0,0,product_mng,medium,product_mng,medium,-0.133397,-0.168074


In [15]:
Xt = woe_norms(Xt, var_v_n, var_v_n, norms, woes)
Xv = woe_norms(Xv, var_v_n, var_v_n, norms, woes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [16]:
predictors = var_c_n + [c for c in Xt.columns if 'woe' in c]
predictors

['c_satisfaction_level',
 'c_last_evaluation',
 'c_number_project',
 'c_average_montly_hours',
 'c_time_spend_company',
 'c_work_accident',
 'c_promotion_last_5years',
 'v_area_woe',
 'v_salary_woe']

In [17]:
Xt

Unnamed: 0,id,c_satisfaction_level,c_last_evaluation,c_number_project,c_average_montly_hours,c_time_spend_company,c_work_accident,c_promotion_last_5years,v_area,v_salary,v_area_norm,v_salary_norm,v_area_woe,v_salary_woe
5695,10116,0.99,0.55,3,97,6,1,0,technical,medium,technical,medium,0.137832,-0.168074
8380,5844,0.49,0.92,4,229,2,0,0,technical,low,technical,low,0.137832,0.278603
3480,9418,0.74,0.51,5,198,3,0,0,hr,low,others,low,0.000000,0.278603
10464,4592,0.89,0.90,3,240,3,0,0,sales,high,sales,high,0.032835,-1.429043
5134,11647,0.31,0.63,4,104,7,1,0,sales,medium,sales,medium,0.032835,-0.168074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5635,254,0.60,0.92,2,258,5,0,0,sales,low,sales,low,0.032835,0.278603
4959,1612,0.43,0.55,4,134,3,0,0,marketing,low,marketing,low,0.015902,0.278603
6841,11513,0.98,0.49,3,199,10,0,0,technical,medium,technical,medium,0.137832,-0.168074
8516,4034,0.26,0.81,5,139,4,0,0,product_mng,medium,product_mng,medium,-0.133397,-0.168074


In [18]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import precision_score, recall_score
from sklearn.pipeline import make_pipeline

In [19]:
pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore', sparse=False),
                        StandardScaler(with_mean=False),
                        LinearDiscriminantAnalysis(),
                        verbose=3)

In [20]:
Xt = Xt[predictors]
Xv = Xv[predictors]

In [21]:
pipeline.fit(Xt, yt)

[Pipeline] ..... (step 1 of 3) Processing onehotencoder, total=   0.0s
[Pipeline] .... (step 2 of 3) Processing standardscaler, total=   0.1s
[Pipeline]  (step 3 of 3) Processing lineardiscriminantanalysis, total=   0.8s


Pipeline(steps=[('onehotencoder',
                 OneHotEncoder(handle_unknown='ignore', sparse=False)),
                ('standardscaler', StandardScaler(with_mean=False)),
                ('lineardiscriminantanalysis', LinearDiscriminantAnalysis())],
         verbose=3)

In [22]:
precision_score(yt, pipeline.predict(Xt))

0.903169014084507

In [23]:
recall_score(yt, pipeline.predict(Xt))

0.8902386117136659

In [24]:
precision_score(yv, pipeline.predict(Xv))

0.9023569023569024

In [25]:
recall_score(yv, pipeline.predict(Xv))

0.9209621993127147

In [26]:
test = pd.read_csv(data_path + 'HR_test.csv', sep=',')

In [27]:
test = utilities.rename_variables(test, var_c, 'c_')
test = utilities.rename_variables(test, var_v, 'v_')

In [28]:
test = woe_norms(test, var_v_n, var_v_n, norms, woes)

In [30]:
test['left'] = pipeline.predict(test[predictors])

In [31]:
test[['id', 'left']].to_csv('PicoLaraAlbertoIsaac_HR.csv', index=False)