In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as matplot
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report

In [17]:
file_model = 'hr_train'
file_score = 'hr_validation'
ohe_cols = ['sales', 'salary']

df = pd.read_csv(file_model + '.csv', header=0)
ID = df.iloc[:,0]
y = df.iloc[:,1]
X = df.iloc[:,2:]

pd.DataFrame(df).head()

Unnamed: 0,index,left,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales,salary
0,10438,0,0.53,0.52,2,135,4,0,0,technical,medium
1,9236,0,0.77,0.53,5,256,3,0,0,accounting,medium
2,818,1,0.89,0.79,3,149,2,0,0,support,medium
3,11503,0,0.64,0.63,3,156,6,1,0,support,low
4,11721,0,0.98,0.74,4,151,3,0,0,sales,medium


In [18]:
# preprocessing-1: one-hot encoding
X_ohe = pd.get_dummies(X, dummy_na=True, columns=ohe_cols)
X_ohe = X_ohe.dropna(axis=1, how='all')
X_ohe_columns = X_ohe.columns.values

# preprocessing-2: null imputation
imp = SimpleImputer()
imp.fit(X_ohe)
X_ohe = pd.DataFrame(imp.transform(X_ohe), columns=X_ohe_columns)
print(X_ohe.shape)

# preprocessing-3: feature selection
selector = RFECV(estimator=RandomForestClassifier(n_estimators=100,random_state=0), step=0.05)
selector.fit(X_ohe, y)
X_ohe_selected = selector.transform(X_ohe)
X_ohe_selected = pd.DataFrame(X_ohe_selected, columns=X_ohe_columns[selector.support_])
print(X_ohe_selected.shape)
X_ohe_selected.head()

# preprocessing-4: preprocessing of a score data along with a model dataset
#if len(file_score)>0:
# load score data
dfs = pd.read_csv('./data/'+ file_score + '.csv', header=0)
IDs = dfs.iloc[:,0]
Xs = dfs.iloc[:,2:]
Xs_ohe = pd.get_dummies(Xs, dummy_na=True, columns=ohe_cols)
cols_m = pd.DataFrame(None, columns=X_ohe_columns, dtype=float)

# consistent with columns set
Xs_exp = pd.concat([cols_m, Xs_ohe])
Xs_exp.loc[:,list(set(X_ohe_columns)-set(Xs_ohe.columns.values))] = \
    Xs_exp.loc[:,list(set(X_ohe_columns)-set(Xs_ohe.columns.values))].fillna(0, axis=1)
Xs_exp = Xs_exp.drop(list(set(Xs_ohe.columns.values)-set(X_ohe_columns)), axis=1)

# re-order the score data columns
Xs_exp = Xs_exp.reindex_axis(X_ohe_columns, axis=1)
Xs_exp = pd.DataFrame(imp.transform(Xs_exp), columns=X_ohe_columns)
Xs_exp_selected = Xs_exp.loc[:, X_ohe_columns[selector.support_]]


(10499, 22)




(10499, 10)




In [19]:
pipelines = {
    'knn':
        Pipeline([('scl',StandardScaler()),
                  ('est',KNeighborsClassifier())]),
    'logistic':
        Pipeline([('scl',StandardScaler()),
                  ('est',LogisticRegression(random_state=1))]),
    'rsvc':
        Pipeline([('scl',StandardScaler()),
                  ('est',SVC(C=1.0, kernel='rbf', class_weight='balanced', random_state=1, probability=True))]),
    'gb':
        Pipeline([('scl',StandardScaler()),
                  ('est',GradientBoostingClassifier(random_state=1))]),
    'mlp':
        Pipeline([('scl',StandardScaler()),
                  ('est',MLPClassifier(hidden_layer_sizes=(3,3),
                                       max_iter=1000,
                                       random_state=1))])
}


# fit & evaluation
scores = {}
for pipe_name, pipeline in pipelines.items():
    pipeline.fit(X_ohe_selected, y)
#    scores[(pipe_name,'train')] = cross_val_score(pipeline, X_ohe_selected, y, scoring='roc_auc', cv=5)
    scores[(pipe_name,'train_acc')] = accuracy_score(y, pipeline.predict(X_ohe_selected))
    scores[(pipe_name,'train_f1')] = f1_score(y, pipeline.predict(X_ohe_selected))
#    scores[(pipe_name,'train_precision')] = precision_score(y, pipeline.predict(X_ohe_selected))
#    scores[(pipe_name,'test_smt_f1')] = f1_score(y_test, pipeline.predict(X_test))
    
#print(classification_report(y_train, pipeline.predict(X_train)))
#print(classification_report(y_test, pipeline.predict(X_test)))
display(pd.Series(scores).unstack())


for pipe_name, pipeline in pipelines.items():
    results = cross_val_score(pipeline, X_ohe_selected, y, scoring='roc_auc', cv=5)
    print(pipe_name)
    print('cv score:', np.average(results), '+-', np.std(results))




Unnamed: 0,train_acc,train_f1
gb,0.977141,0.951593
knn,0.969426,0.937194
logistic,0.785789,0.434498
mlp,0.956186,0.90952
rsvc,0.952186,0.903942


knn
cv score: 0.9731249141029954 +- 0.004343477677510745
logistic
cv score: 0.815660789011113 +- 0.007185611139875757




rsvc
cv score: 0.9765949241910162 +- 0.004250485387867907
gb
cv score: 0.9878095582508525 +- 0.0025311891874976603
mlp
cv score: 0.9703749146458092 +- 0.004244116453006169


In [20]:
model_name = 'BESTFIT_kari'
clf = Pipeline([('scl',StandardScaler()),
                  ('est',GradientBoostingClassifier(random_state=1))])

confirm_score={}
clf.fit(X_ohe_selected, y)
#confirm_score[(pipe_name,'train')] = cross_val_score(pipeline, X_ohe_selected, y, scoring='roc_auc', cv=5)
confirm_score[(pipe_name,'train_acc')] = accuracy_score(y, pipeline.predict(X_ohe_selected))
confirm_score[(pipe_name,'train_f1')] = f1_score(y, pipeline.predict(X_ohe_selected))
#confirm_score[(pipe_name,'train_precision')] = precision_score(y, pipeline.predict(X_ohe_selected))
display(pd.Series(confirm_score).unstack())

joblib.dump(clf, './model/'+ model_name + '.pkl')

Unnamed: 0,train_acc,train_f1
mlp,0.956186,0.90952


['./model/BESTFIT_kari.pkl']

In [22]:
score_train = pd.DataFrame(clf.predict_proba(X_ohe_selected)[:,])
train_confirm = pd.concat([ID, score_train, y])
display(train_confirm)

Unnamed: 0,0,1
0,10438.0,
1,9236.0,
2,818.0,
3,11503.0,
4,11721.0,
5,5276.0,
6,6863.0,
7,13462.0,
8,11975.0,
9,11461.0,


In [24]:
score = pd.DataFrame(clf.predict_proba(Xs_exp_selected)[:,1])
#display(score)
display(pd.concat([IDs, score], axis=1))

final = pd.concat([IDs, score], axis=1)
final.to_csv('aijc3104_kari.csv', header=True, index=False)

Unnamed: 0,index,0
0,10438,0.068076
1,9236,0.016306
2,818,0.012831
3,11503,0.023180
4,11721,0.003729
5,5276,0.014804
6,6863,0.014177
7,13462,0.032171
8,11975,0.006745
9,11461,0.012239
