In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# pd.set_option('display.max_columns', 1000)
# pd.set_option('display.max_rows', 400)
# Apply seaborn's default plot styling.
sns.set()

# Move the working directory up one level; the relative `data/...` paths defined
# in the following cells are resolved against it.
# NOTE(review): not idempotent -- re-running this cell climbs one more level.
os.chdir('..')

In [51]:
# Let pandas display up to 1000 columns so wide frames are not truncated.
pd.set_option('display.max_columns', 1000)

In [2]:
# Data directories, relative to the current working directory.
DATA = Path('data')
# Input CSVs are read from here.
RAW  = DATA/'raw'
# The final submission CSV is written here.
SUBMISSIONS = DATA/'submissions'

In [3]:
# Load the competition files: labelled training rows, unlabelled test rows and
# the sample submission.
train      = pd.read_csv(RAW/'train_jqd04QH.csv', low_memory=False)
test       = pd.read_csv(RAW/'test_GYi4Gz5.csv', low_memory=False)
submission = pd.read_csv(RAW/'sample_submission_sxfcbdx.csv', low_memory=False)

In [33]:
# Stack the train and test rows into a single frame.
# NOTE(review): `df` is not referenced by any later cell visible in this
# notebook; the modelling cells build their own `comb` frame instead.
df = pd.concat([train, test])

In [34]:
# Label-encode the given columns of a dataframe (in place)
def label_encode_df(dataframe, cols):
    """Replace each listed column of ``dataframe`` with integer label codes.

    Every column is cast to ``str`` first and encoded with its own freshly
    fitted ``LabelEncoder``. The frame is modified in place and nothing is
    returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are overwritten.
    cols : iterable of str
        Names of the columns to encode.
    """
    for name in cols:
        as_text = dataframe[name].astype(str)
        dataframe[name] = LabelEncoder().fit_transform(as_text)

In [35]:
def target_encode_mean(df_fit, df_transform, col, target):
    """Attach the per-category mean of ``target`` as a new column.

    The mean of ``target`` for each value of ``col`` is computed on ``df_fit``
    and left-joined onto ``df_transform`` under the name
    ``"<col>_<target>_mean"``. Categories that have no mean in ``df_fit`` get
    NaN.

    Parameters
    ----------
    df_fit : pandas.DataFrame
        Rows used to compute the category means.
    df_transform : pandas.DataFrame
        Rows that receive the encoded column; not modified.
    col : str
        Name of the categorical column to group and join on.
    target : str
        Name of the column being averaged.

    Returns
    -------
    pandas.DataFrame
        The merge result: ``df_transform`` plus the new mean column.
    """
    encoded_name = col + "_" + target + "_mean"
    category_means = (
        df_fit.groupby([col])[target]
        .mean()
        .rename(encoded_name)
        .reset_index()
    )
    return df_transform.merge(category_means, how="left", on=[col])

In [46]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from catboost import CatBoostClassifier


In [38]:
# Assign every training row to one of five cross-validation folds, numbered 1..5.
train["fold"] = 0
fold_col = train.columns.get_loc("fold")
fold_splits = KFold(n_splits=5, shuffle=True, random_state=5).split(train)
for fold_id, (_, valid_pos) in enumerate(fold_splits, start=1):
    # KFold yields *positional* row indices, so write through .iloc; the
    # label-based .loc is only equivalent while `train` has a default RangeIndex.
    train.iloc[valid_pos, fold_col] = fold_id

In [40]:
# Table that collects the out-of-fold predictions of each first-level model,
# seeded with the row id, its fold number and the true label.
stack = train[["enrollee_id", "fold", "target"]].copy()

In [42]:
stack.head()

Unnamed: 0,enrollee_id,fold,target
0,23798,1,0
1,29166,3,0
2,46,3,0
3,18527,5,0
4,21751,3,1


In [43]:
# Configuration of the first-level CatBoost model.
# "cat_vars" is a count: the first `cat_vars` entries of "model_vars" are
# treated as categorical features by the training cells.
model1 = {
    "model_name": "CatBoost1",
    "n_estimators": 540,
    "model_vars": [
        'city',
        'gender',
        'relevent_experience',
        'enrolled_university',
        'education_level',
        'major_discipline',
        'experience',
        'company_size',
        'company_type',
        'last_new_job',
        'training_hours',
        'NA_type',
    ],
    "cat_vars": 12,
    "seed": 30,
}
models = [model1]

In [44]:
models

[{'cat_vars': 12,
  'model_name': 'CatBoost1',
  'model_vars': ['city',
   'gender',
   'relevent_experience',
   'enrolled_university',
   'education_level',
   'major_discipline',
   'experience',
   'company_size',
   'company_type',
   'last_new_job',
   'training_hours',
   'NA_type'],
  'n_estimators': 540,
  'seed': 30}]

In [59]:
train.shape

(18359, 15)

In [54]:
train.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target,fold
0,23798,city_149,0.689,Male,Has relevent experience,no_enrollment,Graduate,STEM,3,100-500,Pvt Ltd,1,106,0,1
1,29166,city_83,0.923,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,<10,Funded Startup,1,69,0,3
2,46,city_16,0.91,,Has relevent experience,no_enrollment,Graduate,STEM,6,50-99,Public Sector,2,4,0,3
3,18527,city_64,0.666,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,50-99,Pvt Ltd,1,26,0,5
4,21751,city_100,0.887,,No relevent experience,no_enrollment,Masters,STEM,8,,,2,88,1,3


In [47]:
# Out-of-fold CatBoost predictions: for each configured model, train on four
# folds and score the held-out fold, writing the probabilities into `stack`.
for model in models:
    # Float placeholder: the per-fold probabilities written below are floats,
    # and filling an integer column through .loc forces an implicit upcast.
    stack[model["model_name"]] = 0.0
    comb = pd.concat([train, test])
    comb.reset_index(inplace=True, drop=True)
    # After the reset, rows [0, len_train) of `comb` are the training rows and
    # the remaining rows are the test rows.
    len_train = len(train)

    NA_cols = ["company_size", "company_type", "education_level",
               "enrolled_university", "experience", "gender",
               "last_new_job", "major_discipline"]
    for col in NA_cols:
        comb["isna_"+col] = comb[col].isna().astype(int)
    # NA_type joins the eight missing-value flags into one pattern string
    # (e.g. "_0_1_0_..."), one distinct value per missingness pattern.
    comb["NA_type"] = ''
    for col in NA_cols:
        comb["NA_type"] = comb["NA_type"].astype(str) + "_" + \
                          comb[col].isna().astype(int).astype(str)

    # The first `cat_vars` model variables are the categorical ones: they are
    # label-encoded, then cast to strings before being handed to CatBoost.
    cat_cols = model["model_vars"][0:model["cat_vars"]]
    label_encode_df(comb, cat_cols)
    for col in cat_cols:
        comb[col] = comb[col].astype(str)

    for i in range(1,6):
        print("Running Model " + model["model_name"] + " for fold " + str(i))
        # Tag every row: test rows first, then the current fold overrides
        # "train" with "valid" (test rows have no fold value).
        comb["dataset"] = "train"
        comb.loc[len_train:, "dataset"] = "test"
        comb.loc[comb.fold==i, "dataset"] = "valid"

        y = comb.loc[comb.dataset=="train", "target"].values
        y_test = comb.loc[comb.dataset=="valid", "target"].values
        x = comb.loc[comb.dataset=="train", model["model_vars"]].values
        x_test = comb.loc[comb.dataset=="valid", model["model_vars"]].values
        cat_model = CatBoostClassifier(eval_metric="AUC",
                                       n_estimators=model["n_estimators"],
                                       random_state=model["seed"])
        cat_model.fit(x, y, cat_features=list(range(0, model["cat_vars"])),
                      verbose=False)
        # Positive-class probability for the held-out fold.
        stack.loc[stack.fold==i, model["model_name"]] = \
            cat_model.predict_proba(x_test)[:,1]


Running Model CatBoost1 for fold 1
Running Model CatBoost1 for fold 2
Running Model CatBoost1 for fold 3
Running Model CatBoost1 for fold 4
Running Model CatBoost1 for fold 5


In [58]:
x.shape

(14688, 12)

In [48]:
stack.head()

Unnamed: 0,enrollee_id,fold,target,CatBoost1
0,23798,1,0,0.066563
1,29166,3,0,0.074494
2,46,3,0,0.101926
3,18527,5,0,0.068228
4,21751,3,1,0.184686


In [52]:
comb.head()

Unnamed: 0,city,city_development_index,company_size,company_type,education_level,enrolled_university,enrollee_id,experience,fold,gender,last_new_job,major_discipline,relevent_experience,target,training_hours,isna_company_size,isna_company_type,isna_education_level,isna_enrolled_university,isna_experience,isna_gender,isna_last_new_job,isna_major_discipline,NA_type,dataset
0,41,0.689,1,5,0,3,23798,13,1.0,1,0,5,0,0.0,8,0,0,0,0,0,0,0,0,0,train
1,112,0.923,7,1,0,3,29166,5,3.0,1,0,5,0,0.0,208,0,0,0,0,0,0,0,0,0,train
2,48,0.91,4,4,0,3,46,16,3.0,3,1,5,0,0.0,176,0,0,0,0,0,1,0,0,4,train
3,93,0.666,4,5,0,3,18527,5,5.0,1,0,5,0,0.0,125,0,0,0,0,0,0,0,0,0,valid
4,2,0.887,8,6,2,3,21751,18,3.0,3,1,5,1,1.0,229,1,1,0,0,0,1,0,0,86,train


In [61]:
# Refit each CatBoost model on all training rows and score the test rows.
stack_test = pd.DataFrame()
stack_test["enrollee_id"] = test.enrollee_id
for model in models:
    comb = pd.concat([train, test])
    comb.reset_index(inplace=True, drop=True)
    # After the reset, rows [0, len_train) of `comb` are the training rows and
    # the remaining rows are the test rows.
    len_train = len(train)

    NA_cols = ["company_size", "company_type", "education_level",
               "enrolled_university", "experience", "gender",
               "last_new_job", "major_discipline"]
    for col in NA_cols:
        comb["isna_"+col] = comb[col].isna().astype(int)
    # NA_type joins the eight missing-value flags into one pattern string.
    comb["NA_type"] = ''
    for col in NA_cols:
        comb["NA_type"] = comb["NA_type"].astype(str) + "_" + \
                          comb[col].isna().astype(int).astype(str)

    # The first `cat_vars` model variables are the categorical ones: they are
    # label-encoded, then cast to strings before being handed to CatBoost.
    cat_cols = model["model_vars"][0:model["cat_vars"]]
    label_encode_df(comb, cat_cols)
    for col in cat_cols:
        comb[col] = comb[col].astype(str)

    print("Running Model " + model["model_name"] + " on the test data")
    comb["dataset"] = "train"
    comb.loc[len_train:, "dataset"] = "test"

    y = comb.loc[comb.dataset=="train", "target"].values
    x = comb.loc[comb.dataset=="train", model["model_vars"]].values
    cat_model = CatBoostClassifier(eval_metric="AUC",
                                   n_estimators=model["n_estimators"],
                                   random_state=model["seed"])
    cat_model.fit(x, y, cat_features=list(range(0, model["cat_vars"])),
                  verbose=False)
    # The whole column is assigned at once, so no placeholder is needed.
    stack_test[model["model_name"]] = cat_model.predict_proba(
        comb.loc[comb.dataset=="test", model["model_vars"]].values)[:,1]
    

Running Model CatBoost1 on the test data


In [62]:
stack_test.head()

Unnamed: 0,enrollee_id,CatBoost1
0,16548,0.387878
1,12036,0.070722
2,11061,0.4147
3,5032,0.073176
4,17599,0.084848


## LightGBM

In [63]:
# Float placeholder for the out-of-fold LightGBM predictions: the values written
# per fold are floats, and filling an integer column through .loc forces an
# implicit upcast.
stack["lgb_model"] = 0.0

# Feature columns passed to LightGBM (order is preserved when the matrix is built).
lgb_vars = [
    'city', 'city_development_index', 'company_size', 'company_type',
    'education_level', 'enrolled_university', 'experience',
    'gender', 'last_new_job', 'major_discipline', 'relevent_experience',
    'training_hours',
    'city_target_mean',
    'enrolled_university_target_mean',
    'education_level_target_mean', 'major_discipline_target_mean',
    'experience_target_mean',
    'last_new_job_target_mean',
    'NA_type_target_mean',
]

# Columns that receive a target-mean encoding and are then label-encoded.
# Only seven of the resulting `*_target_mean` columns are listed in `lgb_vars`.
cat_vars = [
    'city', 'company_size', 'company_type',
    'education_level', 'enrolled_university',
    'gender', 'last_new_job', 'major_discipline', 'relevent_experience',
    'experience', 'NA_type', 'training_hours',
]

In [67]:
# Out-of-fold LightGBM predictions: for each fold, fit the target encoding and
# the model on the other four folds and score the held-out fold into `stack`.
for i in range(1,6):
    print("Running Model LGBM for fold"+str(i))
    # `comb` is rebuilt every fold because the target-mean columns added below
    # depend on which rows are currently in the training split.
    comb = pd.concat([train, test])
    comb.reset_index(inplace=True, drop=True)
    # After the reset, rows [0, len_train) of `comb` are the training rows and
    # the remaining rows are the test rows.
    len_train = len(train)

    NA_cols = ["company_size", "company_type", "education_level",
               "enrolled_university", "experience", "gender",
               "last_new_job", "major_discipline"]
    for col in NA_cols:
        comb["isna_"+col] = comb[col].isna().astype(int)
    # NA_type joins the eight missing-value flags into one pattern string.
    comb["NA_type"] = ''
    for col in NA_cols:
        comb["NA_type"] = comb["NA_type"].astype(str) + "_" + \
                          comb[col].isna().astype(int).astype(str)

    comb["dataset"] = "train"
    comb.loc[len_train:, "dataset"] = "test"
    comb.loc[comb.fold==i, "dataset"] = "valid"

    # Category means are computed on the training split only, then joined onto
    # every row (train, valid and test).
    for col in cat_vars:
        comb = target_encode_mean(comb[comb.dataset=="train"], comb, col, "target")
    label_encode_df(comb, cat_vars)

    y = comb.loc[comb.dataset=="train", "target"].values
    y_test = comb.loc[comb.dataset=="valid", "target"].values
    x = comb.loc[comb.dataset=="train", lgb_vars].values
    x_test = comb.loc[comb.dataset=="valid", lgb_vars].values
    lgbtrain = lgb.Dataset(x, y, feature_name=lgb_vars, free_raw_data=True)

    #LGB Model
    parameters = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting': 'gbdt',
        'num_leaves': 200,
        'feature_fraction': 0.65,
        # NOTE(review): with bagging_freq commented out, bagging_fraction is
        # probably inert -- confirm against the LightGBM parameter docs.
        'bagging_fraction': 0.85,
        #'bagging_freq': 20,
        'learning_rate': 0.1,
        'verbose': 1,
        'max_bin' :10,
        'min_gain_to_split' : 1,
        'seed': 0
    }
    lgb_model = lgb.train(parameters, lgbtrain, num_boost_round=20)
    # Predicted probability for the held-out fold.
    stack.loc[stack.fold==i, "lgb_model"] = lgb_model.predict(x_test)

Running Model LGBM for fold1
Running Model LGBM for fold2
Running Model LGBM for fold3
Running Model LGBM for fold4
Running Model LGBM for fold5


In [68]:
x.shape

(14688, 19)

In [69]:
comb.head()

Unnamed: 0,city,city_development_index,company_size,company_type,education_level,enrolled_university,enrollee_id,experience,fold,gender,last_new_job,major_discipline,relevent_experience,target,training_hours,isna_company_size,isna_company_type,isna_education_level,isna_enrolled_university,isna_experience,isna_gender,isna_last_new_job,isna_major_discipline,NA_type,dataset,city_target_mean,company_size_target_mean,company_type_target_mean,education_level_target_mean,enrolled_university_target_mean,gender_target_mean,last_new_job_target_mean,major_discipline_target_mean,relevent_experience_target_mean,experience_target_mean,NA_type_target_mean,training_hours_target_mean
0,41,0.689,1,5,0,3,23798,13,1.0,1,0,5,0,0.0,8,0,0,0,0,0,0,0,0,0,train,0.126582,0.101108,0.10642,0.140388,0.115416,0.122041,0.13552,0.136322,0.11692,0.167203,0.106223,0.12
1,112,0.923,7,1,0,3,29166,5,3.0,1,0,5,0,0.0,208,0,0,0,0,0,0,0,0,0,train,0.064815,0.105215,0.097171,0.140388,0.115416,0.122041,0.13552,0.136322,0.11692,0.125773,0.106223,0.072727
2,48,0.91,4,4,0,3,46,16,3.0,3,1,5,0,0.0,176,0,0,0,0,0,1,0,0,4,train,0.088806,0.09518,0.149199,0.140388,0.115416,,0.129089,0.136322,0.11692,0.124724,0.124191,0.108571
3,93,0.666,4,5,0,3,18527,5,5.0,1,0,5,0,0.0,125,0,0,0,0,0,0,0,0,0,valid,0.086022,0.09518,0.10642,0.140388,0.115416,0.122041,0.13552,0.136322,0.11692,0.125773,0.106223,0.125628
4,2,0.887,8,6,2,3,21751,18,3.0,3,1,5,1,1.0,229,1,1,0,0,0,1,0,0,86,train,0.156682,,,0.119559,0.115416,,0.129089,0.136322,0.168358,0.11414,0.246888,0.168831


In [70]:
stack.head()

Unnamed: 0,enrollee_id,fold,target,CatBoost1,lgb_model
0,23798,1,0,0.066563,0.131441
1,29166,3,0,0.074494,0.116256
2,46,3,0,0.101926,0.152743
3,18527,5,0,0.068228,0.132251
4,21751,3,1,0.184686,0.321977


In [71]:
# Refit LightGBM on all training rows and score the test rows.
print("Running LightGBM on the test data")
comb = pd.concat([train, test])
comb.reset_index(inplace=True, drop=True)
# After the reset, rows [0, len_train) of `comb` are the training rows and the
# remaining rows are the test rows.
len_train = len(train)

NA_cols = ["company_size", "company_type", "education_level",
           "enrolled_university", "experience", "gender",
           "last_new_job", "major_discipline"]
for col in NA_cols:
    comb["isna_"+col] = comb[col].isna().astype(int)
# NA_type joins the eight missing-value flags into one pattern string.
comb["NA_type"] = ''
for col in NA_cols:
    comb["NA_type"] = comb["NA_type"].astype(str) + "_" + \
                      comb[col].isna().astype(int).astype(str)

# Only "train" and "test" exist here: no fold is held out for the final fit,
# so no validation matrices are built.
comb["dataset"] = "train"
comb.loc[len_train:, "dataset"] = "test"

# Category means are computed on all training rows, then joined onto every row.
for col in cat_vars:
    comb = target_encode_mean(comb[comb.dataset=="train"], comb, col, "target")
label_encode_df(comb, cat_vars)

y = comb.loc[comb.dataset=="train", "target"].values
x = comb.loc[comb.dataset=="train", lgb_vars].values
lgbtrain = lgb.Dataset(x, y, feature_name=lgb_vars, free_raw_data=True)
parameters = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting': 'gbdt',
    'num_leaves': 200,
    'feature_fraction': 0.65,
    # NOTE(review): with bagging_freq commented out, bagging_fraction is
    # probably inert -- confirm against the LightGBM parameter docs.
    'bagging_fraction': 0.85,
    #'bagging_freq': 20,
    'learning_rate': 0.1,
    'verbose': 1,
    'max_bin' :10,
    'min_gain_to_split' : 1,
    'seed': 0
}
lgb_model = lgb.train(parameters, lgbtrain, num_boost_round=20)
# The whole column is assigned at once, so no placeholder is needed.
stack_test["lgb_model"] = lgb_model.predict(
    comb.loc[comb.dataset=="test", lgb_vars].values)

Running LightGBM on the test data


In [73]:
stack_test.head()

Unnamed: 0,enrollee_id,CatBoost1,lgb_model
0,16548,0.387878,0.425284
1,12036,0.070722,0.119171
2,11061,0.4147,0.331963
3,5032,0.073176,0.125162
4,17599,0.084848,0.153965


In [76]:
stack.head()

Unnamed: 0,enrollee_id,fold,target,CatBoost1,lgb_model
0,23798,1,0,0.066563,0.131441
1,29166,3,0,0.074494,0.116256
2,46,3,0,0.101926,0.152743
3,18527,5,0,0.068228,0.132251
4,21751,3,1,0.184686,0.321977


In [75]:
from sklearn.linear_model import LogisticRegression

In [77]:
# Second-level model: a logistic regression fitted on the out-of-fold
# predictions of the first-level models, then applied to their test predictions.
meta_features = ["CatBoost1", "lgb_model"]
lr_model = LogisticRegression()
lr_model.fit(X=stack[meta_features], y=stack.target)
stack_test["target"] = lr_model.predict_proba(X=stack_test[meta_features])[:,1]

# to_csv does not create missing directories, so make sure the output folder exists.
SUBMISSIONS.mkdir(parents=True, exist_ok=True)
stack_test[["enrollee_id", "target"]].to_csv(SUBMISSIONS/"sub_final.csv", index=False)

In [78]:
stack_test.head()

Unnamed: 0,enrollee_id,CatBoost1,lgb_model,target
0,16548,0.387878,0.425284,0.435489
1,12036,0.070722,0.119171,0.084919
2,11061,0.4147,0.331963,0.397773
3,5032,0.073176,0.125162,0.087005
4,17599,0.084848,0.153965,0.097648


In [80]:
lr_model.intercept_, lr_model.coef_

(array([-2.99235753]), array([[3.97035058, 2.80483563]]))