In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
# Read the training frame, the test frame and the sample submission
# from CSV files in the current working directory.
train, test, sample_sub = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# (rows, columns) of the training frame and of the test frame, shown as a tuple.
train.shape, test.shape

In [None]:
# Number of missing values in each column of the training frame.
train.isna().sum()

In [None]:
pip install scikit-learn==0.20.4

In [None]:
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments, does nothing."""
    return None


# Replace the real ``warnings.warn`` with the stub above, so every later call
# made through ``warnings.warn`` is silently discarded.
warnings.warn = warn

from lightgbm import LGBMClassifier
import lightgbm as lgbm

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [20]:
original_data = (
    pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
    # 'EmployeeNumber' is renamed to 'id'
    .rename(columns={"EmployeeNumber": "id"})
    # 'Attrition': 'Yes' becomes 1, every other value becomes 0
    .assign(Attrition=lambda frame: (frame["Attrition"] == "Yes").astype(int))
    # keep exactly the columns of train, in train's order
    # (evaluated before 'source' is added to train below)
    .loc[:, train.columns.tolist()]
    # tag the rows with where they came from
    .assign(source="original")
)

train["source"] = "synthetic"
test["source"] = "synthetic"

# Stack the original rows under the synthetic ones and renumber the index 0..n-1
train = pd.concat([train, original_data], ignore_index=True)

In [21]:
# Column labels of the training frame as it stands at this point in the notebook.
train.columns

Index(['id', 'Age', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'Attrition', 'source'],
      dtype='object')

In [23]:
# Name of the label column
target = "Attrition"

# Every column of train except the row identifier and the label
features = train.columns.drop(["id", target]).tolist()

In [24]:
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns, one ``LabelEncoder`` per column.

    The per-column encoders are learned in ``fit`` and reused by every later
    ``transform`` call, so a given value maps to the same integer in every
    frame that is transformed (e.g. train and test).

    Parameters
    ----------
    columns : list of str, optional
        Names of the columns to encode. ``None`` means every column of the
        frame.

    Attributes
    ----------
    encoders_ : dict or None
        Maps column name to its fitted ``LabelEncoder``; ``None`` until
        ``fit`` has run.
    """

    def __init__(self, columns=None):
        self.columns = columns  # array of column names to encode
        self.encoders_ = None  # filled in by fit()

    def _target_columns(self, X):
        """Return the list of column names of X that should be encoded."""
        return list(X.columns) if self.columns is None else list(self.columns)

    def fit(self, X, y=None):
        """
        Learn one LabelEncoder per target column of X.

        ``y`` is accepted for API compatibility and ignored. Returns ``self``.
        """
        self.encoders_ = {
            col: LabelEncoder().fit(X[col]) for col in self._target_columns(X)
        }
        return self

    def transform(self, X):
        """
        Transforms columns of X specified in self.columns using the
        LabelEncoder fitted for each of them. If no columns specified,
        transforms all columns in X.

        Returns an encoded copy; X itself is not modified.

        Raises ValueError if a target column has no fitted encoder; the
        underlying LabelEncoder raises ValueError for values not seen in fit.
        """
        if getattr(self, "encoders_", None) is None:
            # Backward compatibility: earlier versions allowed transform()
            # without a prior fit(); in that case learn the mapping from X.
            self.fit(X)

        output = X.copy()

        for col in self._target_columns(output):
            if col not in self.encoders_:
                raise ValueError(
                    "Column %r was not seen during fit; cannot encode it." % (col,)
                )
            output[col] = self.encoders_[col].transform(output[col])

        return output

    def fit_transform(self, X, y=None):
        """Fit the encoders on X and return the encoded copy of X."""
        return self.fit(X, y).transform(X)

In [25]:
# Names of the columns of train whose dtype is "object"
columns_with_strings_as_values = [
    name for name, dtype in train.dtypes.items() if dtype == "object"
]
columns_with_strings_as_values

['BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'Over18',
 'OverTime',
 'source']

In [26]:
# Replace the string-valued columns of both frames with integer codes.
label_encoder = MultiColumnLabelEncoder(columns=columns_with_strings_as_values)
label_encoder.fit(train)
train = label_encoder.transform(train)
test = label_encoder.transform(test)

In [33]:
# 10-fold stratified cross-validation: one LightGBM model per fold, scored by
# ROC AUC on the held-out part. Models and scores are kept for later cells.
clfs = []
scores = []

X_all = train[features]
y_all = train[target]

# Positions (within `features`) of the label-encoded string columns, derived
# from the column names instead of hard-coded so they follow the frame layout.
categorical_idx = [
    pos for pos, col in enumerate(features) if col in columns_with_strings_as_values
]

kf = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
for train_index, val_index in kf.split(X_all, y=y_all):
    # split() yields positional indices, so select rows by position (.iloc)
    # rather than by index label.
    X_train, X_val = X_all.iloc[train_index], X_all.iloc[val_index]
    y_train, y_val = y_all.iloc[train_index], y_all.iloc[val_index]

    clf = LGBMClassifier(
        n_estimators=150,
        categorical_feature=categorical_idx,
        metric="auc",
    )
    # Train, evaluate and predict on plain arrays throughout, so that the
    # integer categorical indices mean the same thing for every dataset.
    clf.fit(X_train.values, y_train, eval_set=[(X_val.values, y_val)])
    preds = clf.predict_proba(X_val.values)

    clfs.append(clf)
    # Column 1 of predict_proba is the probability of the positive class
    scores.append(roc_auc_score(y_val, preds[:, 1]))

print("Mean score across all folds:", np.mean(scores))

[LightGBM] [Info] Number of positive: 393, number of negative: 2439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001071 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1152
[LightGBM] [Info] Number of data points in the train set: 2832, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.138771 -> initscore=-1.825534
[LightGBM] [Info] Start training from score -1.825534
[LightGBM] [Info] Number of positive: 393, number of negative: 2439
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000378 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1150
[LightGBM] [Info] Number of data points in the train set: 2832, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.138771 -> initscore=-1.825534
[LightGBM]

In [34]:
# Relative feature importances of `clf` (the model left over from the last
# fold of the loop above), largest first, as a fraction of the total.
importances = clf.feature_importances_
importance_total = importances.sum()
for feature_pos in importances.argsort()[::-1]:
    print(features[feature_pos], importances[feature_pos] / importance_total)

MonthlyIncome 0.09866666666666667
DailyRate 0.08177777777777778
MonthlyRate 0.0791111111111111
HourlyRate 0.074
Age 0.06844444444444445
DistanceFromHome 0.06333333333333334
PercentSalaryHike 0.03977777777777778
YearsAtCompany 0.034666666666666665
RelationshipSatisfaction 0.03311111111111111
EnvironmentSatisfaction 0.03244444444444444
TotalWorkingYears 0.03222222222222222
NumCompaniesWorked 0.03088888888888889
TrainingTimesLastYear 0.029777777777777778
JobSatisfaction 0.029111111111111112
JobInvolvement 0.028444444444444446
YearsSinceLastPromotion 0.026
YearsWithCurrManager 0.026
OverTime 0.025777777777777778
BusinessTravel 0.023777777777777776
StockOptionLevel 0.021333333333333333
YearsInCurrentRole 0.021333333333333333
WorkLifeBalance 0.01911111111111111
Education 0.014222222222222223
JobRole 0.012222222222222223
MaritalStatus 0.011777777777777778
source 0.011777777777777778
Department 0.011111111111111112
Gender 0.010444444444444444
JobLevel 0.004888888888888889
EducationField 0.0044