In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in file_names]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hackerearth-employee-burnout-challenge/sample_submission.csv
/kaggle/input/hackerearth-employee-burnout-challenge/train.csv
/kaggle/input/hackerearth-employee-burnout-challenge/test.csv


In [2]:
# Path of the competition's train.csv, relative to the notebook's working directory.
TRAIN_CSV = '../input/hackerearth-employee-burnout-challenge/train.csv'
data = pd.read_csv(TRAIN_CSV)

In [3]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,fffe32003000360033003200,2008-09-30,Female,Service,No,2.0,3.0,3.8,0.16
1,fffe3700360033003500,2008-11-30,Male,Service,Yes,1.0,2.0,5.0,0.36
2,fffe31003300320037003900,2008-03-10,Female,Product,Yes,2.0,,5.8,0.49
3,fffe32003400380032003900,2008-11-03,Male,Service,Yes,1.0,1.0,2.6,0.2
4,fffe31003900340031003600,2008-07-24,Female,Service,No,3.0,7.0,6.9,0.52


In [4]:
# Summarise column dtypes and non-null counts to spot incomplete columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22750 entries, 0 to 22749
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           22750 non-null  object 
 1   Date of Joining       22750 non-null  object 
 2   Gender                22750 non-null  object 
 3   Company Type          22750 non-null  object 
 4   WFH Setup Available   22750 non-null  object 
 5   Designation           22750 non-null  float64
 6   Resource Allocation   21369 non-null  float64
 7   Mental Fatigue Score  20633 non-null  float64
 8   Burn Rate             21626 non-null  float64
dtypes: float64(4), object(5)
memory usage: 1.6+ MB


In [5]:
# Report, for every column, the percentage of rows that lack a value.
rows = len(data)
missing_counts = data.isna().sum()
for col in data.columns:
    pct_missing = 100 * missing_counts[col] / rows
    print('Column {} with missing value = {}%'.format(col, pct_missing))

Column Employee ID with missing value = 0.0%
Column Date of Joining with missing value = 0.0%
Column Gender with missing value = 0.0%
Column Company Type with missing value = 0.0%
Column WFH Setup Available with missing value = 0.0%
Column Designation with missing value = 0.0%
Column Resource Allocation with missing value = 6.07032967032967%
Column Mental Fatigue Score with missing value = 9.305494505494506%
Column Burn Rate with missing value = 4.940659340659341%


In [6]:
# Rows lacking either of these two measurements are discarded (in place).
required_cols = ['Burn Rate', 'Mental Fatigue Score']
data.dropna(axis=0, subset=required_cols, inplace=True)

In [7]:
# Show the distinct values of each column from position 2 up to (not including)
# the last two columns.
for column in data.columns[2:-2]:
    distinct = data[column].unique()
    print('Unique values in {} = {}'.format(column, distinct))

Unique values in Gender = ['Female' 'Male']
Unique values in Company Type = ['Service' 'Product']
Unique values in WFH Setup Available = ['No' 'Yes']
Unique values in Designation = [2. 1. 3. 0. 4. 5.]
Unique values in Resource Allocation = [ 3.  2. nan  1.  7.  4.  6.  5.  8. 10.  9.]


In [8]:
from sklearn.impute import SimpleImputer

# Feature columns: everything after 'Employee ID' and before the last two columns.
X = data.iloc[:, 1:-2]
# The last two columns ('Mental Fatigue Score', 'Burn Rate') are set aside here.
target = data.iloc[:, -2:]

# Fill the remaining gaps with each column's most frequent value.
imputer = SimpleImputer(strategy='most_frequent')
imputed = imputer.fit_transform(X)

# NOTE(review): wrapping the imputer output in a bare DataFrame discards the
# column names (labels become 0..5) and starts a fresh positional index; the
# numeric columns presumably come back as object dtype as well -- confirm
# before relying on dtypes downstream.
X_new = pd.DataFrame(imputed)
X_new

Unnamed: 0,0,1,2,3,4,5
0,2008-09-30,Female,Service,No,2.0,3.0
1,2008-11-30,Male,Service,Yes,1.0,2.0
2,2008-03-10,Female,Product,Yes,2.0,4.0
3,2008-11-03,Male,Service,Yes,1.0,1.0
4,2008-07-24,Female,Service,No,3.0,7.0
...,...,...,...,...,...,...
19676,2008-12-15,Female,Product,Yes,1.0,3.0
19677,2008-05-27,Male,Product,No,3.0,7.0
19678,2008-01-19,Female,Product,Yes,3.0,6.0
19679,2008-01-10,Female,Service,No,2.0,5.0


In [9]:
# Convert the joining date (column 0) into "days before the most recent
# joining date in the data", stored as float days.
joined_on = pd.to_datetime(X_new[0])

# The reference date is computed once for the whole column instead of being
# re-evaluated for every row.
latest_join = joined_on.max()

# Dividing a timedelta column by a one-day Timedelta yields float days directly,
# so no cast through integer nanoseconds (whose width can depend on the
# platform) and no hard-coded 86400000000000 divisor is needed.
X_new[0] = (latest_join - joined_on) / pd.Timedelta(days=1)

In [10]:
# One-hot encode the categorical columns of X_new to obtain the model's feature matrix.
# NOTE(review): get_dummies encodes every object/string-typed column by default;
# if columns 4 and 5 are object dtype after imputation they will be expanded
# into indicator columns too -- confirm this is intended.
preprocessed_data = pd.get_dummies(X_new)

In [11]:
from lightgbm import LGBMRegressor

# Base estimator with library defaults; the grid below overrides selected settings.
regressor = LGBMRegressor()

# Candidate hyper-parameter values explored by the grid search
# (2 options for seven settings, 1 for the other two -> 128 combinations).
param_grid = dict(
    n_estimators=[30, 128],
    colsample_bytree=[0.3, 0.7],
    max_depth=[15, 25],
    num_leaves=[50, 100],
    reg_alpha=[1.1, 1.3],
    reg_lambda=[1.1, 1.3],
    min_split_gain=[0.4],
    subsample=[0.7, 0.9],
    subsample_freq=[20],
)


In [12]:
from sklearn.model_selection import GridSearchCV

# The regression target is the last column of the cleaned frame.
burn_rate = data.iloc[:, -1]

# Exhaustive search over param_grid, scored by (negated) mean absolute error
# with 5-fold cross-validation, using all available cores.
gs = GridSearchCV(
    regressor,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=True,
)
fitted_model = gs.fit(preprocessed_data, burn_rate)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


In [13]:
# Show the winning parameter combination and its cross-validated score.
# The search was scored with 'neg_mean_absolute_error', so the score is the
# negated MAE (closer to zero is better).
fitted_model.best_params_,fitted_model.best_score_

({'colsample_bytree': 0.7,
  'max_depth': 15,
  'min_split_gain': 0.4,
  'n_estimators': 128,
  'num_leaves': 50,
  'reg_alpha': 1.1,
  'reg_lambda': 1.3,
  'subsample': 0.9,
  'subsample_freq': 20},
 -0.08423903143533597)