In [1]:
! pip install -q pandas numpy scikit-learn matplotlib seaborn


[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# IMPORT

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
# import seaborn as sns
import matplotlib.pyplot as plt

# LOAD DATA

In [3]:
df = pd.read_csv('train.csv')
df

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,0,2008/9/30,Female,Service,No,2,3.0,3.8,0.16
1,1,2008/11/30,Male,Service,Yes,1,2.0,5.0,0.36
2,2,2008/3/10,Female,Product,Yes,2,,5.8,0.49
3,3,2008/11/3,Male,Service,Yes,1,1.0,2.6,0.20
4,4,2008/7/24,Female,Service,No,3,7.0,6.9,0.52
...,...,...,...,...,...,...,...,...,...
10103,10103,2008/7/4,Female,Product,No,3,6.0,5.9,0.51
10104,10104,2008/2/29,Male,Service,No,3,6.0,6.3,0.62
10105,10105,2008/1/25,Male,Service,No,3,5.0,5.7,0.47
10106,10106,2008/12/26,Male,Service,Yes,2,3.0,5.5,0.40


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10108 entries, 0 to 10107
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           10108 non-null  int64  
 1   Date of Joining       10108 non-null  object 
 2   Gender                10108 non-null  object 
 3   Company Type          10108 non-null  object 
 4   WFH Setup Available   10108 non-null  object 
 5   Designation           10108 non-null  int64  
 6   Resource Allocation   9478 non-null   float64
 7   Mental Fatigue Score  9142 non-null   float64
 8   Burn Rate             9624 non-null   float64
dtypes: float64(3), int64(2), object(4)
memory usage: 710.8+ KB


# EDA & Preprocessing

In [5]:
df.isnull().sum()

Employee ID               0
Date of Joining           0
Gender                    0
Company Type              0
WFH Setup Available       0
Designation               0
Resource Allocation     630
Mental Fatigue Score    966
Burn Rate               484
dtype: int64

In [6]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8233 entries, 0 to 10106
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           8233 non-null   int64  
 1   Date of Joining       8233 non-null   object 
 2   Gender                8233 non-null   object 
 3   Company Type          8233 non-null   object 
 4   WFH Setup Available   8233 non-null   object 
 5   Designation           8233 non-null   int64  
 6   Resource Allocation   8233 non-null   float64
 7   Mental Fatigue Score  8233 non-null   float64
 8   Burn Rate             8233 non-null   float64
dtypes: float64(3), int64(2), object(4)
memory usage: 643.2+ KB


In [7]:
# datetime
df['Date of Joining'] = pd.to_datetime(df['Date of Joining'])

reference_date = df['Date of Joining'].max()
df['Days_with_company'] = (reference_date - df['Date of Joining']).dt.days

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date of Joining'] = pd.to_datetime(df['Date of Joining'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Days_with_company'] = (reference_date - df['Date of Joining']).dt.days


In [8]:
df_encoded = pd.get_dummies(df, columns=['Gender', 'Company Type', 'WFH Setup Available'], drop_first=True)

df_X = df_encoded[[
        'Designation', 'Resource Allocation',
        'Mental Fatigue Score', 'Days_with_company', 'Gender_Male',
        'Company Type_Service', 'WFH Setup Available_Yes'
    ]]
df_y = df_encoded['Burn Rate']

In [9]:
df_X

Unnamed: 0,Designation,Resource Allocation,Mental Fatigue Score,Days_with_company,Gender_Male,Company Type_Service,WFH Setup Available_Yes
0,2,3.0,3.8,92,False,True,False
1,1,2.0,5.0,31,True,True,True
3,1,1.0,2.6,58,True,True,True
4,3,7.0,6.9,160,False,True,False
5,2,4.0,3.6,35,True,False,True
...,...,...,...,...,...,...,...
10102,4,8.0,6.2,105,True,True,False
10103,3,6.0,5.9,180,False,False,False
10104,3,6.0,6.3,306,True,True,False
10105,3,5.0,5.7,341,True,True,False


In [10]:
df_y

0        0.16
1        0.36
3        0.20
4        0.52
5        0.29
         ... 
10102    0.54
10103    0.51
10104    0.62
10105    0.47
10106    0.40
Name: Burn Rate, Length: 8233, dtype: float64

# Split train test

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)