In [133]:
#import the necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Reading the data using pandas

In [134]:
# Load the HR dataset from a CSV file given by a relative path.
csv_path = r'HR-DT-PACKT.csv'
data = pd.read_csv(csv_path)
# Preview the first five rows (five is the default for head()).
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157.0,3,0,1,0.0,sales,low
1,0.8,0.86,5,262.0,6,0,1,0.0,sales,medium
2,0.11,0.88,7,272.0,4,0,1,0.0,sales,medium
3,0.72,0.87,5,223.0,5,0,1,0.0,sales,low
4,0.37,0.52,2,159.0,3,0,1,0.0,sales,low


## Scrubbing the data

#### We will be exploring the data to see whether we have any missing values

In [135]:
# True if at least one cell anywhere in the frame is missing.
missing_mask = data.isnull()
missing_mask.values.any()

True

In [136]:
# The frame contains missing values, so check column by column
# to find out which columns are affected.
columns_with_missing = data.isnull().any()
columns_with_missing

# Per the result below, the missing values are in
# average_montly_hours and promotion_last_5years.

satisfaction_level       False
last_evaluation          False
number_project           False
average_montly_hours      True
time_spend_company       False
Work_accident            False
left                     False
promotion_last_5years     True
sales                    False
salary                   False
dtype: bool

In [137]:
# Summary statistics for the two columns that contain missing values.
# Per the result below, promotion_last_5years has a minimum of 0 and a
# maximum of 1, whereas average_montly_hours lies between 96 and 310.
columns_to_inspect = ["average_montly_hours", "promotion_last_5years"]
data[columns_to_inspect].describe()

Unnamed: 0,average_montly_hours,promotion_last_5years
count,14986.0,14976.0
mean,201.059656,0.021301
std,49.930897,0.14439
min,96.0,0.0
25%,156.0,0.0
50%,200.0,0.0
75%,245.0,0.0
max,310.0,1.0


In [138]:
# Count the missing values in each column.
data.isnull().sum(axis=0)

satisfaction_level        0
last_evaluation           0
number_project            0
average_montly_hours     13
time_spend_company        0
Work_accident             0
left                      0
promotion_last_5years    23
sales                     0
salary                    0
dtype: int64

In [139]:
# Inspect the dtypes of the two columns that contain missing values.
data.dtypes[["average_montly_hours", "promotion_last_5years"]]

average_montly_hours     float64
promotion_last_5years    float64
dtype: object

### There are three ways to handle missing values:
    1. Dropping the rows that contain missing values
    2. Filling the missing values with a summary statistic (e.g. mean, median or mode)
    3. Predicting the missing values using an ML algorithm

In [140]:
# Replace the missing average_montly_hours entries with the column mean
# (Series.mean() skips NaN by default, so only observed values are averaged).
monthly_hours = data['average_montly_hours']
mean_value = monthly_hours.mean()
data['average_montly_hours'] = monthly_hours.fillna(mean_value)

In [141]:
# Frequency of each observed promotion_last_5years value (NaN is excluded).
data['promotion_last_5years'].value_counts(dropna=True)

0.0    14657
1.0      319
Name: promotion_last_5years, dtype: int64

In [142]:
# promotion_last_5years behaves like a categorical 0/1 flag, so its gaps are
# filled with 0, the most frequent value according to the counts above.
promotion_flag = data['promotion_last_5years']
data['promotion_last_5years'] = promotion_flag.fillna(0)

In [128]:
# Verify that no column has missing values left after the fills.
data.isnull().any(axis=0)

satisfaction_level       False
last_evaluation          False
number_project           False
average_montly_hours     False
time_spend_company       False
Work_accident            False
left                     False
promotion_last_5years    False
sales                    False
salary                   False
dtype: bool

### Renaming the columns

In [129]:
# Give two columns shorter, more descriptive names.
column_renames = {
    'promotion_last_5years': 'promotion',
    'left': 'churn',
}
data = data.rename(columns=column_renames)
data.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'churn',
       'promotion', 'sales', 'salary'],
      dtype='object')

### We would also like to move the churn column to the extreme right

In [131]:
# Move 'churn' so that it becomes the last column.
# drop() and assign() each return a new frame, so the existing frame is not
# mutated in place (no inplace=True); assign() appends the re-added column
# at the end. `.values` re-attaches the column positionally, as before.
column_churn = data['churn']
data = data.drop(labels=['churn'], axis=1).assign(churn=column_churn.values)

In [132]:
# Display the column index to confirm the new order ('churn' should be last).
data.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion', 'sales', 'salary', 'churn'],
      dtype='object')