In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.


import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'predicting-employee-attrition:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1736414%2F2838014%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240314%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240314T140249Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D769278942d3efa796e32c647bd5f450b5ac1cafc4aaa2c6f42fa11fdb6bcca0b2460534258aff098ac38b4c3f8b8b7f03a1dd9a93dc6580f8d8b3a02fd67447615f6ca0708d030fea5bcd63f073f0f70bb1d1ed5e702339676178231515598760360c70c5c6cefb96c38b71b7fff49a05f0df0873aefa9afaf9355b3f1d4dcfacf1f94b10dd1f8f267da5ed652c556fb6906132dcb777a5439b1a0eddbf90a694576982d48a67a9260af49d99c7a8154dde996f6f8d1129775739d8e8726bd8524c5d730100162af723aa7513bf3d8a8ccdf5dc61806d4538dfdf16c28a8891c04252c80f93c0c13d6cbda94504be69a0dada9ec1324592b5c73068008e72ab8'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download and unpack every "<directory>:<percent-encoded URL>" entry of DATA_SOURCE_MAPPING
# into KAGGLE_INPUT_PATH/<directory>. Failures are reported and the next entry is tried.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray ':' in the URL part cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be absent; in that case only the byte counter
            # is meaningful and the progress bar stays empty.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name.
            tfile.flush()
            # NOTE(review): archives are extracted without validating member paths;
            # only use this with trusted download sources.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` would replace the
                # module and break the next loop iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


## Problem Statement:
In recent years, attention has increasingly been paid to human resources (HR), since worker quality and skills represent a growth factor and a real competitive advantage for companies. After proving their mettle in sales and marketing, analytics and artificial intelligence are also becoming central to employee-related decisions within HR management. Organizational growth largely depends on staff retention. Losing employees frequently hurts the morale of the organization, and hiring new employees is more expensive than retaining existing ones.

In [None]:
import pandas as pd
# Read train_data.csv of the predicting-employee-attrition dataset via the ../input symlink.
data = pd.read_csv('../input/predicting-employee-attrition/train_data.csv')
# Preview the first rows.
data.head()

### Data Cleaning:
* Remove all duplicate instances of one id
* Create label class
* Create 'TotalWorkingYears' column
* Convert salary to the categories 'Low', 'Medium', 'High'


In [None]:
# Boolean mask that is True only for the last row of each Emp_ID (earlier rows of the same id are masked out).
m1 = ~data.duplicated(['Emp_ID'], keep='last')

In [None]:
# Keep one row per employee (the rows selected by m1). Take an explicit copy so that columns
# assigned to final_d later modify an independent frame rather than a slice of `data`
# (avoids pandas' SettingWithCopyWarning).
final_d=data.loc[m1].copy()

In [None]:
# Spot-check: show the row(s) of final_d for Emp_ID 2.
final_d[final_d['Emp_ID']==2]

In [None]:
import numpy as np
# Attrition label: 1 when LastWorkingDate is missing, 0 when a last working date is recorded.
final_d.loc[:,'working'] = final_d['LastWorkingDate'].isnull().astype(int)

In [None]:
# Preview the first rows of final_d.
final_d.head()

In [None]:
# Inspect the raw 'Dateofjoining' value at index label 2.
# NOTE(review): label-based lookup; assumes row label 2 survived the de-duplication — confirm.
final_d.loc[2,'Dateofjoining']

In [None]:
# Check whether the LastWorkingDate at index label 4 is missing according to pd.isna.
if pd.isna(final_d.loc[4,'LastWorkingDate']):
    print('Yes')

In [None]:
from dateutil.relativedelta import relativedelta
from datetime import datetime
def timesub(x,y,reference_date=None):
    """Return the number of completed years between two dates.

    Parameters
    ----------
    x : str
        Start date formatted as '%Y-%m-%d'.
    y : str or missing
        End date formatted as '%Y-%m-%d'. When missing (NaN/None), the
        reference date is used instead.
    reference_date : str or datetime, optional
        End date substituted for a missing ``y``, either a '%Y-%m-%d' string
        or a datetime. Defaults to today's date (the original behaviour);
        pass a fixed date to make the result independent of the day the
        notebook is run.

    Returns
    -------
    int
        The ``years`` component of ``relativedelta(end, start)``.
    """
    date_format = '%Y-%m-%d'
    if pd.isnull(y):
        if reference_date is None:
            # Midnight of today, same value the strftime/strptime round trip produces.
            y_time = datetime.strptime(datetime.today().strftime(date_format), date_format)
        elif isinstance(reference_date, str):
            y_time = datetime.strptime(reference_date, date_format)
        else:
            y_time = reference_date
    else:
        y_time = datetime.strptime(y, date_format)
    x_time = datetime.strptime(x, date_format)
    td=relativedelta(y_time,x_time)
    return td.years


In [None]:
# Completed years between Dateofjoining and LastWorkingDate for every row, via timesub.
final_d['TotalWorkingYears']=[
    timesub(joined, left)
    for joined, left in zip(final_d['Dateofjoining'], final_d['LastWorkingDate'])
]

In [None]:
# Preview the derived TotalWorkingYears column.
final_d['TotalWorkingYears'].head()

In [None]:
# Summary statistics of the business value and salary columns.
final_d[['Total Business Value','Salary']].describe()

In [None]:
def sal(x, low=40000, high=65000):
    """Bucket a salary into 'Low', 'Medium' or 'High'.

    Parameters
    ----------
    x : number
        Salary value.
    low, high : number, optional
        Bucket boundaries. The defaults are the original thresholds.

    Returns
    -------
    str
        'Low' if ``x < low``, 'Medium' if ``low <= x < high``, 'High' otherwise.
        A missing value (NaN/None) is returned unchanged instead of being
        labelled 'High' by falling through the comparisons.
    """
    if pd.isna(x):
        return x
    if x < low:
        return 'Low'
    if x < high:
        return 'Medium'
    return 'High'

def busvalue(x):
    """Label a business value: 'Loss' when x <= 0, 'Profit' for anything else."""
    return 'Loss' if x<=0 else 'Profit'
# Derive the salary bucket column.
final_d['SalaryCat']=final_d['Salary'].apply(sal)
# Replace the numeric business value by its 'Loss'/'Profit' label. The dtype guard keeps the
# cell re-runnable: once the column holds labels, busvalue would compare a string with 0 and raise.
if pd.api.types.is_numeric_dtype(final_d['Total Business Value']):
    final_d['Total Business Value']=final_d['Total Business Value'].apply(busvalue)
final_d.head()

Get a high level overview of all numerical variables' correlation with each other

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 8), dpi=80)
# Correlate numeric columns only. final_d also holds string columns, and DataFrame.corr()
# raises on those in pandas >= 2.0; selecting the numeric columns works on old and new versions.
corr=final_d.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True,cmap="crest")
plt.show()

Designation should be categorical, but its datatype is integer, which is why it has been plotted in the heatmap.

In [None]:
# Show the dtypes of the two designation columns.
print(*final_d[['Designation', 'Joining Designation']].dtypes)

Let's plot it against categorical variables too

In [None]:
# Store both designation columns as strings so they are handled as categories.
for designation_col in ('Designation', 'Joining Designation'):
    final_d[designation_col]=final_d[designation_col].apply(str)

In [None]:
# Employee counts per Gender, split by the `working` label, annotated with each label's
# share within the category.
plt.figure(figsize=(10,8))
ax=sns.countplot(x="Gender", hue="working", data=final_d,palette='crest')

# Read the bars of each hue level from ax.containers (one bar container per `working` value)
# instead of halving ax.patches: newer seaborn releases may add extra legend-handle patches
# to ax.patches, which would misalign the pairing. Assumes exactly two `working` levels.
left_bars, right_bars = ax.containers[0], ax.containers[-1]

for left, right in zip(left_bars, right_bars):
    height_l = left.get_height()
    height_r = right.get_height()
    total = height_l + height_r
    if not total > 0:
        # Empty or NaN-height pair: no share to report, and no division by zero.
        continue
    ax.text(left.get_x() + left.get_width()/2., height_l + 10, '{0:.0%}'.format(height_l/total), ha="center")
    ax.text(right.get_x() + right.get_width()/2., height_r + 10, '{0:.0%}'.format(height_r/total), ha="center")
plt.show()

The ratios in our data are the same. Gender isn't a strong determinant of an employee being attrited or not.

In [None]:
# Employee counts per Education_Level, split by the `working` label, annotated with each
# label's share within the category.
plt.figure(figsize=(10,8))
ax=sns.countplot(x="Education_Level", hue="working", data=final_d,palette='crest')

# Read the bars of each hue level from ax.containers (one bar container per `working` value)
# instead of halving ax.patches: newer seaborn releases may add extra legend-handle patches
# to ax.patches, which would misalign the pairing. Assumes exactly two `working` levels.
left_bars, right_bars = ax.containers[0], ax.containers[-1]

for left, right in zip(left_bars, right_bars):
    height_l = left.get_height()
    height_r = right.get_height()
    total = height_l + height_r
    if not total > 0:
        # Empty or NaN-height pair: no share to report, and no division by zero.
        continue
    ax.text(left.get_x() + left.get_width()/2., height_l + 10, '{0:.0%}'.format(height_l/total), ha="center")
    ax.text(right.get_x() + right.get_width()/2., height_r + 10, '{0:.0%}'.format(height_r/total), ha="center")
plt.show()

There isn't a lot of difference in the values for attrited and non attrited employees.   
Insight: Education isn't a big factor contributing to higher attrition

In [None]:
# Employee counts per Total Business Value label, split by the `working` label, annotated
# with each label's share within the category.
plt.figure(figsize=(10,8))
ax=sns.countplot(x="Total Business Value", hue="working", data=final_d,palette='crest')

# Read the bars of each hue level from ax.containers (one bar container per `working` value)
# instead of halving ax.patches: newer seaborn releases may add extra legend-handle patches
# to ax.patches, which would misalign the pairing. Assumes exactly two `working` levels.
left_bars, right_bars = ax.containers[0], ax.containers[-1]

for left, right in zip(left_bars, right_bars):
    height_l = left.get_height()
    height_r = right.get_height()
    total = height_l + height_r
    if not total > 0:
        # Empty or NaN-height pair: no share to report, and no division by zero.
        continue
    ax.text(left.get_x() + left.get_width()/2., height_l + 10, '{0:.0%}'.format(height_l/total), ha="center")
    ax.text(right.get_x() + right.get_width()/2., height_r + 10, '{0:.0%}'.format(height_r/total), ha="center")
plt.show()

* Employees who help grow the business/company are more likely to stay. Seeing positive changes because of oneself keeps one motivated to keep doing it
* Employees who've caused the company a loss are far more likely to leave.

In [None]:
# Employee counts per Designation, split by the `working` label, annotated with each
# label's share within the category.
plt.figure(figsize=(10,8))
ax=sns.countplot(x="Designation", hue="working", data=final_d,palette='crest')

# Read the bars of each hue level from ax.containers (one bar container per `working` value)
# instead of halving ax.patches: newer seaborn releases may add extra legend-handle patches
# to ax.patches, which would misalign the pairing. Assumes exactly two `working` levels.
left_bars, right_bars = ax.containers[0], ax.containers[-1]

for left, right in zip(left_bars, right_bars):
    height_l = left.get_height()
    height_r = right.get_height()
    total = height_l + height_r
    if not total > 0:
        # Empty or NaN-height pair: no share to report, and no division by zero.
        continue
    ax.text(left.get_x() + left.get_width()/2., height_l + 10, '{0:.0%}'.format(height_l/total), ha="center")
    ax.text(right.get_x() + right.get_width()/2., height_r + 10, '{0:.0%}'.format(height_r/total), ha="center")
plt.show()

The number of attrited employees is significantly higher for job designations 1 and 2 than for the others.

In [None]:
# Plot Salary against Designation, one group of points per designation level.
sns.catplot(x="Designation", y="Salary",  data=final_d)

It makes sense that '1' and '2' have comparatively lower salaries. A higher salary is usually a motivation to keep working in the same company   
The currency of this dataset hasn't been specified. But if it was, we could dig in a little more to compare these salaries to living cost of a single person and a family.

Working seems to be highly correlated with total working years

In [None]:
plt.figure(figsize=(6,6))
# Distribution of TotalWorkingYears for each value of the `working` label.
y=final_d['TotalWorkingYears']
sns.boxplot(y=y,x=final_d['working'],palette='crest')

* Employees in their early years of career tend to resign a lot more than employees who've worked in the company for >4 years

In [None]:
# Employee counts per `working` label, with one bar per Quarterly Rating value.
x=final_d['working']
y=final_d['Quarterly Rating']
sns.countplot(x=x,hue=y,palette='crest')

Most employees with low quarterly rating have a higher chance of leaving the company

Salary and designation don't seem strongly correlated with attrition. Let's plot them against each other

In [None]:
# wor: rows with working == 1; nowor: rows with working == 0.
wor, nowor = (final_d[final_d['working']==label] for label in (1, 0))
# Overlay both salary densities; the legend entries follow the plotting order.
for salaries in (wor.Salary, nowor.Salary):
    sns.kdeplot(salaries)
plt.legend(('Yes', 'No'))

In [None]:
#a=final_d[final_d['working']==1]['City']
#b=final_d[final_d['working']==0]['City']
# Row counts per (City, working) group, turned into a mapping from the column label
# ('working', 'count') to the sequence of counts in group order.
a=dict(final_d.groupby(['City', 'working']).agg({'working':['count']}).apply(list))

In [None]:
# Per-city share of employees still working: working / (working + not working).
# Build a City x working table of row counts so that a city with no rows for one of the two
# labels gets an explicit 0, instead of shifting an even/odd alternation over a flat list.
city_counts = (
    final_d.groupby(['City', 'working'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=[0, 1], fill_value=0)
)
listnotwork = city_counts[0].tolist()
listwork = city_counts[1].tolist()

# working / total for each city, in sorted city order
listratio=[work/(work+notwork) for work,notwork in zip(listwork,listnotwork)]

In [None]:
# Show the per-city counts: working first, then not working.
print(listwork)
print(listnotwork)

In [None]:
# Plot the per-city working share as a dashed line.
plt.plot(listratio,linestyle='--')
# Fix the y-axis to the full 0..1 range of a ratio.
plt.ylim(0,1)

In [None]:
# Drop the per-city count lists from the namespace.
del listwork, listnotwork
# Mean and standard deviation (numpy default, ddof=0) of the per-city working share.
ratios = np.array(listratio)
mean, stddev = ratios.mean(), ratios.std()
print(mean,stddev)

The standard deviation isn't very large, meaning that whether an employee is attrited doesn't depend much on where they're from

In [None]:
# Preview the first rows of final_d after the transformations above.
final_d.head()