# I) Imports and data

## Imports

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Import statements required for Plotly 
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls

## Data processing

In [2]:
data = pd.read_csv('data/data.csv')
data

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [6]:
# Dropping columns
data = data.drop(["EmployeeCount", "EmployeeNumber", "Over18", "RelationshipSatisfaction", "StandardHours"], axis = 1)

# Formatting columns

data['Attrition'] = data['Attrition'].replace(["Yes", "No"], [1, 0])
data['Gender'] = data['Gender'].replace(["Male", "Female"], [0, 1])
data['OverTime'] = data['OverTime'].replace(["Yes", "No"], [1, 0])
data

# 1-Hot Encoding

size = len(data)

travel_frequently = []
travel_rarely = []
non_travel = []

research = []
sales = []
human = []

life_sciences = []
medical = []
marketing = []
technical = []
other = []

sales_executive = []
research_scientist = []
laboratory_technician = []
manufacturing_director = []
healthcare_representative = []

married = []
single = []
divorced = []

for index in range(size):
    travel_status = data["BusinessTravel"][index]
    department_status = data["Department"][index]
    education_status = data["EducationField"][index]
    jobRole_status = data["JobRole"][index]
    marital_status = data["MaritalStatus"][index]
    
    if travel_status == "Travel_Frequently":
        travel_frequently.append(1)
        travel_rarely.append(0)
        non_travel.append(0)
    elif travel_status == "Travel_Rarely":
        travel_frequently.append(0)
        travel_rarely.append(1)
        non_travel.append(0)
    else:
        travel_frequently.append(0)
        travel_rarely.append(0)
        non_travel.append(1)
        
    if department_status == "Research & Development":
        research.append(1)
        sales.append(0)
        human.append(0)
    elif department_status == "Sales":
        research.append(0)
        sales.append(1)
        human.append(0)
    else:
        research.append(0)
        sales.append(0)
        human.append(1)
        
    if education_status == "Life Sciences":
        life_sciences.append(1)
        medical.append(0)
        marketing.append(0)
        technical.append(0)
        other.append(0)
    elif education_status == "Medical":
        life_sciences.append(0)
        medical.append(1)
        marketing.append(0)
        technical.append(0)
        other.append(0)
    elif education_status == "Marketing":
        life_sciences.append(0)
        medical.append(0)
        marketing.append(1)
        technical.append(0)
        other.append(0)
    elif education_status == "Technical Degree":
        life_sciences.append(0)
        medical.append(0)
        marketing.append(0)
        technical.append(1)
        other.append(0)
    else:
        life_sciences.append(0)
        medical.append(0)
        marketing.append(0)
        technical.append(0)
        other.append(1)
    
    if jobRole_status == "Sales Executive":
        sales_executive.append(1)
        research_scientist.append(0)
        laboratory_technician.append(0)
        manufacturing_director.append(0)
        healthcare_representative.append(0)
    elif jobRole_status == "Research Scientist":
        sales_executive.append(0)
        research_scientist.append(1)
        laboratory_technician.append(0)
        manufacturing_director.append(0)
        healthcare_representative.append(0)
    elif jobRole_status == "Laboratory Technician":
        sales_executive.append(0)
        research_scientist.append(0)
        laboratory_technician.append(1)
        manufacturing_director.append(0)
        healthcare_representative.append(0)
    elif jobRole_status == "Manufacturing Director":
        sales_executive.append(0)
        research_scientist.append(0)
        laboratory_technician.append(0)
        manufacturing_director.append(1)
        healthcare_representative.append(0)
    else:
        sales_executive.append(0)
        research_scientist.append(0)
        laboratory_technician.append(0)
        manufacturing_director.append(0)
        healthcare_representative.append(1)
        
    if marital_status == "Married":
        married.append(1)
        single.append(0)
        divorced.append(0)
    elif marital_status == "Single":
        married.append(0)
        single.append(1)
        divorced.append(0)
    else:
        married.append(0)
        single.append(0)
        divorced.append(1)
        
        

business_index = data.columns.get_loc("BusinessTravel")
data = data.drop("BusinessTravel", axis = 1)
data.insert(business_index, "Non_Travel", non_travel)
data.insert(business_index, "Travel_Rarely", travel_rarely)
data.insert(business_index, "Travel_Frequently", travel_frequently)

department_index = data.columns.get_loc("Department")
data = data.drop("Department", axis = 1)
data.insert(department_index, "Human Resources", human)
data.insert(department_index, "Sales", sales)
data.insert(department_index, "Research & Development", research)

education_index = data.columns.get_loc("EducationField")
data = data.drop("EducationField", axis = 1)
data.insert(education_index, "EducationOther", other)
data.insert(education_index, "Technical Degree", technical)
data.insert(education_index, "Marketing", marketing)
data.insert(education_index, "Medical", medical)
data.insert(education_index, "Life Sciences", life_sciences)

jobRole_index = data.columns.get_loc("JobRole")
data = data.drop("JobRole", axis = 1)
data.insert(jobRole_index, "Healthcare Representative", healthcare_representative)
data.insert(jobRole_index, "Manufacturing Director", manufacturing_director)
data.insert(jobRole_index, "Laboratory Technician", laboratory_technician)
data.insert(jobRole_index, "Research Scientist", research_scientist)
data.insert(jobRole_index, "Sales Executive", sales_executive)

marital_index = data.columns.get_loc("MaritalStatus")
data = data.drop("MaritalStatus", axis = 1)
data.insert(marital_index, "Divorced", divorced)
data.insert(marital_index, "Single", single)
data.insert(marital_index, "Married", married)

TypeError: list indices must be integers or slices, not str

In [None]:
train, test = train_test_split(data, test_size=0.2)

In [None]:
train

In [None]:
train.to_csv("data/train.csv", index=False)
test.to_csv("data/test.csv", index=False)

#### Checking for null values

In [None]:
display(data.isnull().any())

In [None]:
for col in test.columns:
    print(f"{col} : {test[col].unique()}")

In [None]:
f, axes = plt.subplots(3, 2, figsize=(10, 8))
plt.tight_layout()
sns.kdeplot(test["Age"], ax=axes[0,0])
sns.kdeplot(test["DistanceFromHome"], ax=axes[0,1])
sns.kdeplot(test["HourlyRate"], ax=axes[1,0])
sns.kdeplot(test["DailyRate"], ax=axes[1,1])
sns.kdeplot(test["MonthlyRate"], ax=axes[2,0])
sns.kdeplot(test["MonthlyIncome"], ax=axes[2,1])
plt.show()

#### Distribution of Attrition values :

In [7]:
# creating a list of only numerical values
numerical = [u'Attrition', u'Age', u'DailyRate', u'DistanceFromHome', 
             u'Education', u'EnvironmentSatisfaction',
             u'HourlyRate', u'JobInvolvement', u'JobLevel', u'JobSatisfaction',
             u'MonthlyIncome', u'MonthlyRate', u'NumCompaniesWorked',
             u'PercentSalaryHike', u'PerformanceRating',
             u'StockOptionLevel', u'TotalWorkingYears',
             u'TrainingTimesLastYear', u'WorkLifeBalance', u'YearsAtCompany',
             u'YearsInCurrentRole', u'YearsSinceLastPromotion',u'YearsWithCurrManager']
data = [
    go.Heatmap(
        z= data[numerical].astype(float).corr().values, # Generating the Pearson correlation
        x=data[numerical].columns.values,
        y=data[numerical].columns.values,
        colorscale='Viridis',
        reversescale = False,
#         text = True ,
        opacity = 1.0
        
    )
]


layout = go.Layout(
    title='Pearson Correlation of numerical features',
    xaxis = dict(ticks='', nticks=36),
    yaxis = dict(ticks='' ),
    width = 900, height = 700,
    
)


fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='labelled-heatmap')

TypeError: list indices must be integers or slices, not list