# I) Imports and data

## Imports

In [21]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

# Import statements required for Plotly 
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls

## Data processing

In [2]:
df = pd.read_csv('data/data.csv')
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [3]:
# Dropping columns
df = df.drop(["EmployeeCount", "EmployeeNumber", "Over18", "RelationshipSatisfaction", "StandardHours"], axis = 1)

# Formatting columns

df['Attrition'] = df['Attrition'].replace(["Yes", "No"], [1, 0])
df['Gender'] = df['Gender'].replace(["Male", "Female"], [0, 1])
df['OverTime'] = df['OverTime'].replace(["Yes", "No"], [1, 0])
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,1,...,11,3,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,0,...,23,4,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,4,0,...,15,3,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,1,...,11,3,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,0,...,12,3,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,0,Travel_Frequently,884,Research & Development,23,2,Medical,3,0,...,17,3,1,17,3,3,5,2,0,3
1466,39,0,Travel_Rarely,613,Research & Development,6,1,Medical,4,0,...,15,3,1,9,5,3,7,7,1,7
1467,27,0,Travel_Rarely,155,Research & Development,4,3,Life Sciences,2,0,...,20,4,1,6,0,3,6,2,0,3
1468,49,0,Travel_Frequently,1023,Sales,2,3,Medical,4,0,...,14,3,0,17,3,2,9,6,0,8


#### Distribution of Attrition values :

In [None]:
# creating a list of only numerical values
numerical = [u'Attrition', u'Age', u'DailyRate', u'DistanceFromHome', 
             u'Education', u'EnvironmentSatisfaction',
             u'HourlyRate', u'JobInvolvement', u'JobLevel', u'JobSatisfaction',
             u'MonthlyIncome', u'MonthlyRate', u'NumCompaniesWorked',
             u'PercentSalaryHike', u'PerformanceRating',
             u'StockOptionLevel', u'TotalWorkingYears',
             u'TrainingTimesLastYear', u'WorkLifeBalance', u'YearsAtCompany',
             u'YearsInCurrentRole', u'YearsSinceLastPromotion',u'YearsWithCurrManager']
heatmap = [
    go.Heatmap(
        z= df[numerical].astype(float).corr().values, # Generating the Pearson correlation
        x=df[numerical].columns.values,
        y=df[numerical].columns.values,
        colorscale='Viridis',
        reversescale = False,
#         text = True ,
        opacity = 1.0
        
    )
]


layout = go.Layout(
    title='Pearson Correlation of numerical features',
    xaxis = dict(ticks='', nticks=36),
    yaxis = dict(ticks='' ),
    width = 900, height = 700,
    
)


fig = go.Figure(data=heatmap, layout=layout)
py.iplot(fig, filename='labelled-heatmap')

In [4]:


# 1-Hot Encoding

size = len(df)

print(size)
print(df)

travel_frequently = []
travel_rarely = []
non_travel = []

research = []
sales = []
human = []

life_sciences = []
medical = []
marketing = []
technical = []
other = []

sales_executive = []
research_scientist = []
laboratory_technician = []
manufacturing_director = []
healthcare_representative = []

married = []
single = []
divorced = []

for index in range(size):
    travel_status = df["BusinessTravel"][index]
    department_status = df["Department"][index]
    education_status = df["EducationField"][index]
    jobRole_status = df["JobRole"][index]
    marital_status = df["MaritalStatus"][index]
    
    if travel_status == "Travel_Frequently":
        travel_frequently.append(1)
        travel_rarely.append(0)
        non_travel.append(0)
    elif travel_status == "Travel_Rarely":
        travel_frequently.append(0)
        travel_rarely.append(1)
        non_travel.append(0)
    else:
        travel_frequently.append(0)
        travel_rarely.append(0)
        non_travel.append(1)
        
    if department_status == "Research & Development":
        research.append(1)
        sales.append(0)
        human.append(0)
    elif department_status == "Sales":
        research.append(0)
        sales.append(1)
        human.append(0)
    else:
        research.append(0)
        sales.append(0)
        human.append(1)
        
    if education_status == "Life Sciences":
        life_sciences.append(1)
        medical.append(0)
        marketing.append(0)
        technical.append(0)
        other.append(0)
    elif education_status == "Medical":
        life_sciences.append(0)
        medical.append(1)
        marketing.append(0)
        technical.append(0)
        other.append(0)
    elif education_status == "Marketing":
        life_sciences.append(0)
        medical.append(0)
        marketing.append(1)
        technical.append(0)
        other.append(0)
    elif education_status == "Technical Degree":
        life_sciences.append(0)
        medical.append(0)
        marketing.append(0)
        technical.append(1)
        other.append(0)
    else:
        life_sciences.append(0)
        medical.append(0)
        marketing.append(0)
        technical.append(0)
        other.append(1)
    
    if jobRole_status == "Sales Executive":
        sales_executive.append(1)
        research_scientist.append(0)
        laboratory_technician.append(0)
        manufacturing_director.append(0)
        healthcare_representative.append(0)
    elif jobRole_status == "Research Scientist":
        sales_executive.append(0)
        research_scientist.append(1)
        laboratory_technician.append(0)
        manufacturing_director.append(0)
        healthcare_representative.append(0)
    elif jobRole_status == "Laboratory Technician":
        sales_executive.append(0)
        research_scientist.append(0)
        laboratory_technician.append(1)
        manufacturing_director.append(0)
        healthcare_representative.append(0)
    elif jobRole_status == "Manufacturing Director":
        sales_executive.append(0)
        research_scientist.append(0)
        laboratory_technician.append(0)
        manufacturing_director.append(1)
        healthcare_representative.append(0)
    else:
        sales_executive.append(0)
        research_scientist.append(0)
        laboratory_technician.append(0)
        manufacturing_director.append(0)
        healthcare_representative.append(1)
        
    if marital_status == "Married":
        married.append(1)
        single.append(0)
        divorced.append(0)
    elif marital_status == "Single":
        married.append(0)
        single.append(1)
        divorced.append(0)
    else:
        married.append(0)
        single.append(0)
        divorced.append(1)
        
        

business_index = df.columns.get_loc("BusinessTravel")
df = df.drop("BusinessTravel", axis = 1)
df.insert(business_index, "Non_Travel", non_travel)
df.insert(business_index, "Travel_Rarely", travel_rarely)
df.insert(business_index, "Travel_Frequently", travel_frequently)

department_index = df.columns.get_loc("Department")
df = df.drop("Department", axis = 1)
df.insert(department_index, "Human Resources", human)
df.insert(department_index, "Sales", sales)
df.insert(department_index, "Research & Development", research)

education_index = df.columns.get_loc("EducationField")
df = df.drop("EducationField", axis = 1)
df.insert(education_index, "EducationOther", other)
df.insert(education_index, "Technical Degree", technical)
df.insert(education_index, "Marketing", marketing)
df.insert(education_index, "Medical", medical)
df.insert(education_index, "Life Sciences", life_sciences)

jobRole_index = df.columns.get_loc("JobRole")
df = df.drop("JobRole", axis = 1)
df.insert(jobRole_index, "Healthcare Representative", healthcare_representative)
df.insert(jobRole_index, "Manufacturing Director", manufacturing_director)
df.insert(jobRole_index, "Laboratory Technician", laboratory_technician)
df.insert(jobRole_index, "Research Scientist", research_scientist)
df.insert(jobRole_index, "Sales Executive", sales_executive)

marital_index = df.columns.get_loc("MaritalStatus")
df = df.drop("MaritalStatus", axis = 1)
df.insert(marital_index, "Divorced", divorced)
df.insert(marital_index, "Single", single)
df.insert(marital_index, "Married", married)

1470
      Age  Attrition     BusinessTravel  DailyRate              Department  \
0      41          1      Travel_Rarely       1102                   Sales   
1      49          0  Travel_Frequently        279  Research & Development   
2      37          1      Travel_Rarely       1373  Research & Development   
3      33          0  Travel_Frequently       1392  Research & Development   
4      27          0      Travel_Rarely        591  Research & Development   
...   ...        ...                ...        ...                     ...   
1465   36          0  Travel_Frequently        884  Research & Development   
1466   39          0      Travel_Rarely        613  Research & Development   
1467   27          0      Travel_Rarely        155  Research & Development   
1468   49          0  Travel_Frequently       1023                   Sales   
1469   34          0      Travel_Rarely        628  Research & Development   

      DistanceFromHome  Education EducationField  Environm

In [5]:
train, test = train_test_split(df, test_size=0.2)

In [6]:
train

Unnamed: 0,Age,Attrition,Travel_Frequently,Travel_Rarely,Non_Travel,DailyRate,Research & Development,Sales,Human Resources,DistanceFromHome,...,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
172,36,0,1,0,0,1480,1,0,0,3,...,12,3,0,13,3,2,8,7,7,2
1179,34,0,0,1,0,1130,1,0,0,3,...,12,3,1,11,2,3,11,8,7,9
1395,31,1,1,0,0,754,0,1,0,26,...,11,3,0,10,4,3,10,7,0,8
1360,31,0,0,1,0,471,1,0,0,4,...,12,3,1,4,0,2,2,2,2,2
1409,40,0,1,0,0,692,1,0,0,11,...,11,3,1,10,2,4,10,9,9,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1069,28,0,0,1,0,1423,1,0,0,1,...,14,3,1,1,2,1,1,0,0,0
1353,34,1,0,0,1,967,1,0,0,16,...,23,4,1,5,2,3,5,2,3,0
1178,20,0,0,1,0,1141,0,1,0,2,...,19,3,0,2,3,3,2,2,2,2
615,27,0,0,0,1,443,1,0,0,3,...,11,3,3,0,6,2,0,0,0,0


In [7]:
train.to_csv("data/train.csv", index=False)
test.to_csv("data/test.csv", index=False)

#### Checking for null values

In [8]:
display(df.isnull().any())

Age                          False
Attrition                    False
Travel_Frequently            False
Travel_Rarely                False
Non_Travel                   False
DailyRate                    False
Research & Development       False
Sales                        False
Human Resources              False
DistanceFromHome             False
Education                    False
Life Sciences                False
Medical                      False
Marketing                    False
Technical Degree             False
EducationOther               False
EnvironmentSatisfaction      False
Gender                       False
HourlyRate                   False
JobInvolvement               False
JobLevel                     False
Sales Executive              False
Research Scientist           False
Laboratory Technician        False
Manufacturing Director       False
Healthcare Representative    False
JobSatisfaction              False
Married                      False
Single              

In [9]:
for col in test.columns:
    print(f"{col} : {test[col].unique()}")

Age : [23 37 43 24 58 42 46 55 28 35 29 30 51 38 53 26 36 32 33 52 54 40 44 50
 34 27 31 49 39 25 47 19 41 21 22 45 56 48 60 20 57]
Attrition : [0 1]
Travel_Frequently : [0 1]
Travel_Rarely : [1 0]
Non_Travel : [0 1]
DailyRate : [ 310 1063 1082  673  559  848  932  150  267 1172 1422  783 1450 1315
  224 1358 1178 1017  300  933 1327  447 1096  928  374  335  665 1076
 1353  535  528 1053  155 1275  367  555 1193  383  562 1442  697  954
  119 1003 1475 1401 1125 1176  594  154  790  448 1283  505  439  585
  625  853 1462  702 1333 1245  644  525  990  524  810  682  678 1474
  704  981  200 1186  655 1138  146  240 1075  792  601  604 1476  138
  346  738  868  922  201  829  111  268 1055 1278 1389  261  798  534
 1332 1421  302  719  945 1356  419  495  883 1425 1365  103  750  241
  703  570  552 1146  337 1448 1329 1167  264  640  592  773  807  688
  607  770  143  481  599 1395  986  217 1261  285  408  852 1427  946
 1360  176  507 1398  350 1291  511  916  157 1467  996  840 

In [None]:
f, axes = plt.subplots(3, 2, figsize=(10, 8))
plt.tight_layout()
sns.kdeplot(test["Age"], ax=axes[0,0])
sns.kdeplot(test["DistanceFromHome"], ax=axes[0,1])
sns.kdeplot(test["HourlyRate"], ax=axes[1,0])
sns.kdeplot(test["DailyRate"], ax=axes[1,1])
sns.kdeplot(test["MonthlyRate"], ax=axes[2,0])
sns.kdeplot(test["MonthlyIncome"], ax=axes[2,1])
plt.show()

#### Régression Logistique

In [51]:
model = LogisticRegression()
oversampler = SMOTE(random_state = 0)

train_y = train["Attrition"]
train_x = train.drop("Attrition", axis = 1)

train_x, train_y = oversampler.fit_resample(train_x, train_y)
skf = StratifiedKFold(n_splits = 5)
index = 1

for train_sample, validation_sample in skf.split(train_x, train_y):
    X_train, X_validation = [train_x.iloc[[i]] for i in train_sample], [train_x.iloc[[i]] for i in validation_sample]
    Y_train, Y_validation = [train_y[i] for i in train_sample], [train_y[i] for i in validation_sample]

    n_samples, n_x, n_y = np.shape(X_train)
    X_train = np.reshape(X_train, (n_samples, n_x * n_y))

    n_samples, n_x, n_y = np.shape(X_validation)
    X_validation = np.reshape(X_validation, (n_samples, n_x * n_y))

    # print(np.shape(X_train))

    model.fit(X_train, Y_train)
    predictions = model.predict(X_validation)

    print("Index: ", index)
    print("Predictions: ", predictions)
    print("Validation: ", Y_validation)

    index += 1

# train, validation = train_test_split(train, test_size=0.2)

# Y_validation = validation["Attrition"]
# X_validation = validation.drop("Attrition", axis = 1)

# model.fit(X_train, Y_train)
# predictions = model.predict(X_validation)

# print(predictions)
# print(Y_validation)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



Index:  1
Predictions:  [0 0 0 1 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 1 0 1 1 1 1 0 1 0 0 0 1 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1
 0 1 1 0 1 0 0 1 0 0 0 1 0 1 1 1 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0
 1 0 0 0 1 1 0 0 1 0 1 1 1 0 0 0 0 1 0 0 0 1 0 1 1 1 0 1 0 0 0 0 1 1 1 1 1
 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 1 0 1 1 1 1 1 0 1 0 0 0 1 1 1
 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 0 0 0 0 1 0 1 1
 0 0 1 1 0 1 1 1 1 0 1 1 1 0 1 0 0 0 1 0 1 0 1 1 0 1 1 1 1 1 0]
Validation:  [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



Index:  3
Predictions:  [0 0 0 0 0 1 1 1 1 0 0 1 1 0 0 0 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0
 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1
 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1
 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 1 0 0 1 1 1 0 1 1 1 0 0 1 0 0 1 0 1 0 1
 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 0
 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0
 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1]
Validation:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

