In [6]:
import numpy as np
import pandas as pd
import zipfile
import matplotlib.pyplot as plt
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split
import category_encoders as ce
pd.set_option('display.max_columns', 500)

In [7]:
# Load the raw dataset directly from the zip archive.
# The context managers close both the archive and the member stream once the
# frame has been read, instead of leaving the handles open in the kernel.
with zipfile.ZipFile('../data/raw_data/IBM_employees.zip') as zf:
    # Only CSV members are candidates; other archive entries must not be parsed.
    names = [name for name in zf.namelist() if '.csv' in name]
    if not names:
        raise FileNotFoundError(
            'No .csv member found in ../data/raw_data/IBM_employees.zip'
        )
    # Read the first CSV member (previously the first archive member of any
    # kind was read and the filtered list was ignored).
    with zf.open(names[0]) as csv_file:
        IBM_employees = pd.read_csv(csv_file)

In [8]:
# Columns excluded from scaling and converted to integer category codes
# in the encoding cell below.
target_column_names = [
    'Education',
    'EnvironmentSatisfaction',
    'JobSatisfaction',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'WorkLifeBalance',
]

In [9]:
# Categorical columns that are kept out of the MaxAbs scaling step.
categorical_column_names = [
    'Attrition',
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'Over18',
    'OverTime',
]

In [10]:
# Normalization
# Every column that is not listed as a target or categorical column is
# rescaled in place with MaxAbsScaler.
# NOTE(review): the scaler is fit on the full frame, before the train/test
# split performed further down -- confirm this is intended.
all_categorical_names = target_column_names + categorical_column_names
non_categorical_names = IBM_employees.columns.difference(all_categorical_names)
max_abs_scaler = MaxAbsScaler()
IBM_employees[non_categorical_names] = max_abs_scaler.fit_transform(
    IBM_employees[non_categorical_names]
)

In [11]:
# Encoding
# Replace each target column by the integer codes of its categorical dtype.
for column_name in target_column_names:
    IBM_employees[column_name] = (
        IBM_employees[column_name].astype('category').cat.codes
    )

In [12]:
# Show, per categorical column, how many distinct values it holds and what they are.
for column_name in categorical_column_names:
    unique_values = IBM_employees[column_name].unique()
    print(f'{column_name}: {len(unique_values)} {unique_values}')

Attrition: 2 ['Yes' 'No']
BusinessTravel: 3 ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
Department: 3 ['Sales' 'Research & Development' 'Human Resources']
EducationField: 6 ['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources']
Gender: 2 ['Female' 'Male']
JobRole: 9 ['Sales Executive' 'Research Scientist' 'Laboratory Technician'
 'Manufacturing Director' 'Healthcare Representative' 'Manager'
 'Sales Representative' 'Research Director' 'Human Resources']
MaritalStatus: 3 ['Single' 'Married' 'Divorced']
Over18: 1 ['Y']
OverTime: 2 ['Yes' 'No']


In [13]:
# 'Over18' holds a single value ('Y') for every row, so the column is removed.
IBM_employees = IBM_employees.drop(labels=['Over18'], axis='columns')

In [14]:
# Keep the column list in sync with the frame after 'Over18' was dropped.
# list.remove() mutates in place and returns None, so assigning its result
# would rebind the name to None; build a filtered list instead. This form is
# also safe to re-run, because it does not fail when 'Over18' is already gone.
categorical_column_names = [
    name for name in categorical_column_names if name != 'Over18'
]

In [15]:
# Binary encoding of the categorical variables.
# NOTE(review): per the category_encoders documentation, `cols=None` makes the
# encoder pick every string column itself -- verify that
# `categorical_column_names` is an actual list at this point.
encoder = ce.BinaryEncoder(
    return_df=True,
    cols=categorical_column_names,
)

IBM_employees_encoded = encoder.fit_transform(IBM_employees)

In [16]:
# Preview the first five rows of the encoded frame.
IBM_employees_encoded.head(n=5)

Unnamed: 0,Age,Attrition_0,Attrition_1,BusinessTravel_0,BusinessTravel_1,BusinessTravel_2,DailyRate,Department_0,Department_1,Department_2,DistanceFromHome,Education,EducationField_0,EducationField_1,EducationField_2,EducationField_3,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender_0,Gender_1,HourlyRate,JobInvolvement,JobLevel,JobRole_0,JobRole_1,JobRole_2,JobRole_3,JobRole_4,JobSatisfaction,MaritalStatus_0,MaritalStatus_1,MaritalStatus_2,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime_0,OverTime_1,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,0.683333,0,1,0,0,1,0.735157,0,0,1,0.034483,1,0,0,0,1,1.0,0.000484,1,0,1,0.94,0.75,0.4,0,0,0,0,1,3,0,0,1,0.299665,0.721471,0.888889,0,1,0.44,0,0,1.0,0.0,0.2,0.0,0,0.15,0.222222,0.0,0.294118
1,0.816667,1,0,0,1,0,0.186124,0,1,0,0.275862,0,0,0,0,1,1.0,0.000967,2,1,0,0.61,0.5,0.4,0,0,0,1,0,1,0,1,0,0.256513,0.922516,0.111111,1,0,0.92,1,3,1.0,0.333333,0.25,0.5,2,0.25,0.388889,0.066667,0.411765
2,0.616667,0,1,0,0,1,0.915944,0,1,0,0.068966,1,0,0,1,0,1.0,0.001934,3,1,0,0.92,0.5,0.2,0,0,0,1,1,2,0,0,1,0.104505,0.088744,0.666667,0,1,0.6,0,1,1.0,0.0,0.175,0.5,2,0.0,0.0,0.0,0.0
3,0.55,1,0,0,1,0,0.928619,0,1,0,0.103448,3,0,0,0,1,1.0,0.002418,3,0,1,0.56,0.75,0.2,0,0,0,1,0,2,0,1,0,0.145457,0.857773,0.111111,0,1,0.44,0,2,1.0,0.0,0.2,0.5,2,0.2,0.388889,0.2,0.0
4,0.45,1,0,0,0,1,0.394263,0,1,0,0.068966,0,0,0,1,1,1.0,0.003385,0,1,0,0.4,0.75,0.2,0,0,0,1,1,1,0,1,0,0.173409,0.616023,1.0,1,0,0.48,0,3,1.0,0.333333,0.15,0.5,2,0.05,0.111111,0.133333,0.117647


In [17]:
# Hold out 20% of the encoded rows for testing; the fixed random_state makes
# the split repeatable across runs.
IBM_train, IBM_test = train_test_split(
    IBM_employees_encoded,
    random_state=228,
    test_size=0.2,
)

In [18]:
# Write each split as a zip archive containing one CSV member; the DataFrame
# index is not written.
compression_opts_train = {'method': 'zip', 'archive_name': 'IBM_train.csv'}
IBM_train.to_csv(
    '../data/preprocessed_data/IBM_train.zip',
    index=False,
    compression=compression_opts_train,
)

compression_opts_test = {'method': 'zip', 'archive_name': 'IBM_test.csv'}
IBM_test.to_csv(
    '../data/preprocessed_data/IBM_test.zip',
    index=False,
    compression=compression_opts_test,
)
