In [1]:
import numpy as np 
import pandas as pd 

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
len(train), len(test)

(1176, 294)

In [4]:
train.columns

Index(['user_id', 'Age', 'Attrition', 'BusinessTravel', 'DailyRate',
       'Department', 'DistanceFromHome', 'Education', 'EducationField',
       'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender',
       'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole',
       'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate',
       'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

### Data Processing

In [5]:
id_col = 'user_id'
target_col = 'Attrition'

drop_cols = ['EmployeeCount', 'EmployeeNumber','StandardHours', 'Over18', 'MonthlyRate']

digital_cols = ['MonthlyIncome', 'HourlyRate', 'MonthlyRate', 'DailyRate', 'PercentSalaryHike', 'YearsAtCompany',
                'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

category_cols = ['BusinessTravel', 'Department', 'DistanceFromHome', 'Education', 'EducationField',
                 'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobLevel',
                'JobRole', 'JobSatisfaction', 'MaritalStatus', 'OverTime',
                'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TrainingTimesLastYear',
                'WorkLifeBalance', 'Age']

In [6]:
for col in category_cols:
    nunique_tr = train[col].nunique()
    nunique_te = test[col].nunique()
    na_tr = len(train.loc[train[col].isna()]) / len(train)
    na_te = len(test.loc[test[col].isna()]) / len(test)
    print(f'Col name:{col:30}\tunique cate num in train:{nunique_tr:5}\tunique cate num in train:{nunique_te:5}\tnull sample in train:{na_tr:.2f}\tnull sample in test:{na_te:.2f}')

Col name:BusinessTravel                	unique cate num in train:    3	unique cate num in train:    3	null sample in train:0.00	null sample in test:0.00
Col name:Department                    	unique cate num in train:    3	unique cate num in train:    3	null sample in train:0.00	null sample in test:0.00
Col name:DistanceFromHome              	unique cate num in train:   29	unique cate num in train:   29	null sample in train:0.00	null sample in test:0.00
Col name:Education                     	unique cate num in train:    5	unique cate num in train:    5	null sample in train:0.00	null sample in test:0.00
Col name:EducationField                	unique cate num in train:    6	unique cate num in train:    6	null sample in train:0.00	null sample in test:0.00
Col name:EnvironmentSatisfaction       	unique cate num in train:    4	unique cate num in train:    4	null sample in train:0.00	null sample in test:0.00
Col name:Gender                        	unique cate num in train:    2	unique cate

In [7]:
for col in digital_cols:
    min_tr = train[col].min()
    max_tr = train[col].max()
    mean_tr = train[col].mean()
    median_tr = train[col].median()
    std_tr = train[col].std()
    
    min_te = test[col].min()
    max_te = test[col].max()
    mean_te = test[col].mean()
    median_te = test[col].median()
    std_te = test[col].std()
    
    na_tr = len(train.loc[train[col].isna()]) / len(train)
    na_te = len(test.loc[test[col].isna()]) / len(test)
    print(f'Col name:{col:30}')
    print(f'\tIn train data: min value:{min_tr:.2f}\tmax value:{max_tr:.2f}\tmean value:{mean_tr:.2f}\tmedian value:{median_tr:.2f}\tstd value:{std_tr:.2f}\tnan sample rate:{na_tr:.2f}\t')
    print(f'\tIn  test data: min value:{min_te:.2f}\tmax value:{max_te:.2f}\tmean value:{mean_te:.2f}\tmedian value:{median_te:.2f}\tstd value:{std_te:.2f}\tnan sample rate:{na_te:.2f}\t')

Col name:MonthlyIncome                 
	In train data: min value:1009.00	max value:19999.00	mean value:6458.69	median value:4850.50	std value:4724.85	nan sample rate:0.00	
	In  test data: min value:1514.00	max value:19740.00	mean value:6679.89	median value:5183.00	std value:4643.53	nan sample rate:0.00	
Col name:HourlyRate                    
	In train data: min value:30.00	max value:100.00	mean value:65.13	median value:65.00	std value:20.29	nan sample rate:0.00	
	In  test data: min value:30.00	max value:100.00	mean value:68.94	median value:70.00	std value:20.22	nan sample rate:0.00	
Col name:MonthlyRate                   
	In train data: min value:2094.00	max value:26999.00	mean value:14247.16	median value:14225.50	std value:7133.77	nan sample rate:0.00	
	In  test data: min value:2112.00	max value:26959.00	mean value:14576.88	median value:14309.00	std value:7059.40	nan sample rate:0.00	
Col name:DailyRate                     
	In train data: min value:104.00	max value:1499.00	mean va

In [8]:
train[target_col].unique()

array(['No', 'Yes'], dtype=object)

In [9]:
# bin data
# Age
age_bins = [0, 30, 40, 50, 60]
age_labels = [1, 2, 3, 4]
train['Age'] = pd.cut(train['Age'], age_bins, labels=age_labels).astype(int)
test['Age'] = pd.cut(test['Age'], age_bins, labels=age_labels).astype(int)

In [10]:
from sklearn.preprocessing import MinMaxScaler

sacalar = MinMaxScaler()
train_digital = sacalar.fit_transform(train[digital_cols])
test_digital = sacalar.transform(test[digital_cols])

In [11]:
train.fillna(1, inplace=True)
test.fillna(1, inplace=True)

In [12]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

train_category, test_category = None, None
# drop_cols = ['EmployeeNumber', 'Over18', 'StandardHours']
for col in [var for var in category_cols if var not in drop_cols]:
    lbe, ohe = LabelEncoder(), OneHotEncoder()
    
    lbe.fit(pd.concat([train[col], test[col]]).values.reshape(-1, 1))
    train[col] = lbe.transform(train[col])
    test[col] = lbe.transform(test[col])
    
    ohe.fit(pd.concat([train[col], test[col]]).values.reshape(-1, 1))
    oht_train = ohe.transform(train[col].values.reshape(-1, 1)).todense()
    oht_test = ohe.transform(test[col].values.reshape(-1, 1)).todense()
    
    if train_category is None:
        train_category = oht_train
        test_category = oht_test
    else:
        train_category = np.hstack((train_category, oht_train))
        test_category = np.hstack((test_category, oht_test))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [13]:
train_digital.shape, test_digital.shape, train_category.shape, test_category.shape

((1176, 9), (294, 9), (1176, 104), (294, 104))

In [14]:
train_features = np.hstack((train_digital, train_category))
test_features = np.hstack((test_digital, test_category))
train_features.shape, test_features.shape

((1176, 113), (294, 113))

In [15]:
target_col_dict = {'Yes': 1, 'No': 0}
train_labels = train[target_col].map(target_col_dict).values
train_labels.shape

(1176,)

In [16]:
train_features.shape

(1176, 113)

In [17]:
predictions = []

### Deep Model

In [18]:
import tensorflow as tf
from tensorflow.keras import models, layers, regularizers, callbacks

In [19]:
tf.keras.backend.clear_session()

dl_model = models.Sequential()
dl_model.add(layers.Dense(units = 32, activation='relu', input_shape=(train_features.shape[1], )))
dl_model.add(layers.Dense(units = 10, activation='relu'))
dl_model.add(layers.Dense(1, activation='sigmoid'))

dl_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                3648      
_________________________________________________________________
dense_1 (Dense)              (None, 10)                330       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 3,989
Trainable params: 3,989
Non-trainable params: 0
_________________________________________________________________


In [20]:
dl_model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['AUC'])

early_stop = callbacks.EarlyStopping(monitor='val_AUC', mode = 'max')

dl_model.fit(train_features, train_labels,
         batch_size=64,
         epochs = 30,
         validation_split=0.2,
         callbacks=[early_stop]
         )

Train on 940 samples, validate on 236 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30


<tensorflow.python.keras.callbacks.History at 0x1de57786a48>

In [21]:
predictions.append(dl_model.predict(test_features)[0])

### Linear Model

In [22]:
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression()
lr_model.fit(train_features, train_labels)
predictions.append(lr_model.predict(test_features))

In [23]:
def normal_result(x):
    x = x if x <= 1.0 else 1.0
    x = x if x >=0 else 0.0005
    return x

In [24]:
sub = test[['user_id']].copy()
sub['Attrition'] = np.array(predictions).mean(axis=0)
sub['Attrition'] = sub['Attrition'].apply(normal_result)
sub.to_csv('submission_LR_DL.csv', index=False)