In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data_mult = pd.read_csv('mult.csv')
data_train = pd.read_csv('train.csv')
data_train_len = data_train.shape[0]
y = data_train.pop('Attrition')

train = data_mult.iloc[:data_train_len, :].to_numpy()
test = data_mult.iloc[data_train_len:, :].to_numpy()
train.shape, test.shape

((1340, 2304), (336, 2304))

In [4]:
def y_enco(y):
    if y == 'No':
        return 0
    elif y == 'Yes':
        return 1
    else:
        pass

y_encoder = y.apply(y_enco)

In [5]:
from imblearn.over_sampling import ADASYN 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
# Data Normalization

# ada = ADASYN(random_state=42)
# X_res, y_res = ada.fit_resample(X_norm, y_encorder)
# X_res.shape

X_train, X_test, y_train, y_test = train_test_split(train, y_encoder, test_size=0.1, random_state=42)


scaler = MinMaxScaler(feature_range = (0,1))
scaler.fit(X_train)
X_train_norm = scaler.transform(X_train)


ada = ADASYN(random_state=42)
X_res, y_res = ada.fit_resample(X_train_norm, y_train)
X_res.shape, y_res.shape

((2115, 2304), (2115,))

In [6]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

select = SelectFromModel(LogisticRegression(C=1, penalty='l1', solver='liblinear'))
select.fit(X_res, y_res)
X_new = select.transform(X_res)
X_new.shape

(2115, 319)

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

# # Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 500, stop = 1000, num = 6)]
# # Number of features to consider at every split
max_features = ['sqrt']
# # Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(4, 20, num = 9)]
# max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2]
# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 4]
# # Method of selecting samples for training each tree
bootstrap = [False]
# # Create the random grid
params = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
# params = {'n_estimators': n_estimators,
#             'max_depth': max_depth,
#             'min_samples_split': min_samples_split,
#             'min_samples_leaf': min_samples_leaf}

grid_search = GridSearchCV(estimator = RandomForestClassifier(),
                           param_grid=params,
                           cv = 10,
                           n_jobs=-1,
                           verbose=0,
                           scoring="f1")

# Training
grid_search.fit(X_new, y_res)

# Best Performing Parameter
print('='*20)
print("best params: " + str(grid_search.best_estimator_))
print("best params: " + str(grid_search.best_params_))
print('best score:', grid_search.best_score_)
print('='*20)

best params: RandomForestClassifier(bootstrap=False, max_depth=16, min_samples_leaf=2,
                       n_estimators=500)
best params: {'bootstrap': False, 'max_depth': 16, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 500}
best score: 0.968474304384603


In [8]:
from sklearn.metrics import f1_score

X_val_norm = scaler.transform(X_test)
X_val_new = select.transform(X_val_norm)

y_pred = grid_search.predict(X_val_new)

f1_score(y_test, y_pred)

0.7826086956521738

In [9]:
X_test_norm = scaler.transform(test)
X_test_new = select.transform(X_test_norm)


y_pred = grid_search.predict(X_test_new)


id = range(0,len(y_pred))
y_test = pd.DataFrame()
y_test['Id'] = id
y_test['Predicted'] = y_pred
y_test.to_csv('pred_rf_ad_lasso.csv', index = 0)