In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv('./data/clean_one_hot_data.csv')

In [4]:
df.drop(df.columns[0], axis=1,inplace=True)
df.head()

Unnamed: 0,age,attrition,business_travel,daily_rate,distance_from_home,education,environment_satisfaction,gender,hourly_rate,job_involvement,...,job_role_sales representative,department_research & development,department_sales,education_field_life sciences,education_field_marketing,education_field_medical,education_field_other,education_field_technical degree,marital_status_married,marital_status_single
0,41,1,1,1102,1,2,2,0,94,3,...,0,0,1,1,0,0,0,0,0,1
1,49,0,2,279,8,1,3,1,61,2,...,0,1,0,1,0,0,0,0,1,0
2,37,1,1,1373,2,2,4,1,92,2,...,0,1,0,0,0,0,1,0,0,1
3,33,0,2,1392,3,4,4,0,56,3,...,0,1,0,1,0,0,0,0,1,0
4,27,0,1,591,2,1,1,1,40,3,...,0,1,0,0,0,1,0,0,1,0


In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [12]:
scaler = StandardScaler()

In [11]:
features = [['age', 'daily_rate', 'distance_from_home', 'hourly_rate', 'monthly_income', 'monthly_rate', 'percent_salary_hike', 
             'total_working_years', 'training_times_last_year', 'years_at_company', 'years_in_current_role', 
             'years_since_last_promotion', 'years_with_curr_manager']]

In [13]:
for feature in features:
    df[feature] = scaler.fit_transform(df[feature])

In [16]:
# Rule of Thumb: k = square root of df rows of 1470 which equals ~38
knn = KNeighborsClassifier(n_neighbors=38)

In [15]:
y = df['attrition']
X = df.drop(columns=['attrition'])

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [18]:
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [19]:
print(accuracy_score(y_test, y_pred))

0.8695652173913043


In [20]:
roc_auc_score(y_test, y_pred)

0.5

In [21]:
# Tuning Hyperparameters

leaf_size = list(range(1,50))
n_neighbors = list(range(1,50))
p=[1,2]

# convert to dictionary
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)

# Making model
clf = GridSearchCV(knn, hyperparameters, cv=10)
best_model = clf.fit(X_train,y_train)

#Best Hyperparameters Value
print('Best leaf_size:', best_model.best_estimator_.get_params()['leaf_size'])
print('Best p:', best_model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])

#Predict testing set
y_pred = best_model.predict(X_test)

#Check performance using accuracy
print(accuracy_score(y_test, y_pred))

#Check performance using ROC
roc_auc_score(y_test, y_pred)

Best leaf_size: 1
Best p: 1
Best n_neighbors: 8


NameError: name 'x_test' is not defined

In [25]:
# Trying the best parameters shown above to see how it performs

def knn_model_std_scaler(df):
    scaler = StandardScaler()
    
    # Compiling all features to have standard scale
    
    features = [['age', 'daily_rate', 'distance_from_home', 'hourly_rate', 'monthly_income', 'monthly_rate', 'percent_salary_hike', 
             'total_working_years', 'training_times_last_year', 'years_at_company', 'years_in_current_role', 
             'years_since_last_promotion', 'years_with_curr_manager']]
    
    for feature in features:
        df[feature] = scaler.fit_transform(df[feature])
    
    # Rule of Thumb: k = square root of df rows of 1470 which equals ~38
    knn = KNeighborsClassifier(n_neighbors=8, p=1, leaf_size=1)
    
    y = df['attrition']
    X = df.drop(columns=['attrition'])
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    
    accuracy_score_ = accuracy_score(y_test, y_pred)
    roc_auc_score_ = roc_auc_score(y_test, y_pred)
    
    return f'Accuracy Score: {accuracy_score_}', f'ROC_AUC_Score: {roc_auc_score_}'

In [26]:
df = pd.read_csv('./data/clean_one_hot_data.csv')
df.drop(df.columns[0], axis=1,inplace=True)
df.head()

Unnamed: 0,age,attrition,business_travel,daily_rate,distance_from_home,education,environment_satisfaction,gender,hourly_rate,job_involvement,...,job_role_sales representative,department_research & development,department_sales,education_field_life sciences,education_field_marketing,education_field_medical,education_field_other,education_field_technical degree,marital_status_married,marital_status_single
0,41,1,1,1102,1,2,2,0,94,3,...,0,0,1,1,0,0,0,0,0,1
1,49,0,2,279,8,1,3,1,61,2,...,0,1,0,1,0,0,0,0,1,0
2,37,1,1,1373,2,2,4,1,92,2,...,0,1,0,0,0,0,1,0,0,1
3,33,0,2,1392,3,4,4,0,56,3,...,0,1,0,1,0,0,0,0,1,0
4,27,0,1,591,2,1,1,1,40,3,...,0,1,0,0,0,1,0,0,1,0


In [28]:
knn_model_std_scaler(df)

('Accuracy Score: 0.8777173913043478', 'ROC_AUC_Score: 0.5401041666666666')