In [None]:
!pip install pycaret

In [75]:
from pycaret.clustering import *
from pycaret.classification import *
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score
from scipy import stats

In [3]:
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')
final_df = pd.read_csv('/content/final_df.csv')

# Unsupervised Algorithm

In [None]:
emp = list(train_df['Emp_ID'].value_counts().keys())

In [None]:
def features_agg(tab,df):
    feat = ['Salary','Total Business Value']
    for col in feat:
        df[f'min_{col}'] = np.min(tab[col])
        df[f'max_{col}'] = np.max(tab[col])
        df[f'sum_{col}'] = np.sum(tab[col])
        df[f'mean_{col}'] = np.mean(tab[col])
        df[f'std_{col}'] = np.std(tab[col])
        df[f'latest_{col}'] = tab[col][len(tab)-1]

    df['change_in_age'] = tab['Age'][len(tab)-1]-tab['Age'][0]      

    return df

In [None]:
def dates_feat(tab,df):
    latest_reporting = pd.to_datetime(tab['MMM-YY'][len(tab)-1])
    joined = pd.to_datetime(tab['Dateofjoining'][0])
    diff = (latest_reporting-joined).days
    if diff > 0 :
      df['days_diff'] = diff
    else :
      df['days_diff'] = 0
    return df

In [None]:
def bow(tab,df):
    for i in range(1,5):
        df[f'qr_{i}']= sum(tab['Quarterly Rating']==i)
    df['latest_qr'] = tab['Quarterly Rating'][len(tab)-1]
    return df

In [None]:
final_df = None
for id in tqdm(emp):
     tab = train_df[train_df['Emp_ID']==id].reset_index(drop=True)
     df = pd.DataFrame({'Emp_ID':[id],'Gender':[tab['Gender'][0]],'City':[tab['City'][0]],'Joining Designation':tab['Joining Designation'][0]})
     df['latest_Education_Level'] = tab['Education_Level'][len(tab)-1]
     df['if_education_changed']= 1 if tab['Education_Level'].nunique()>1 else 0
     df = dates_feat(tab,df)
     df = features_agg(tab,df)
     df['latest_designation'] = tab['Designation'][len(tab)-1]
     df['num_of_designation_held'] = tab['Designation'].nunique()
     df = bow(tab,df)
     df['is_leaving'] = 0 if tab['LastWorkingDate'].isnull().sum() == len(tab) else 1
  
     if final_df is None:
        final_df = df
     else:
        final_df = pd.concat([final_df,df],axis=0) 

100%|██████████| 2381/2381 [00:47<00:00, 50.19it/s]


In [None]:
final_df.to_csv('final_df.csv',index=False)

In [None]:
exp = setup(final_df, ignore_features = ['Emp_ID'],numeric_features=['num_of_designation_held','change_in_age'],session_id = 123)

Unnamed: 0,Description,Value
0,session_id,123
1,Original Data,"(2381, 28)"
2,Missing Values,False
3,Numeric Features,15
4,Categorical Features,12
5,Ordinal Features,False
6,High Cardinality Features,False
7,High Cardinality Method,
8,Transformed Data,"(2381, 143)"
9,CPU Jobs,-1


In [None]:
kmeans = create_model('kmeans',num_clusters=2)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.82,5317.0897,0.478,0,0,0


In [None]:
kmeans_res = assign_model(kmeans)

In [None]:
submission = pred(test_df,kmeans_res)

In [None]:
def pred(test_df,df):
    emp = list(test_df['Emp_ID'])
    prediction = []
    
    for id in emp:
        
        cl = df[df['Emp_ID']==id]['Cluster'].values
        if cl==['Cluster 0']:
           prediction.append(1)
        else :
           prediction.append(0)
        
    return pd.DataFrame({'Emp_ID':test_df['Emp_ID'],'Target':prediction}) 

In [None]:
submission.to_csv('submission_meanshift.csv',index=False)

In [None]:
kmodes = create_model('kmodes',num_clusters=2)
kmodes_res = assign_model(kmodes)
submission = pred(test_df,kmodes_res)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6345,1601.0189,0.892,0,0,0


In [None]:
hclust = create_model('hclust',num_clusters=2)
hclust_res = assign_model(hclust)
submission = pred(test_df,hclust_res)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.8217,4590.8627,0.4357,0,0,0


In [None]:
meanshift	 = create_model('meanshift',num_clusters=2)
meanshift_res = assign_model(meanshift)
submission = pred(test_df,meanshift_res)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6683,2013.5102,0.6573,0,0,0


In [None]:
birch = create_model('birch',num_clusters=2)
birch_res = assign_model(birch)
submission = pred(test_df,birch_res)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.8217,4590.8627,0.4357,0,0,0


In [None]:
def metric_eval(res_df):
    left_emp = list(final_df[final_df['is_leaving']==1]['Emp_ID'])
    left_emp = pd.DataFrame({'Emp_ID':left_emp})
    prediction = pred(left_emp,res_df)
    y_true = np.ones(len(prediction))
    print("Accuracy:",accuracy_score(y_true,prediction.Target))
    print("F1 Score:",f1_score(y_true,prediction.Target,average='macro')) 

  

In [None]:
metric_eval(kmeans_res)

Accuracy: 0.9783415841584159
F1 Score: 0.49452611823584613


In [None]:
metric_eval(kmodes_res)

Accuracy: 0.9591584158415841
F1 Score: 0.4895767530006317


In [None]:
metric_eval(hclust_res)

Accuracy: 0.9882425742574258
F1 Score: 0.49704326174914415


In [None]:
metric_eval(birch_res)

Accuracy: 0.9882425742574258
F1 Score: 0.49704326174914415


In [None]:
metric_eval(meanshift_res)

Accuracy: 0.932549504950495
F1 Score: 0.48254883125200126


In [None]:
submission = pred(test_df,meanshift_res)

# Supervised Algorithm


In [21]:
exp_clf= setup(final_df, target = 'is_leaving',numeric_features=['num_of_designation_held','change_in_age'], session_id=123) 

Unnamed: 0,Description,Value
0,session_id,123
1,Target,is_leaving
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(2381, 28)"
5,Missing Values,False
6,Numeric Features,16
7,Categorical Features,11
8,Ordinal Features,False
9,High Cardinality Features,False


In [57]:
knn = create_model('knn')


Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8563,0.8642,0.9381,0.8618,0.8983,0.6549,0.6618
1,0.7904,0.7796,0.9204,0.8,0.856,0.4782,0.4943
2,0.8683,0.881,0.9823,0.8473,0.9098,0.6703,0.6961
3,0.7784,0.7975,0.885,0.8065,0.8439,0.4653,0.4712
4,0.8204,0.8112,0.9027,0.843,0.8718,0.573,0.5767
5,0.8204,0.8319,0.9027,0.843,0.8718,0.573,0.5767
6,0.8072,0.8201,0.9115,0.824,0.8655,0.5282,0.5366
7,0.8012,0.7738,0.8938,0.8279,0.8596,0.5211,0.5256
8,0.7952,0.8056,0.9115,0.811,0.8583,0.4933,0.5044
9,0.7892,0.8211,0.8839,0.8182,0.8498,0.498,0.5022


In [58]:
tuned_knn = tune_model(knn, custom_grid = {'n_neighbors' : np.arange(3,25,1)})

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8563,0.8642,0.9381,0.8618,0.8983,0.6549,0.6618
1,0.7904,0.7796,0.9204,0.8,0.856,0.4782,0.4943
2,0.8683,0.881,0.9823,0.8473,0.9098,0.6703,0.6961
3,0.7784,0.7975,0.885,0.8065,0.8439,0.4653,0.4712
4,0.8204,0.8112,0.9027,0.843,0.8718,0.573,0.5767
5,0.8204,0.8319,0.9027,0.843,0.8718,0.573,0.5767
6,0.8072,0.8201,0.9115,0.824,0.8655,0.5282,0.5366
7,0.8012,0.7738,0.8938,0.8279,0.8596,0.5211,0.5256
8,0.7952,0.8056,0.9115,0.811,0.8583,0.4933,0.5044
9,0.7892,0.8211,0.8839,0.8182,0.8498,0.498,0.5022


In [59]:
res = predict_model(tuned_knn,data=final_df)

In [60]:
prediction = []
for id in list(test_df['Emp_ID']):
    prediction.append(int(res[res['Emp_ID']==id]['Label']))

submission = pd.DataFrame({'Emp_ID':test_df['Emp_ID'],'Target':prediction})

In [63]:
submission.to_csv('submission_knn_tuned.csv',index=False)

In [62]:
submission.Target.value_counts()

0    495
1    246
Name: Target, dtype: int64

# Ensemble Voting Model

In [64]:
kmodes_df = pd.read_csv('/content/submission_kmodes.csv')
meanshift_df = pd.read_csv('/content/submission_meanshift.csv')
knn_df = pd.read_csv('/content/submission_knn.csv')

In [68]:
kmodes_df = kmodes_df.rename(columns={'Target':'kmodes'})
meanshift_df=meanshift_df.rename(columns={'Target':'mean'})
knn_df=knn_df.rename(columns={'Target':'knn'})

In [73]:
df = pd.merge(kmodes_df,meanshift_df,on=['Emp_ID'])
df = pd.merge(df,knn_df,on=['Emp_ID'])

In [85]:
k = df.drop(columns=['Emp_ID']).values
m = stats.mode(k,axis=1)

In [92]:
submission = pd.DataFrame({'Emp_ID':df.Emp_ID,'Target':m[0].reshape(-1)})

In [95]:
submission.to_csv('submission_voting.csv',index=False)

# Anomaly Detection


In [96]:
from pycaret.anomaly import *

In [102]:
exp = setup(final_df,numeric_features=['num_of_designation_held','change_in_age'],ignore_features=['Emp_ID'])

Unnamed: 0,Description,Value
0,session_id,7508
1,Original Data,"(2381, 28)"
2,Missing Values,False
3,Numeric Features,15
4,Categorical Features,12
5,Ordinal Features,False
6,High Cardinality Features,False
7,High Cardinality Method,
8,Transformed Data,"(2381, 143)"
9,CPU Jobs,-1


In [148]:
iforest = create_model('iforest')

In [144]:
_res = assign_model(pca)

In [145]:
for i in range(len(pca_res)):
    if pca_res['Anomaly'][i]==0:
       pca_res['Anomaly'][i] = 1
    else:
       pca_res['Anomaly'][i] = 0

In [146]:
pca_res['Anomaly'].value_counts()

1    2262
0     119
Name: Anomaly, dtype: int64

In [147]:
print('Accuracy:',accuracy_score(pca_res['is_leaving'],pca_res['Anomaly']))

Accuracy: 0.6816463670726586


In [158]:
def score_model(model):
    model = create_model(model)
    res = assign_model(model)
    for i in range(len(res)):
        if res['Anomaly'][i]==0:
            res['Anomaly'][i] = 1
        else:
            res['Anomaly'][i] = 0
    print('Accuracy:',accuracy_score(res['is_leaving'],res['Anomaly']))
    return res

In [151]:
score_model('knn')

Accuracy: 0.7051658966820663


In [152]:
score_model('iforest')

Accuracy: 0.7169256614867703


In [153]:
score_model('cluster')

Accuracy: 0.7018059638807224


In [154]:
score_model('histogram')

Accuracy: 0.7202855942881142


In [155]:
score_model('lof')

Accuracy: 0.6614867702645947


In [156]:
score_model('sos')

Accuracy: 0.6631667366652667


In [157]:
score_model('svm')

Accuracy: 0.6787064258714826


In [159]:
df = score_model('histogram')

Accuracy: 0.7202855942881142


In [161]:
prediction = []
for id in list(test_df['Emp_ID']):
    prediction.append(int(df[df['Emp_ID']==id]['Anomaly']))

submission = pd.DataFrame({'Emp_ID':test_df['Emp_ID'],'Target':prediction})

In [164]:
submission.to_csv('submission_hist_anomaly.csv',index=False)