In [35]:
import pandas as pd
from exploratory import Explore
from exploratory import ExploreTrain

# Load the data through the project's Explore helper.
# read_csv() returns two values, unpacked here into the data frame and
# (presumably) the list of feature names; later cells reuse all three names.
explore_promote = Explore('data/promote.csv')
df_promote, features_promote = explore_promote.read_csv()

In [36]:
#shape (rows, columns) of df_promote before dropping rows with missing values
shape_before = df_promote.shape
print(shape_before)

#drop_na hands back the cleaned frame; rebind the same name so later cells use it
df_promote = explore_promote.drop_na(df_promote)

#shape after dropping - shown as the cell's output for comparison with the line printed above
df_promote.shape

(100, 17)


(100, 17)

In [37]:
#the frame has very few rows relative to its number of columns
#so start with the label: count how many rows fall into each class of the target
target_column = 'Promoted'
df_promote[target_column].value_counts()


Promoted
1    74
0    26
Name: count, dtype: int64

In [38]:
#the target is quite imbalanced: 74 promoted vs only 26 not promoted
#nonetheless we should explore a bit more
#display bar charts of each feature's mean for the target variable
explore_promote.display_chart(df_promote, features_promote, 'Promoted')


In [39]:
#let's see the columns in our dataset, to decide which ones are usable as features
df_promote.columns



Index(['Employee ID', 'Name', 'Date of Birth', 'Hire Date', 'Age',
       'Years at Company', 'Role', 'Department', 'Email', 'Phone', 'Address',
       'Gender', 'Salary', 'Previous Company', 'Degree Earned', 'Skills',
       'Promoted'],
      dtype='object')

In [40]:
#we can drop Employee ID, Name, Date of Birth, Hire Date, Email, Phone, Address
#because they are either row identifiers or repetitive columns
extra_columns = ['Employee ID', 'Name', 'Date of Birth', 'Hire Date', 'Email', 'Phone', 'Address']

#there may be a few leakages, let's try making DAGs
#Years at Company -> Skills -> Promoted -> Salary
#Role -> Skills -> Promoted -> Salary -> Role -> Department
#Years at Company -> Role -> Promoted -> Salary
#Degree Earned -> Skills -> Promoted -> Skills
#Role -> Salary -> Promoted
#Age -> Salary -> Promoted
#Age -> Promoted
#Degree Earned -> Role -> Years at Company -> Promoted

#Salary and Skills are clearly leakage; Role and Department may be leakage as well
#Hire Date (already listed in extra_columns above) and Age may play similar roles,
#so Age is dropped here too
leakage_columns = ['Salary', 'Age', 'Skills', 'Role', 'Department']

#this cell rebinds df_promote, so on a second run the columns are already gone.
#only request columns that are still present, and skip the call when none are left,
#so re-running the cell does not depend on how drop_columns treats missing names.
#on a fresh run this makes the same two calls, in the same order, with the same lists.
for column_group in (extra_columns, leakage_columns):
    still_present = [name for name in column_group if name in df_promote.columns]
    if still_present:
        df_promote = explore_promote.drop_columns(df_promote, columns = still_present)


In [41]:
#let's use pycaret to explore the data, via the project's ExploreTrain helper
#the output below shows a setup summary, a model comparison table,
#per-fold metrics from a 10-fold tuning run, and an interactive plot widget
ExploreTrain.pycaret_explore(df = df_promote, target = 'Promoted')

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Promoted
2,Target type,Binary
3,Original data shape,"(100, 5)"
4,Transformed data shape,"(100, 9)"
5,Transformed train set shape,"(70, 9)"
6,Transformed test set shape,"(30, 9)"
7,Numeric features,1
8,Categorical features,3
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.7429,0.5,1.0,0.7429,0.8513,0.0,0.0,0.087
dt,Decision Tree Classifier,0.7429,0.5,1.0,0.7429,0.8513,0.0,0.0,0.082
rf,Random Forest Classifier,0.7429,0.5767,1.0,0.7429,0.8513,0.0,0.0,0.252
ada,Ada Boost Classifier,0.7429,0.5,1.0,0.7429,0.8513,0.0,0.0,0.165
gbc,Gradient Boosting Classifier,0.7429,0.5,1.0,0.7429,0.8513,0.0,0.0,0.165
lda,Linear Discriminant Analysis,0.7429,0.5,1.0,0.7429,0.8513,0.0,0.0,0.084
et,Extra Trees Classifier,0.7429,0.6367,1.0,0.7429,0.8513,0.0,0.0,0.26
lightgbm,Light Gradient Boosting Machine,0.7429,0.5117,1.0,0.7429,0.8513,0.0,0.0,0.212
dummy,Dummy Classifier,0.7429,0.5,1.0,0.7429,0.8513,0.0,0.0,0.145
lr,Logistic Regression,0.7286,0.7017,0.98,0.7381,0.8407,-0.0235,-0.0258,0.178


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8571,0.5,1.0,0.8571,0.9231,0.0,0.0
1,0.8571,0.5,1.0,0.8571,0.9231,0.0,0.0
2,0.7143,0.5,1.0,0.7143,0.8333,0.0,0.0
3,0.7143,0.5,1.0,0.7143,0.8333,0.0,0.0
4,0.7143,0.5,1.0,0.7143,0.8333,0.0,0.0
5,0.7143,0.5,1.0,0.7143,0.8333,0.0,0.0
6,0.7143,0.5,1.0,0.7143,0.8333,0.0,0.0
7,0.7143,0.5,1.0,0.7143,0.8333,0.0,0.0
8,0.7143,0.5,1.0,0.7143,0.8333,0.0,0.0
9,0.7143,0.5,1.0,0.7143,0.8333,0.0,0.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [42]:
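#it seems like we have an abnormally high Recall score because of the imbalance in the target variable
#The AUC is also 0.5 for most of the models (the same as the Dummy Classifier), meaning they are only as good as random chance;
#only rf, et, lightgbm and lr score above 0.5 in the comparison table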
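#Ridge classifier gives a decent AUC and F1 Score but there is clearly something wrong when looking more closely
#at the classification report - the precision, recall, f1, and support are too low due to a lack of data
#with unpromoted individuals
#(NOTE: no Ridge row appears in the comparison table above; Logistic Regression has the highest listed AUC, 0.7017 - confirm which model is meant)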

#Thus we recommend not training a machine learning model on this dataset: it is imbalanced, has too few data points,
#and its features are not good enough to predict the target variable
