In [1]:
from exploratory import Explore, ExploreTrain
import pandas as pd

# Load the admissions dataset through the project's Explore helper.
# read_csv() returns two values, unpacked here as the data (df_admit) and
# its features (features_admit).
ADMIT_CSV_PATH = 'data/admit.csv'

explore_admit = Explore(ADMIT_CSV_PATH)
df_admit, features_admit = explore_admit.read_csv()

In [None]:
# Let's look at the shape of the data.
df_admit.shape

In [None]:
# There are a lot of rows and only 12 features.
# Let's look at the features.
features_admit

In [None]:
# Let's check whether there are any NA values to drop by summing the NA flags.

df_admit.isna().sum()

In [None]:
# While 2008 is a lot of NA values, it is small compared to 100,000 rows,
# so let's go ahead and drop the NA values.
# Note: df_admit is rebound here, so the frame as originally loaded is no longer available under this name.

df_admit = explore_admit.drop_na(df_admit)

In [None]:
# Let's look at the shape one more time, now that drop_na has been applied.

df_admit.shape

In [None]:
# Still a lot of rows.
# Let's look at the distribution of the dataset.
# check_distribution is called on the ExploreTrain class itself; no instance is created.

ExploreTrain.check_distribution(df_admit, features_admit)

In [None]:
# Let's examine the features more closely, passing 'Admission Decision'
# (the target variable of this analysis) as the column of interest.
explore_admit.display_chart(df_admit, features_admit, 'Admission Decision')

In [None]:
# Deposit Paid clearly leaks the target: the chart shows a 100% acceptance rate
# whenever the deposit is paid. To look for any other leakage, sketch the assumed
# causal paths (DAGs).
#
# Direct paths into the decision:
#   RL  -> Admission Decision -> Deposit Paid
#   EA  -> Admission Decision -> Deposit Paid
#   GPA -> Admission Decision -> Deposit Paid
#   SAT -> Admission Decision -> Deposit Paid
#
# Paths through the application process:
#   EA  -> Application Type -> Application Fee -> Interview Feedback -> Admission Decision -> Deposit Paid
#   GPA -> Application Type -> Application Fee -> Interview Feedback -> Admission Decision -> Deposit Paid
#   SAT -> Application Type -> Application Fee -> Interview Feedback -> Admission Decision -> Deposit Paid
#
# Paths through the interview only:
#   RL  -> Interview Feedback -> Admission Decision -> Deposit Paid
#   EA  -> Interview Feedback -> Admission Decision -> Deposit Paid
#   GPA -> Interview Feedback -> Admission Decision -> Deposit Paid
#   SAT -> Interview Feedback -> Admission Decision -> Deposit Paid
#
# Conclusion: treat Deposit Paid, Application Type, Interview Feedback and
# Application Fee as leaking features and drop them.
# NOTE(review): in the paths as drawn, only Deposit Paid sits downstream of
# Admission Decision; the other three sit upstream of it. Confirm they are
# really unavailable at prediction time before treating them as leakage.
leaky_columns = ['Deposit Paid', 'Application Type', 'Interview Feedback', 'Application Fee']

# The row identifiers are dropped as well.
identifier_columns = ['Student ID', 'First Name', 'Last Name']

# Same two drop_columns calls as before, in the same order: leaky features first,
# then the identifiers.
for columns_to_drop in (leaky_columns, identifier_columns):
    df_admit = explore_admit.drop_columns(df_admit, columns_to_drop)

In [None]:
# Let's now explore the reduced dataset with PyCaret, with 'Admission Decision' as the target.
ExploreTrain.pycaret_explore(df_admit, 'Admission Decision')

In [None]:
# The class report shows a difference in recall between the target classes.
# Let's look at the dataset more closely, particularly the target variable.

df_admit['Admission Decision'].value_counts()

In [None]:
# There is clearly some imbalance in the dataset. Because of this, the best choice of classifier
# would be AdaBoost, as it is great for imbalanced datasets and has the best F1, kappa, and MCC scores.
# NOTE(review): AdaBoost does not correct for class imbalance by itself — confirm the choice rests on
# the F1/kappa/MCC comparison rather than on the algorithm alone.
# Let's look more carefully using AdaBoost (passed as the model id 'ada').
ExploreTrain.use_specific_model_pycaret(df_admit, 'Admission Decision', 'ada')

In [None]:
# The classification metrics are not as good for accepted students (target 0):
# precision and recall are only 0.542 and 0.262 respectively.
# For denied students (target 1), however, precision and recall are high (0.765 and 0.915),
# so it is clear that the model is very good at predicting denied students.
# This gap is likely due to the imbalance in the dataset, so we cannot use the model
# to predict accepted students.

#this concludes our analysis of the dataset