In [165]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from prince import MCA
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

In [84]:
admit_data = pd.read_csv("../datasets/Admission and Test Scores/ADM_2015-2021_data.csv", low_memory=False)

In [73]:
missing = admit_data.isna().sum()
missing /= admit_data.shape[0]
missing *=100
missing = missing.to_frame().rename(columns={0:'Percent Of Missing Values'})
missing

Unnamed: 0,Percent Of Missing Values
unitid,0.0
admcon1,0.0
admcon2,0.0
admcon3,0.0
admcon4,0.0
admcon5,0.0
admcon6,0.0
admcon7,0.0
admcon8,0.0
admcon9,0.0


In [181]:
admit_data = admit_data.dropna(subset=['admcon9'])
categorical = admit_data[['unitid', 'admcon1', 'admcon2', 'admcon3', 'admcon4', 'admcon5', 'admcon6', 'admcon7', 'admcon8', 'admcon9']]
categorical_unique = categorical.drop_duplicates(subset='unitid')


In [87]:
categorical_unique_wo_id = categorical_unique.drop('unitid', axis=1)

In [88]:
for c in categorical_unique_wo_id.columns:
    print(f"Column: {c}, nunique: {categorical_unique_wo_id[c].nunique()}")

Column: admcon1, nunique: 4
Column: admcon2, nunique: 4
Column: admcon3, nunique: 4
Column: admcon4, nunique: 4
Column: admcon5, nunique: 4
Column: admcon6, nunique: 5
Column: admcon7, nunique: 4
Column: admcon8, nunique: 4
Column: admcon9, nunique: 5


In [91]:
mca = MCA(n_components = 20, n_iter = 3, random_state = 101)
mca.fit(categorical_unique_wo_id)
cat_mca = mca.transform(categorical_unique_wo_id)
cat_mca = pd.concat([categorical_unique['unitid'], cat_mca], axis = 1)
print(f"Explained variance: {np.sum(mca.explained_inertia_)}")


Explained variance: 0.8309043967287372


In [239]:
numerical_data = admit_data.drop(['admcon1', 'admcon2', 'admcon3', 'admcon4', 'admcon5', 'admcon6', 'admcon7', 'admcon8', 'admcon9'], axis = 1, inplace = False)
numerical_data_unique = numerical_data.groupby(['unitid']).mean()
numerical_data_unique.dropna(subset=['admssn'])

Unnamed: 0_level_0,applcn,applcnm,applcnw,admssn,admssnm,admssnw,enrlt,enrlm,enrlw,enrlft,...,actcm25,actcm75,acten25,acten75,actmt25,actmt75,satwr25,satwr75,actwr25,year
unitid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100654,8455.142857,2970.571429,5483.714286,7153.428571,2449.428571,4703.142857,1537.142857,632.000000,904.857143,1512.142857,...,15.571429,19.285714,14.000000,20.000000,15.000000,18.0,370.0,457.0,,2018.0
100663,9018.000000,3224.142857,5793.857143,7046.428571,2510.428571,4536.000000,2165.000000,799.000000,1366.000000,2111.571429,...,21.714286,28.857143,22.142857,31.428571,19.857143,26.0,,,,2018.0
100706,4817.000000,2572.857143,2244.142857,3791.857143,2137.714286,1654.142857,1297.714286,817.714286,480.000000,1286.714286,...,24.571429,30.857143,24.428571,33.000000,23.571429,29.0,,,,2018.0
100724,7387.000000,2329.571429,4957.714286,5943.285714,1805.571429,4039.285714,1012.285714,369.714286,642.571429,980.428571,...,15.285714,19.571429,13.857143,19.857143,14.714286,17.0,,,,2018.0
100751,38622.428571,14712.428571,23910.000000,25567.285714,9786.000000,15781.285714,7100.571429,3029.000000,4071.571429,7067.714286,...,22.571429,31.142857,22.571429,33.000000,20.857143,29.0,480.0,600.0,7.0,2018.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496681,599.000000,29.000000,570.000000,185.000000,7.000000,178.000000,185.000000,7.000000,178.000000,185.000000,...,,,,,,,,,,2021.0
496973,27.000000,3.000000,24.000000,26.000000,3.000000,23.000000,18.000000,2.000000,16.000000,17.000000,...,,,,,,,,,,2021.0
497037,41.000000,3.000000,38.000000,41.000000,3.000000,38.000000,32.000000,2.000000,30.000000,23.000000,...,,,,,,,,,,2021.0
497213,4.000000,2.000000,2.000000,4.000000,2.000000,2.000000,3.000000,1.000000,2.000000,2.000000,...,,,,,,,,,,2021.0


In [104]:
def get_missing(df):
    missing = df.isna().sum()
    missing /= df.shape[0]
    missing *= 100
    missing = missing.to_frame().rename(columns={0: 'Percent Missing Values'})
    return missing

In [264]:
numerical_data_unique_clean = numerical_data_unique.copy()

In [197]:
def backcalc_and_update_mwt(df, w, m, t):

    df[w].fillna(df[t] - df[m], inplace = True)
    df[m].fillna(df[t] - df[w], inplace = True)
    df[t].fillna(df[m] + df[w], inplace = True)


In [155]:
def update_sat_act(df, sat, act):
    
    df[sat].fillna(pd.Series(np.int32(1200/36 * df[act] + 400)), inplace = True)
    df[act].fillna(pd.Series(np.int32(36.0/1600.0 * df[sat])), inplace = True)

In [265]:
backcalc_and_update_mwt(numerical_data_unique_clean, 'admssnw', 'admssnm', 'admssn')
backcalc_and_update_mwt(numerical_data_unique_clean, 'enrlw', 'enrlm', 'enrlt')
backcalc_and_update_mwt(numerical_data_unique_clean, 'enrlftw', 'enrlftm', 'enrlft')
backcalc_and_update_mwt(numerical_data_unique_clean, 'enrlptw', 'enrlptm', 'enrlpt')

numerical_data_unique_clean['enrlptw'].fillna(0, inplace=True)
numerical_data_unique_clean['enrlpt'].fillna(0, inplace=True)
numerical_data_unique_clean['enrlptm'].fillna(0, inplace=True)

numerical_data_unique_clean['enrlftw'].fillna(0, inplace=True)
numerical_data_unique_clean['enrlft'].fillna(0, inplace=True)
numerical_data_unique_clean['enrlftm'].fillna(0, inplace=True)

numerical_data_unique_clean['enrlt'].fillna(0, inplace=True)
numerical_data_unique_clean['enrlm'].fillna(0, inplace=True)
numerical_data_unique_clean['enrlw'].fillna(0, inplace=True)

numerical_data_unique_clean['admssn'].fillna(0, inplace=True)
numerical_data_unique_clean['admssnm'].fillna(0, inplace=True)
numerical_data_unique_clean['admssnw'].fillna(0, inplace=True)

numerical_data_unique_clean.drop('satnum', axis = 1, inplace = True)
numerical_data_unique_clean.drop('actnum', axis = 1, inplace = True)
numerical_data_unique_clean.drop('actwr25', axis = 1, inplace = True)
numerical_data_unique_clean.drop('satwr75', axis = 1, inplace = True)
numerical_data_unique_clean.drop('satwr25', axis = 1, inplace = True)
numerical_data_unique_clean.drop('actmt75', axis = 1, inplace = True)

update_sat_act(numerical_data_unique_clean, 'satvr25', 'actcm25')
update_sat_act(numerical_data_unique_clean, 'satvr75', 'actcm75')

In [266]:
imp = IterativeImputer(missing_values=np.nan, estimator = RandomForestRegressor(n_estimators=4,
        max_depth=10,
        bootstrap=True,
        max_samples=0.5,
        n_jobs=2,
        random_state=0,), tol = 1e-2, max_iter = 25)

imp.fit(numerical_data_unique_clean)


In [267]:
numerical_data_unique_clean_imp = pd.DataFrame(imp.transform(numerical_data_unique_clean), columns=numerical_data_unique_clean.columns, index = numerical_data_unique_clean.index)

In [268]:
# cat_mca.reset_index()
numerical_data_unique_clean_imp.reset_index(inplace=True)

In [259]:
cat_mca.index.name = 'index'
numerical_data_unique_clean_imp.index.name = 'smh'

In [269]:
numerical_data_unique_clean_imp.columns

Index(['unitid', 'applcn', 'applcnm', 'applcnw', 'admssn', 'admssnm',
       'admssnw', 'enrlt', 'enrlm', 'enrlw', 'enrlft', 'enrlftm', 'enrlftw',
       'enrlpt', 'enrlptm', 'enrlptw', 'satpct', 'actpct', 'satvr25',
       'satvr75', 'satmt25', 'satmt75', 'actcm25', 'actcm75', 'acten25',
       'acten75', 'actmt25', 'year'],
      dtype='object')

In [250]:
def print_id_row(df):
    print(f"Rows: {df.shape[0]}, nunique: {df['unitid'].nunique()}")

In [270]:
print_id_row(cat_mca)
print_id_row(numerical_data_unique_clean_imp)

Rows: 2487, nunique: 2487
Rows: 2487, nunique: 2487


In [271]:
all_features = pd.merge(cat_mca, numerical_data_unique_clean_imp, on = 'unitid')

In [273]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(all_features)
normalized_all_features = scaler.transform(all_features)



In [274]:
from sklearn.decomposition import PCA
pca = PCA(n_components=15)
pca.fit(normalized_all_features)
print(np.sum(pca.explained_variance_ratio_))
print(pca.explained_variance_ratio_)

0.7615361673939105
[0.29200767 0.13068411 0.05478574 0.03891265 0.03091534 0.02603826
 0.02110753 0.02100074 0.0209537  0.02091426 0.02085635 0.02084519
 0.02084045 0.02083729 0.02083688]


In [275]:
pca_all_features = pca.transform(normalized_all_features)