In [9]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.decomposition import PCA
from pca import pca
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import pickle

In [5]:
def backcalc_and_update_mwt(df, w, m, t):
    df[w].fillna(df[t] - df[m], inplace = True)
    df[m].fillna(df[t] - df[w], inplace = True)
    df[t].fillna(df[m] + df[w], inplace = True)

def update_sat_act(df, sat, act):
    df[sat].fillna(pd.Series(np.int32(1200/36 * df[act] + 400)), inplace = True)
    df[act].fillna(pd.Series(np.int32(36.0/1600.0 * df[sat])), inplace = True)

In [6]:
admit_data = pd.read_csv("datasets/Admission and Test Scores/ADM_2015-2021_data.csv")
numerical_data = admit_data.drop(['admcon1', 'admcon2', 'admcon3', 'admcon4', 'admcon5', 'admcon6', 'admcon7', 'admcon8', 'admcon9'], axis = 1, inplace = False)
numerical_data_unique = numerical_data.groupby(['unitid']).mean()
numerical_data_unique.dropna(subset=['admssn'])
numerical_data_unique_clean = numerical_data_unique.copy()

In [7]:
backcalc_and_update_mwt(numerical_data_unique_clean, 'admssnw', 'admssnm', 'admssn')
backcalc_and_update_mwt(numerical_data_unique_clean, 'enrlw', 'enrlm', 'enrlt')
backcalc_and_update_mwt(numerical_data_unique_clean, 'enrlftw', 'enrlftm', 'enrlft')
backcalc_and_update_mwt(numerical_data_unique_clean, 'enrlptw', 'enrlptm', 'enrlpt')

numerical_data_unique_clean['enrlptw'].fillna(0, inplace=True)
numerical_data_unique_clean['enrlpt'].fillna(0, inplace=True)
numerical_data_unique_clean['enrlptm'].fillna(0, inplace=True)

numerical_data_unique_clean['enrlftw'].fillna(0, inplace=True)
numerical_data_unique_clean['enrlft'].fillna(0, inplace=True)
numerical_data_unique_clean['enrlftm'].fillna(0, inplace=True)

numerical_data_unique_clean['enrlt'].fillna(0, inplace=True)
numerical_data_unique_clean['enrlm'].fillna(0, inplace=True)
numerical_data_unique_clean['enrlw'].fillna(0, inplace=True)

numerical_data_unique_clean['admssn'].fillna(0, inplace=True)
numerical_data_unique_clean['admssnm'].fillna(0, inplace=True)
numerical_data_unique_clean['admssnw'].fillna(0, inplace=True)

numerical_data_unique_clean.drop('satnum', axis = 1, inplace = True)
numerical_data_unique_clean.drop('actnum', axis = 1, inplace = True)
numerical_data_unique_clean.drop('actwr25', axis = 1, inplace = True)
numerical_data_unique_clean.drop('satwr75', axis = 1, inplace = True)
numerical_data_unique_clean.drop('satwr25', axis = 1, inplace = True)
numerical_data_unique_clean.drop('actmt75', axis = 1, inplace = True)

update_sat_act(numerical_data_unique_clean, 'satvr25', 'actcm25')
update_sat_act(numerical_data_unique_clean, 'satvr75', 'actcm75')

In [10]:
imp = IterativeImputer(missing_values=np.nan, estimator = RandomForestRegressor(n_estimators=4,
        max_depth=10,
        bootstrap=True,
        max_samples=0.5,
        n_jobs=2,
        random_state=0,), tol = 1e-2, max_iter = 25)

imp.fit(numerical_data_unique_clean)

In [11]:
numerical_data_unique_clean_imp = pd.DataFrame(imp.transform(numerical_data_unique_clean), columns=numerical_data_unique_clean.columns, index = numerical_data_unique_clean.index)
numerical_data_unique_clean_imp.reset_index(inplace=True)
numerical_data_unique_clean_imp.index.name = 'index'
numerical_data_unique_clean_imp

Unnamed: 0_level_0,unitid,applcn,applcnm,applcnw,admssn,admssnm,admssnw,enrlt,enrlm,enrlw,...,satvr25,satvr75,satmt25,satmt75,actcm25,actcm75,acten25,acten75,actmt25,year
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,100654,8455.142857,2970.571429,5483.714286,7153.428571,2449.428571,4703.142857,1537.142857,632.000000,904.857143,...,406.857143,499.285714,391.428571,495.000000,15.571429,19.285714,14.000000,20.000000,15.000000,2018.0
1,100663,9018.000000,3224.142857,5793.857143,7046.428571,2510.428571,4536.000000,2165.000000,799.000000,1366.000000,...,520.428571,649.714286,527.142857,677.285714,21.714286,28.857143,22.142857,31.428571,19.857143,2018.0
2,100706,4817.000000,2572.857143,2244.142857,3791.857143,2137.714286,1654.142857,1297.714286,817.714286,480.000000,...,562.428571,684.285714,551.428571,704.285714,24.571429,30.857143,24.428571,33.000000,23.571429,2018.0
3,100724,7387.000000,2329.571429,4957.714286,5943.285714,1805.571429,4039.285714,1012.285714,369.714286,642.571429,...,412.857143,498.571429,392.571429,493.571429,15.285714,19.571429,13.857143,19.857143,14.714286,2018.0
4,100751,38622.428571,14712.428571,23910.000000,25567.285714,9786.000000,15781.285714,7100.571429,3029.000000,4071.571429,...,525.714286,641.428571,515.714286,650.000000,22.571429,31.142857,22.571429,33.000000,20.857143,2018.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2482,496681,599.000000,29.000000,570.000000,185.000000,7.000000,178.000000,185.000000,7.000000,178.000000,...,496.325407,590.285204,482.893398,583.275964,20.037554,25.241336,19.000340,25.217433,18.294914,2021.0
2483,496973,27.000000,3.000000,24.000000,26.000000,3.000000,23.000000,18.000000,2.000000,16.000000,...,494.476190,579.464966,467.369589,574.330134,20.037554,25.241336,19.303102,25.401508,18.278253,2021.0
2484,497037,41.000000,3.000000,38.000000,41.000000,3.000000,38.000000,32.000000,2.000000,30.000000,...,494.152976,567.612075,477.857143,568.977976,20.164000,25.356812,19.303102,25.401508,18.440992,2021.0
2485,497213,4.000000,2.000000,2.000000,4.000000,2.000000,2.000000,3.000000,1.000000,2.000000,...,498.876190,585.280442,467.369589,576.272842,20.037554,25.212765,19.303102,25.660271,18.278253,2021.0


In [15]:
academic_stats = numerical_data_unique_clean_imp.copy()
public_pnp_finances = pd.read_csv("datasets/Institutional Finances/F_F2_1415-1920_data.csv", low_memory=False)
public_pnp_finances = public_pnp_finances.groupby('unitid').mean().reset_index()
enrollment = pd.read_csv("datasets/Fall Enrollment/EFD_2015-2020_data.csv")
enrollment = enrollment.groupby('unitid').mean().reset_index()
caliber = pd.merge(academic_stats, public_pnp_finances, on='unitid', how='inner')
caliber = pd.merge(caliber, enrollment, on='unitid', how='inner')
caliber = caliber.drop(['year_x', 'year_y', 'year'], axis=1)
caliber

  public_pnp_finances = public_pnp_finances.groupby('unitid').mean().reset_index()


Unnamed: 0,unitid,applcn,applcnm,applcnw,admssn,admssnm,admssnw,enrlt,enrlm,enrlw,...,rrftin,rrftcta,ret_nmf,ret_pcf,rrptct,rrptex,rrptin,rrptcta,ret_nmp,ret_pcp
0,100937,3128.714286,1308.000000,1820.714286,1760.857143,792.714286,967.714286,325.428571,164.142857,161.285714,...,0.0,355.000000,286.166667,80.500000,,,,,,
1,101073,2203.500000,550.000000,1653.500000,835.500000,199.000000,627.000000,212.500000,156.500000,56.000000,...,0.0,129.666667,56.333333,42.666667,6.000000,0.0,0.0,6.000000,1.666667,18.666667
2,101189,1962.285714,836.428571,1125.714286,1139.000000,521.142857,617.857143,287.571429,159.714286,127.857143,...,0.0,296.333333,169.833333,57.333333,17.166667,0.0,0.0,17.166667,7.666667,44.666667
3,101365,114.285714,46.142857,68.142857,103.142857,43.428571,59.714286,69.142857,31.142857,38.000000,...,0.0,1.833333,0.833333,55.000000,0.666667,0.0,0.0,0.666667,0.333333,50.000000
4,101435,2077.428571,1184.714286,892.714286,1228.428571,687.000000,541.428571,257.285714,149.857143,107.428571,...,0.0,259.000000,169.000000,65.333333,0.000000,0.0,0.0,0.000000,0.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,494463,209.000000,0.000000,209.000000,190.333333,0.000000,190.333333,173.000000,0.000000,173.000000,...,0.0,148.000000,142.500000,96.500000,,,,,,
1334,494685,89.666667,39.333333,50.333333,71.000000,30.333333,40.666667,60.000000,26.666667,33.333333,...,0.0,43.000000,31.500000,71.000000,1.000000,0.0,0.0,1.000000,0.000000,0.000000
1335,495031,16.000000,16.000000,0.000000,9.500000,9.500000,0.000000,6.000000,6.000000,0.000000,...,0.0,7.000000,5.000000,71.000000,,,,,,
1336,495767,96003.500000,44889.000000,51048.000000,81905.500000,38428.500000,43421.000000,15831.500000,8429.000000,7388.500000,...,0.0,15527.000000,13495.000000,87.000000,181.000000,0.0,0.0,181.000000,105.000000,58.000000


In [16]:
caliber['']

['unitid',
 'applcn',
 'applcnm',
 'applcnw',
 'admssn',
 'admssnm',
 'admssnw',
 'enrlt',
 'enrlm',
 'enrlw',
 'enrlft',
 'enrlftm',
 'enrlftw',
 'enrlpt',
 'enrlptm',
 'enrlptw',
 'satpct',
 'actpct',
 'satvr25',
 'satvr75',
 'satmt25',
 'satmt75',
 'actcm25',
 'actcm75',
 'acten25',
 'acten75',
 'actmt25',
 'f2a01',
 'f2a02',
 'f2a03',
 'f2a03a',
 'f2a04',
 'f2a05',
 'f2a05a',
 'f2a05b',
 'f2a06',
 'f2a11',
 'f2a12',
 'f2a13',
 'f2a15',
 'f2a16',
 'f2a17',
 'f2a18',
 'f2a19',
 'f2a20',
 'f2b01',
 'f2b02',
 'f2b03',
 'f2b04',
 'f2b05',
 'f2b06',
 'f2b07',
 'f2c01',
 'f2c02',
 'f2c03',
 'f2c04',
 'f2c05',
 'f2c06',
 'f2c07',
 'f2c08',
 'f2c09',
 'f2d01',
 'f2d012',
 'f2d013',
 'f2d014',
 'f2d02',
 'f2d022',
 'f2d023',
 'f2d024',
 'f2d03',
 'f2d032',
 'f2d033',
 'f2d034',
 'f2d04',
 'f2d042',
 'f2d043',
 'f2d044',
 'f2d05',
 'f2d052',
 'f2d053',
 'f2d054',
 'f2d06',
 'f2d062',
 'f2d063',
 'f2d064',
 'f2d07',
 'f2d072',
 'f2d073',
 'f2d074',
 'f2d08',
 'f2d082',
 'f2d083',
 'f2d084',
 '