In [441]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from scipy import stats
from sklearn.ensemble import RandomForestRegressor# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

In [304]:
def get_missing(df):
    missing = df.isna().sum()
    missing /= df.shape[0]
    missing *=100
    missing = missing.to_frame().rename(columns={0:'Percent Of Missing Values'})
    return missing


In [308]:
filepath = "datasets/Institutional Finances/F_F2_1415-1920_data.csv"
institutional_finance = pd.read_csv(filepath, index_col=0, low_memory= False)
missing = institutional_finance.isna().sum()
missing /= institutional_finance.shape[0]
missing *=100
missing = missing.to_frame().rename(columns={0:'Percent Of Missing Values'})
threshold = 30
institutional_finance_over_threshold_missing = institutional_finance[(missing[missing["Percent Of Missing Values"] < threshold]).index]
missing = institutional_finance_over_threshold_missing.isna().sum()
missing /= institutional_finance_over_threshold_missing.shape[0]
missing *=100
missing = missing.to_frame().rename(columns={0:'Percent Of Missing Values'})
# All missing entries with percent missing > 7 are verifiably 0 imputable
missing_keys = (missing[missing["Percent Of Missing Values"] > 7]).index
imputation_dict = {k: 0 for k in missing_keys}

cleaned_institutional_finance = institutional_finance_over_threshold_missing.fillna(imputation_dict)

# Remaining missing values are mean imputed -- very small fraction of rows in any case
num_cols = cleaned_institutional_finance.columns[cleaned_institutional_finance.dtypes.values != 'object']
cleaned_institutional_finance = institutional_finance_over_threshold_missing.fillna(value=cleaned_institutional_finance[num_cols].mean())
cleaned_institutional_finance = cleaned_institutional_finance[num_cols]
cleaned_institutional_finance.drop(columns= "f2d17", inplace = True)
sigma_threshold = 2
cleaned_institutional_finance_no_outliers = cleaned_institutional_finance[(np.abs(stats.zscore(cleaned_institutional_finance)) < sigma_threshold).all(axis=1)]

fnce_data_unique = cleaned_institutional_finance_no_outliers.groupby(['unitid']).mean().reset_index()


In [625]:
fnce_data_final = fnce_data_unique.drop(['year'], axis = 1)

In [613]:
train_features = fnce_data_final.drop(['unitid'], axis = 1)
model = PCA(n_components=train_features.shape[1]).fit(train_features)
X_pc = model.transform(train_features)

# number of components
n_pcs= model.components_.shape[0]

# get the index of the most important feature on EACH component
# LIST COMPREHENSION HERE
most_important = [np.abs(model.components_[i]).argmax() for i in range(n_pcs)]

initial_feature_names = train_features.columns
# get the names
most_important_names = [initial_feature_names[most_important[i]] for i in range(n_pcs)]

# LIST COMPREHENSION HERE AGAIN
dic = {'PC{}'.format(i): most_important_names[i] for i in range(n_pcs)}
expenses = ['unitid']
seen_categories = ['13']
for i in range(n_pcs):
    most_important_name = most_important_names[i]
    category = most_important_name[3:5]

    if 'e' in most_important_name and most_important_name != 'unitid' and most_important_name not in expenses and category not in seen_categories:
        # print(f"name: {most_important_name}, cat: {category}")
        expenses.append(most_important_name)
        seen_categories.append(category)
# expenses.append('unitid')
expenses = np.array(expenses)



In [432]:
expenses

array(['f2e011', 'f2e061', 'f2e137', 'f2e051', 'f2e071', 'f2e121',
       'f2e031', 'f2e021', 'f2e101', 'f2e081', 'f2e042', 'f2e092',
       'unitid'], dtype='<U6')

In [626]:
fnce_data_final_features = fnce_data_final[expenses]
for c in fnce_data_final_features.columns:
    if c != 'unitid':
        fnce_data_final_features[c] /= fnce_data_final['f2e131']

In [627]:
for c in fnce_data_final_features.columns:
    if c!= 'unitid':
        fnce_data_final_features = fnce_data_final_features[fnce_data_final_features[c] < 1]


In [628]:
fnce_data_final_features['unitid']

0       100690
1       100937
2       101073
3       101189
4       101365
         ...  
1837    495059
1840    495192
1843    495280
1844    495314
1845    495916
Name: unitid, Length: 1566, dtype: int64

In [53]:
# possible_unitids = pd.read_csv("datasets/IC_CLUSTERS_AND_MCA.csv")
# fnce_data = pd.merge(fnce_data, possible_unitids[['unitid']], on = 'unitid', how = 'inner').drop(['year'], axis = 1)


Unnamed: 0,unitid,f2a01,f2a02,f2a03,f2a03a,f2a04,f2a05,f2a05a,f2a05b,f2a06,...,f2e122,f2e131,f2e132,f2e133,f2e134,f2e135,f2e136,f2e137,f2h01,f2c10
0,100690,8.760923e+06,1.307510e+07,2.535914e+06,1.508292e+06,9.889659e+06,6.495275e+05,1.748072e+05,4.747203e+05,1.053919e+07,...,0.000000,7385488.0,3.845044e+06,586241.5,1.436238e+05,1.294402e+05,97263.0,2.583876e+06,1.748045e+05,3.622754e+06
1,100937,5.235002e+07,1.722822e+08,5.624674e+07,3.738436e+07,5.059348e+07,6.544197e+07,4.314847e+07,2.229350e+07,1.160355e+08,...,0.166667,46392031.5,1.820079e+07,4195823.5,4.335043e+06,3.221013e+06,2211438.5,1.422793e+07,5.656075e+07,2.076610e+07
2,101073,1.404627e+06,1.362275e+07,4.539956e+06,0.000000e+00,6.643169e+06,2.439628e+06,1.393872e+06,1.045756e+06,9.082797e+06,...,0.000000,10922549.0,2.847217e+06,664789.0,7.551130e+05,7.050923e+05,0.0,5.950338e+06,1.338949e+06,7.203375e+06
3,101189,2.055781e+07,8.991518e+07,4.270119e+07,3.311000e+07,2.931309e+07,1.790090e+07,1.260574e+07,5.295168e+06,4.721399e+07,...,0.000000,43958975.4,1.907465e+07,3718710.4,3.581797e+06,2.692246e+06,1182462.4,1.370911e+07,1.840186e+07,1.380247e+07
4,101365,0.000000e+00,8.337251e+06,3.799330e+06,0.000000e+00,4.537922e+06,0.000000e+00,0.000000e+00,0.000000e+00,4.537922e+06,...,42160.500000,5469523.5,2.198460e+06,411450.5,1.899160e+05,8.263625e+04,0.0,2.587061e+06,2.224978e+08,1.017367e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1673,495192,0.000000e+00,1.167080e+07,1.727437e+06,0.000000e+00,1.317455e+06,8.625911e+06,0.000000e+00,8.625911e+06,9.943366e+06,...,0.000000,7871324.0,2.925835e+06,722966.0,0.000000e+00,0.000000e+00,882.0,4.221641e+06,2.224978e+08,1.066775e+06
1674,495226,2.726673e+08,4.611618e+08,1.300061e+08,6.749763e+07,1.515092e+08,1.796464e+08,7.699077e+07,1.026557e+08,3.311557e+08,...,0.000000,50382.0,0.000000e+00,0.0,3.713200e+04,0.000000e+00,0.0,1.325000e+04,2.224978e+08,0.000000e+00
1675,495271,0.000000e+00,1.084680e+06,0.000000e+00,0.000000e+00,1.069680e+06,1.500000e+04,0.000000e+00,1.500000e+04,1.084680e+06,...,0.000000,1276977.0,8.107730e+05,66694.0,0.000000e+00,0.000000e+00,0.0,3.995100e+05,2.224978e+08,0.000000e+00
1676,495280,7.725700e+04,2.024860e+06,1.228340e+05,9.407800e+04,1.699223e+06,2.028030e+05,0.000000e+00,2.028030e+05,1.902026e+06,...,0.000000,1001533.0,5.477880e+05,24345.0,9.494600e+04,4.147800e+04,5151.0,2.878250e+05,2.224978e+08,5.052500e+04


In [320]:
fin_aid_orig = pd.read_csv("datasets/Student Financial Aid/SFA_1415-2021_data.csv")


In [342]:
target = 'uagrntn'

In [447]:
def test(X, y):
    # if np.count_nonzero(y == 0) > 0:
    #     lr = LinearRegression().fit(X, y)
    #     print(lr.score(X,y))
    # else:
    offset = int(X.shape[0] * 0.85)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    reg = LazyRegressor(verbose=1, ignore_warnings=True, custom_metric=None)
    models, predictions = reg.fit(X_train, X_test, y_train, y_test)
    print(models)


In [629]:
fnce_data_final_features['unitid']

0       100690
1       100937
2       101073
3       101189
4       101365
         ...  
1837    495059
1840    495192
1843    495280
1844    495314
1845    495916
Name: unitid, Length: 1566, dtype: int64

In [630]:
total_students = pd.read_csv("datasets/12-Month Enrollment/EFFY_2015-2021_data.csv")
total_students = total_students.groupby('unitid').mean().reset_index()


In [631]:
total_students = total_students[['unitid', 'efytotlt']]

In [632]:
def test_set(datapath, target, second = None, div_by_student = True, div_by_expense = True):
    df = pd.read_csv(datapath, low_memory=False)
    print(target)
    df_subset = df[['unitid', target]]
    df_subset.dropna(inplace=True, axis = 0)
    print(df_subset.shape)
    df_subset = df_subset.groupby('unitid').mean().reset_index()
    if second != None:
        df_subset[target] /= df[second]
        df_subset = df_subset.drop([second], axis=1)
    if div_by_student:
        df_subset = pd.merge(df_subset, total_students, how='inner', on='unitid')
        df_subset[target] /= df_subset['efytotlt']
        df_subset = df_subset.drop(['efytotlt'], axis = 1)
    if div_by_expense:
        expenses = fnce_data_final[['f2e131', 'unitid']]
        df_subset = pd.merge(df_subset, expenses, on='unitid', how='inner')
        print(df_subset.columns)
        df_subset[target] /= df_subset['f2e131']
        print(df_subset.shape)
        df_subset = df_subset.drop(['f2e131'], axis = 1)
    print(df_subset['unitid'])
    merged = pd.merge(df_subset, fnce_data_final_features, how='inner', on='unitid')
    print(merged.shape)
    print(get_missing(merged))
    merged.dropna(inplace=True)
    X = merged.drop(['unitid', target], axis = 1)
    y = np.array(merged[target])
    # y[np.array(np.where(y ==0))] = 1e-9
    print(f"X shape: {X.shape}, y shape: {y.shape}")
    test(X, y)


In [425]:
from sklearn.linear_model import LinearRegression

In [638]:
test_set("datasets/Admission and Test Scores/ADM_2015-2021_data.csv", 'actcm75', div_by_student=False, div_by_expense=False)

actcm75
(8460, 2)
0       100654
1       100663
2       100706
3       100724
4       100751
         ...  
1437    489937
1438    490179
1439    490805
1440    494603
1441    495767
Name: unitid, Length: 1442, dtype: int64
(739, 13)
         Percent Of Missing Values
unitid                        0.00
actcm75                       0.00
f2e011                        0.00
f2e061                        0.00
f2e051                        0.00
f2e071                        0.00
f2e121                        0.00
f2e031                        0.00
f2e021                        0.00
f2e101                        0.00
f2e081                        0.00
f2e042                        0.00
f2e092                        0.00
X shape: (739, 11), y shape: (739,)


  2%|▏         | 1/42 [00:00<00:05,  6.92it/s]

{'Model': 'AdaBoostRegressor', 'R-Squared': 0.028111217828019508, 'Adjusted R-Squared': -0.0798764246355339, 'RMSE': 3.670230811104469, 'Time taken': 0.14453649520874023}


 17%|█▋        | 7/42 [00:00<00:01, 26.14it/s]

{'Model': 'BaggingRegressor', 'R-Squared': -0.04646536718215155, 'Adjusted R-Squared': -0.1627392968690573, 'RMSE': 3.808443543678789, 'Time taken': 0.05385899543762207}
{'Model': 'BayesianRidge', 'R-Squared': 0.0484153496304317, 'Adjusted R-Squared': -0.05731627818840934, 'RMSE': 3.631690299848817, 'Time taken': 0.008973121643066406}
{'Model': 'DecisionTreeRegressor', 'R-Squared': -0.44838022142113987, 'Adjusted R-Squared': -0.6093113571345998, 'RMSE': 4.480499001213069, 'Time taken': 0.01265096664428711}
{'Model': 'DummyRegressor', 'R-Squared': -0.002806179757577132, 'Adjusted R-Squared': -0.1142290886195303, 'RMSE': 3.7281518475838915, 'Time taken': 0.0080108642578125}
{'Model': 'ElasticNet', 'R-Squared': 0.09366308967426307, 'Adjusted R-Squared': -0.007041011473041037, 'RMSE': 3.544295517508235, 'Time taken': 0.005984783172607422}
{'Model': 'ElasticNetCV', 'R-Squared': 0.08515254367248248, 'Adjusted R-Squared': -0.016497173697241685, 'RMSE': 3.56089718634389, 'Time taken': 0.062644

 24%|██▍       | 10/42 [00:00<00:01, 17.20it/s]

{'Model': 'ExtraTreesRegressor', 'R-Squared': 0.08711041232854444, 'Adjusted R-Squared': -0.014321764079395116, 'RMSE': 3.5570848004566744, 'Time taken': 0.24015164375305176}
{'Model': 'GammaRegressor', 'R-Squared': 0.0583247984721279, 'Adjusted R-Squared': -0.04630577947541359, 'RMSE': 3.6127312761844275, 'Time taken': 0.01592230796813965}
{'Model': 'GaussianProcessRegressor', 'R-Squared': -10.50557064781465, 'Adjusted R-Squared': -11.783967386460724, 'RMSE': 12.628133771308876, 'Time taken': 0.0748741626739502}


 29%|██▊       | 12/42 [00:00<00:02, 14.22it/s]

{'Model': 'GradientBoostingRegressor', 'R-Squared': -0.018815190482991406, 'Adjusted R-Squared': -0.13201687831443487, 'RMSE': 3.7577925222247464, 'Time taken': 0.12923765182495117}


 48%|████▊     | 20/42 [00:01<00:01, 11.95it/s]

{'Model': 'HistGradientBoostingRegressor', 'R-Squared': -0.008889723228129753, 'Adjusted R-Squared': -0.12098858136458857, 'RMSE': 3.739443202033164, 'Time taken': 0.7013707160949707}
{'Model': 'HuberRegressor', 'R-Squared': 0.09390294432839141, 'Adjusted R-Squared': -0.006774506301787442, 'RMSE': 3.543826502058635, 'Time taken': 0.02600693702697754}
{'Model': 'KNeighborsRegressor', 'R-Squared': 0.06071996857223916, 'Adjusted R-Squared': -0.04364447936417881, 'RMSE': 3.608133823015276, 'Time taken': 0.020943880081176758}
{'Model': 'KernelRidge', 'R-Squared': -47.16598841340872, 'Adjusted R-Squared': -52.517764903787466, 'RMSE': 25.837787997814534, 'Time taken': 0.03224825859069824}
{'Model': 'Lars', 'R-Squared': -18510.819961703302, 'Adjusted R-Squared': -20567.688846337005, 'RMSE': 506.53522374514785, 'Time taken': 0.013966560363769531}
{'Model': 'LarsCV', 'R-Squared': 0.05834515869166945, 'Adjusted R-Squared': -0.046283157009256115, 'RMSE': 3.612692220043092, 'Time taken': 0.02246785

 62%|██████▏   | 26/42 [00:02<00:01, 11.17it/s]

{'Model': 'MLPRegressor', 'R-Squared': -4.5128461637689385, 'Adjusted R-Squared': -5.125384626409932, 'RMSE': 8.741239686975373, 'Time taken': 0.5176196098327637}
{'Model': 'NuSVR', 'R-Squared': 0.2019479716983723, 'Adjusted R-Squared': 0.11327552410930253, 'RMSE': 3.3258349657661386, 'Time taken': 0.027965784072875977}
{'Model': 'OrthogonalMatchingPursuit', 'R-Squared': 0.05192073451667767, 'Adjusted R-Squared': -0.05342140609258039, 'RMSE': 3.624995037453435, 'Time taken': 0.007964134216308594}
{'Model': 'OrthogonalMatchingPursuitCV', 'R-Squared': 0.027334969473655546, 'Adjusted R-Squared': -0.08073892280704942, 'RMSE': 3.6716962267105564, 'Time taken': 0.01155543327331543}
{'Model': 'PassiveAggressiveRegressor', 'R-Squared': -0.5449049325190396, 'Adjusted R-Squared': -0.7165610361322663, 'RMSE': 4.6273885958368535, 'Time taken': 0.007571220397949219}
{'Model': 'PoissonRegressor', 'R-Squared': 0.03780437152525462, 'Adjusted R-Squared': -0.06910625386082825, 'RMSE': 3.6518823838383887

 79%|███████▊  | 33/42 [00:12<00:05,  1.54it/s]

{'Model': 'QuantileRegressor', 'R-Squared': -0.00349514064855061, 'Adjusted R-Squared': -0.11499460072061174, 'RMSE': 3.7294323092783626, 'Time taken': 9.94324803352356}
{'Model': 'RANSACRegressor', 'R-Squared': -0.8751913106089879, 'Adjusted R-Squared': -1.0835459006766532, 'RMSE': 5.098094518134029, 'Time taken': 0.10663890838623047}


 83%|████████▎ | 35/42 [00:12<00:03,  1.79it/s]

{'Model': 'RandomForestRegressor', 'R-Squared': 0.02073600619187943, 'Adjusted R-Squared': -0.0880711042312452, 'RMSE': 3.6841303284217846, 'Time taken': 0.3674442768096924}
{'Model': 'Ridge', 'R-Squared': 0.08805900088193719, 'Adjusted R-Squared': -0.01326777679784752, 'RMSE': 3.5552362270002384, 'Time taken': 0.007584571838378906}
{'Model': 'RidgeCV', 'R-Squared': 0.088059000882309, 'Adjusted R-Squared': -0.013267776797434516, 'RMSE': 3.5552362269995137, 'Time taken': 0.008014917373657227}
{'Model': 'SGDRegressor', 'R-Squared': 0.04669762491358265, 'Adjusted R-Squared': -0.05922486120713044, 'RMSE': 3.6349666408093237, 'Time taken': 0.008939027786254883}
{'Model': 'SVR', 'R-Squared': 0.20862727791736202, 'Adjusted R-Squared': 0.12069697546373559, 'RMSE': 3.3118879137157013, 'Time taken': 0.04217815399169922}
{'Model': 'TransformedTargetRegressor', 'R-Squared': 0.09206959930018976, 'Adjusted R-Squared': -0.008811556333122583, 'RMSE': 3.54740987843921, 'Time taken': 0.00853681564331054

100%|██████████| 42/42 [00:13<00:00,  3.20it/s]

{'Model': 'XGBRegressor', 'R-Squared': -0.02799637578368519, 'Adjusted R-Squared': -0.14221819531520574, 'RMSE': 3.77468646458024, 'Time taken': 0.1538999080657959}
{'Model': 'LGBMRegressor', 'R-Squared': -0.001438603085229584, 'Adjusted R-Squared': -0.11270955898358848, 'RMSE': 3.72560884721756, 'Time taken': 0.15399909019470215}
                               Adjusted R-Squared  R-Squared   RMSE  \
Model                                                                 
SVR                                          0.12       0.21   3.31   
NuSVR                                        0.11       0.20   3.33   
HuberRegressor                              -0.01       0.09   3.54   
ElasticNet                                  -0.01       0.09   3.54   
LassoLarsCV                                 -0.01       0.09   3.55   
LinearRegression                            -0.01       0.09   3.55   
TransformedTargetRegressor                  -0.01       0.09   3.55   
RidgeCV                     




In [332]:
from lazypredict.Supervised import LazyRegressor

 24%|██▍       | 10/42 [00:00<00:02, 11.80it/s]

GammaRegressor model failed to execute
Some value(s) of y are out of the valid range of the loss 'HalfGammaLoss'.


100%|██████████| 42/42 [01:42<00:00,  2.43s/it]

                               Adjusted R-Squared  R-Squared     RMSE  \
Model                                                                   
ExtraTreesRegressor                          0.02       0.06   777.81   
MLPRegressor                                -0.06      -0.01   808.98   
KNeighborsRegressor                         -0.20      -0.14   859.60   
PassiveAggressiveRegressor                  -0.31      -0.25   897.80   
SVR                                         -0.31      -0.25   897.83   
LinearSVR                                   -0.32      -0.26   902.39   
QuantileRegressor                           -0.38      -0.32   923.15   
HuberRegressor                              -0.44      -0.37   942.14   
LGBMRegressor                               -0.46      -0.40   949.89   
HistGradientBoostingRegressor               -0.48      -0.41   955.60   
NuSVR                                       -0.54      -0.47   975.54   
PoissonRegressor                            -0.56  


