In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd
from sklearn.model_selection import LeaveOneOut
from sklearn import preprocessing

In [3]:
# read data
data=pd.read_csv("Model I.csv") # this was the dataset we utilized for COPD
data.head()
X = data.iloc[:,1:-1]
y = data.iloc[:,-1] 

In [4]:
X.head()

Unnamed: 0,Adults Reporting Exercise in the Past 30 Days,Adults Reporting Secondhand Smoke at Home,Adults with Asthma in the Past 12 Months,Adults with High Cholesterol,Adults without a Smoke-Free Home Policy,Air Toxics Concentrations- Average Benzene Concentrations,Air Toxics Concentrations- Average Formaldehyde Concentrations,Asthma Hospitalizations (Adults),Asthma Hospitalizations (Children 0 to 4 Yrs Old),Asthma Hospitalizations (Children 5 to 17 Yrs Old),...,Boiler Emissions- Total PM2.5 Emissions,Cigarette Smoking among Adults,Fine Particulate Matter (PM2.5),Neighborhood Race/Ethnicity - Non Hispanic Black,Neighborhood Race/Ethnicity - Non Hispanic White,Nitric Oxide (NO),Nitrogen Dioxide (NO2),Overweight or Obese Adults,predicted annual average fine particulate matter <2.5 microns,Walkability Index (Infrastructure)
0,75.333333,2.35,2.616667,30.6875,24.52,1.2,1.9,21.855556,25.866667,8.75,...,0.2,12.166667,9.109333,1,2,20.868667,18.537333,48.116667,9.147342,-2.12
1,74.933333,9.1,5.45,26.1375,29.16,1.3,2.0,395.105556,224.033333,166.216667,...,0.1,18.266667,9.778,3,1,22.672,24.560667,63.483333,9.722558,1.64
2,71.883333,4.85,1.05,34.8375,29.16,1.6,2.0,58.927778,17.591667,9.716667,...,0.2,18.216667,9.210667,1,3,18.699333,20.423333,52.4,9.208983,0.24
3,70.233333,3.775,2.866667,33.825,26.24,1.5,2.0,103.816667,31.433333,20.341667,...,0.2,15.483333,9.457333,1,3,20.108667,22.531333,55.083333,9.56024,0.17
4,75.216667,4.225,3.166667,28.5125,20.38,1.1,1.8,126.544444,87.758333,51.583333,...,0.0,12.85,9.118,3,2,18.268,17.801333,65.016667,8.75116,-0.99


In [4]:
# scale
# make each column have mean = 0 and std dev = 1
X_scaled = preprocessing.scale(X)
y_scaled = preprocessing.scale(y)

In [5]:
# Leave One Out splits
loo = LeaveOneOut()
loo.get_n_splits(X) #there were 42 data records at the UHF scale

42

In [6]:
IS_acc = []
y_predict = []
y_true = []
best_param = []
for train_index, test_index in loo.split(X_scaled):
    # test/train splits
    X_train, y_train = X_scaled[train_index], y_scaled[train_index]
    X_test, y_test = X_scaled[test_index], y_scaled[test_index]
    # parameter selection and model fitting
    param_grid ={'max_depth':range(1,16)}
    rf=RandomForestRegressor(n_estimators=100)
    gr=GridSearchCV(rf,param_grid=param_grid)
    rs=gr.fit(X_train,y_train)
    y_predict.append(rs.predict(X_test))
    y_true.append(y_test)
    IS_acc.append(rs.score(X_train, y_train))
    best_param.append(rs.best_params_)
    #OS_acc.append(1-(y_predict-y_test)**2).mean()/y_test.var()
    #OS_acc.append(rs.score(X_test, y_test))



In [7]:
best_param # optimal max depth for each of 42 iterations

[{'max_depth': 12},
 {'max_depth': 14},
 {'max_depth': 15},
 {'max_depth': 8},
 {'max_depth': 11},
 {'max_depth': 4},
 {'max_depth': 11},
 {'max_depth': 10},
 {'max_depth': 3},
 {'max_depth': 3},
 {'max_depth': 15},
 {'max_depth': 11},
 {'max_depth': 8},
 {'max_depth': 5},
 {'max_depth': 13},
 {'max_depth': 4},
 {'max_depth': 8},
 {'max_depth': 6},
 {'max_depth': 11},
 {'max_depth': 5},
 {'max_depth': 9},
 {'max_depth': 15},
 {'max_depth': 15},
 {'max_depth': 13},
 {'max_depth': 12},
 {'max_depth': 13},
 {'max_depth': 4},
 {'max_depth': 11},
 {'max_depth': 11},
 {'max_depth': 12},
 {'max_depth': 15},
 {'max_depth': 6},
 {'max_depth': 13},
 {'max_depth': 10},
 {'max_depth': 7},
 {'max_depth': 14},
 {'max_depth': 8},
 {'max_depth': 3},
 {'max_depth': 5},
 {'max_depth': 13},
 {'max_depth': 10},
 {'max_depth': 8}]

In [8]:
IS_acc #IS accuracy for each of 42 iterations

[0.9254325398501158,
 0.9164920297171758,
 0.9276666043240561,
 0.9191804819139058,
 0.9267253518150671,
 0.8942270483908303,
 0.9207647389801833,
 0.9258692493025359,
 0.8523736667175167,
 0.8475785110881771,
 0.9220246454847584,
 0.9098041543368902,
 0.9229569836989082,
 0.9083423489436807,
 0.9170452902324788,
 0.8906266444703156,
 0.9178375665545059,
 0.9119784224544294,
 0.9053975628883218,
 0.9048052190176973,
 0.912365951202494,
 0.9192784557095949,
 0.9108632859731048,
 0.9195055812790385,
 0.9179695965961339,
 0.9185876972493788,
 0.8963034371730952,
 0.9218509707175728,
 0.9257938547957495,
 0.9131685754755844,
 0.9146664060400508,
 0.915554003860725,
 0.920958691885778,
 0.9006656598412031,
 0.924313037609158,
 0.9082422232338608,
 0.9202729521390718,
 0.8389010824765044,
 0.9132641546814301,
 0.9286136942650894,
 0.9236103780359688,
 0.918092611806934]

In [9]:
# convert arrays to dataframes for ease of next step
y_predict_df = pd.DataFrame(y_predict)
y_true_df = pd.DataFrame(y_true)

In [10]:
OS_acc = 1-((y_predict_df-y_true_df)**2).mean()/y_true_df.var()
OS_acc #out of sample accuracy calculated from the 42 y predictions and true y test vals

0    0.396796
dtype: float64

In [11]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_true_df, y_predict_df)

0.6179158610978142

In [12]:
from sklearn.metrics import r2_score
r2_score(y_true_df, y_predict_df)

0.3820841389021856

 ### Linear Regression

In [15]:
from sklearn import linear_model
IS_acc_lm = []
y_predict_lm = []
y_true_lm = []
for train_index, test_index in loo.split(X_scaled):
    X_train, y_train = X_scaled[train_index], y_scaled[train_index]
    X_test, y_test = X_scaled[test_index], y_scaled[test_index]
    lm=linear_model.LinearRegression()
    lm.fit(X_train, y_train)
    y_predict_lm.append(lm.predict(X_test))
    y_true_lm.append(y_test)
    IS_acc_lm.append(lm.score(X_train, y_train))

In [16]:
IS_acc_lm #IS accuracy for each of 42 iterations

[0.8474330114593319,
 0.8285587985921385,
 0.8574734492816939,
 0.8385352468811954,
 0.8401900421894353,
 0.8397428474514042,
 0.834704834768823,
 0.8150713841544321,
 0.8556484310961433,
 0.8390491536883884,
 0.8414238942458394,
 0.8462929182601697,
 0.8486790391466801,
 0.8420616152390732,
 0.8392426304134655,
 0.834667052201039,
 0.8324568103367017,
 0.834812741046002,
 0.8267771122030305,
 0.8319800253081491,
 0.869211715561646,
 0.8373587648000392,
 0.8417394175048604,
 0.8498808281734516,
 0.8291325855781508,
 0.8504363853721054,
 0.8401503525989091,
 0.8437231695197049,
 0.8454339938272561,
 0.8463898112583027,
 0.8428875480761026,
 0.8403933215715108,
 0.8389865379913681,
 0.8385302933036846,
 0.8342690657717727,
 0.8383978388266702,
 0.8354999445763541,
 0.8400713091795617,
 0.8400597292932122,
 0.8883417615985021,
 0.8403863495008269,
 0.8511303270293427]

In [101]:
y_predict_lm_df = pd.DataFrame(y_predict_lm)
y_true_lm_df = pd.DataFrame(y_true_lm)
1-((y_predict_lm_df-y_true_lm_df)**2).mean()/y_true_lm_df.var() #out of sample accuracy calculated from the 42 y predictions and true y test vals

0    0.388084
dtype: float64