In [5]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import SelectFromModel

In [6]:
kf = KFold(n_splits = 5)

def rmse(pred, true):
    err = pred - true
    return np.sqrt(np.dot(err, err) / true.shape[0])

In [3]:
df = pd.read_csv(r'../data/transformed_data.csv')

label_encoder = LabelEncoder()
label_encoder.fit(df['StateCode'])

X = df.drop(columns=['TotalAmountofAssistance'])
y = df['TotalAmountofAssistance']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

X_train = X_train.values
y_train = y_train.values
X_test = X_test.values
y_test = y_test.values

X_train[:, 0] = label_encoder.transform(X_train[:, 0])
X_test[:, 0] = label_encoder.transform(X_test[:, 0])

In [4]:
rf = RandomForestRegressor(n_estimators=1000, max_features=1)
rf.fit(X_train, y_train)

sel = SelectFromModel(rf)

print(rmse(rf.predict(X_train), y_train))
print('*********')
print(rmse(rf.predict(X_test), y_test))

print('*********')

important_vars = sel.get_support()
print(df.columns.drop('TotalAmountofAssistance')[important_vars])

15999041.325932425
*********
24971011.138569973
*********
Index(['CO2 Emissions (Mmt)', 'TotalNumberofInvestments', 'GETCB', 'HYTCB',
       'NCPRB', 'NUETB', 'REPRB', 'SOTCB', 'TEPRB', 'TETCB', 'WDEXB', 'WDPRB',
       'WDTCB', 'WSTCB', 'WWPRB', 'WYTCB'],
      dtype='object')


16386501.980113758
***
34830858.23750078
***
Index(['TotalNumberofInvestments', 'BFFDB', 'CLPRB', 'CLPRP', 'GETCB', 'NCPRB',
       'NGMPB', 'NGMPP', 'NUETB', 'SOTCB', 'TETCB', 'WDEXB'],