# Random Forest Regressor using scikit-learn

In [None]:
from pandas import read_csv
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error
np.random.seed(42) #Independent from run
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

In [None]:
# Read the muon CSV into a DataFrame and keep its contents as a NumPy array
# for the positional column slicing done in the following cell.
muon_dataframe = read_csv('../MuonPOGAnalysisTemplate/output/bxcut_org.csv')
array = muon_dataframe.values

In [None]:
# Split the table column-wise: every column except the last is a feature,
# and the last column is the regression target.
X = array[:, :-1]
Y = array[:, -1]

### Train/test split (70% train, 30% test):

In [None]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# test_size is the fraction assigned to the *test* set, so 0.3 (not 0.7)
# gives the intended 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

In [None]:
# Fit a default random forest on the training split and predict the held-out rows.
rfr = RandomForestRegressor()
model = rfr.fit(X_train, y_train)
predictions = rfr.predict(X_test)

# Report the root-mean-squared error on the test split.
print(np.sqrt(mean_squared_error(y_test, predictions)))

# Scatter predictions against true values; the dashed diagonal spans the
# full target range and marks where prediction == truth.
fig, ax = plt.subplots()
ax.scatter(y_test, predictions, edgecolors=(0, 0, 0))
target_range = [Y.min(), Y.max()]
ax.plot(target_range, target_range, 'k--', linewidth=4)
ax.set(xlabel="True Values", ylabel="Predictions")
plt.show()
del ax

### Using k-fold cross validation

In [None]:
# 8-fold splitter. shuffle=True is needed for random_state to have any effect;
# recent scikit-learn releases raise a ValueError when random_state is set
# while shuffle is left at its default of False.
kfold = KFold(n_splits=8, shuffle=True, random_state=7)

In [None]:
# Score a fresh random forest on each fold of `kfold`. The scorer returns the
# mean squared error negated, one value per fold.
cv_results = cross_val_score(
    RandomForestRegressor(),
    X,
    Y,
    scoring='neg_mean_squared_error',
    cv=kfold,
)

In [None]:
# cv_results holds the negated MSE of each fold. Convert every fold to an RMSE
# first and only then summarise, so that both the mean and the spread are in
# the target's units. (The square root of the standard deviation of the MSEs
# is not the standard deviation of the RMSE.)
fold_rmse = np.sqrt(-cv_results)
msg = "%s: %f (%f)" % ("RandomForestRegressor", fold_rmse.mean(), fold_rmse.std())

In [None]:
# Show the cross-validation summary string built in the previous cell.
print(msg)

In [None]:
# Cross-validated predictions for every row, using the same `kfold` splitter.
predicted = cross_val_predict(RandomForestRegressor(), X, Y, cv=kfold)

# Measured vs. predicted scatter with a dashed y = x reference line.
fig, ax = plt.subplots()
ax.scatter(Y, predicted, edgecolors=(0, 0, 0))
target_range = [Y.min(), Y.max()]
ax.plot(target_range, target_range, 'k--', linewidth=4)
ax.set(xlabel='Measured', ylabel='Predicted')
plt.show()

### Using Stratified Shuffle Split

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Single 70/30 shuffle split, stratified on the "n_Primitive" column.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

# n_splits=1 means the generator yields exactly one (train, test) pair.
train_index, test_index = next(
    split.split(muon_dataframe, muon_dataframe["n_Primitive"])
)

# split() returns *positional* row indices, so select with .iloc; .loc would
# interpret them as index labels and only works by accident on a default
# RangeIndex.
strat_train_set = muon_dataframe.iloc[train_index]
strat_test_set = muon_dataframe.iloc[test_index]

In [None]:
# Training split: the target column "genParticle.pt" is removed from the
# features and kept separately (as an independent copy) as the label.
X_train = strat_train_set.drop(columns=["genParticle.pt"])
y_train = strat_train_set.loc[:, "genParticle.pt"].copy()

In [None]:
# Test split: same feature/label separation as for the training split.
X_test = strat_test_set.drop(columns=["genParticle.pt"])
y_test = strat_test_set.loc[:, "genParticle.pt"].copy()

In [None]:
# Train a default random forest on the stratified training split, then
# predict the target for the stratified test split.
model = RandomForestRegressor()
M_model = model.fit(
    X_train,
    y_train,
)
prediction = model.predict(
    X_test,
)

In [None]:
# Root-mean-squared error of the random forest on the stratified test split.
lin_mse = mean_squared_error(y_test, prediction)
lin_rmse = np.sqrt(lin_mse)
msg = "{}: {:f}".format("RFR", lin_rmse)
print(msg)

# Measured vs. predicted scatter; the dashed diagonal spans the range of Y
# (the target array built earlier) and marks where prediction == measurement.
fig, ax = plt.subplots()
ax.scatter(y_test, prediction, edgecolors=(0, 0, 0))
target_range = [Y.min(), Y.max()]
ax.plot(target_range, target_range, 'k--', linewidth=4)
ax.set_xlabel('Measured P_t(GeV)')
ax.set_ylabel('Predicted P_t(GeV)')
ax.set_title("random forest regressor")
plt.show()
del ax