# Regression Algorithms for Muon Data

We will first check some linear regression algorithms (using the negative mean square error scorer).

### Linear Regression

The ML algorithms used for linear regression are: linear regression, Lasso, Ridge and ElasticNet (the cross-validation cell below also evaluates a random forest regressor).

In [None]:
from pandas import read_csv
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error
# Seed NumPy's global random number generator with a fixed value so that the
# random draws do not change from one run to the next
np.random.seed(42)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

import matplotlib.pyplot as plt

import os

In [None]:
from sklearn.model_selection import train_test_split

Load the csv into a pandas dataframe

In [None]:
# Relative path of the CSV file read by this cell
filename = '../MuonPOGAnalysisTemplate/output/bxcut_org.csv'
# Parse the file into a DataFrame, then keep its values as a plain NumPy
# array for the positional column slicing done in the following cells
dataframe = pd.read_csv(filename)
array = dataframe.values
# Last expression of the cell: the notebook renders the DataFrame
dataframe

The first option is to divide the data into two different datasets (train and test).

In [None]:
# Every column except the last one is a predictor; the last column is the target
X = array[:, :-1]
Y = array[:, -1]
# Hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [None]:
# Fit an ordinary linear regression on the training split
lm = LinearRegression()
lm.fit(X_train, y_train)
model = lm  # fit() returns the estimator itself
predictions = model.predict(X_test)

## The line / model
# Scatter of predicted vs. true values; the dashed diagonal spanning the
# range of Y marks where prediction equals truth
diag_range = [Y.min(), Y.max()]
fig, ax = plt.subplots()
ax.scatter(y_test, predictions)
ax.plot(diag_range, diag_range, 'k--', lw=4)
ax.set_xlabel("True Values")
ax.set_ylabel("Predictions")
plt.show()
del ax

The other option is to use K-fold cross-validation.

In [None]:
# RandomForestRegressor is used in this cell, so import it here instead of
# relying on a later cell having been executed first
from sklearn.ensemble import RandomForestRegressor

#load dataset
filename = '../MuonPOGAnalysisTemplate/output/bxcut_org.csv'
dataframe = read_csv(filename)
array = dataframe.values
# Every column except the last one is a predictor; the last column is the target
X = array[:, :-1]
Y = array[:, -1]

#prepare models: (short label, estimator) pairs
models = [
    ('LR', LinearRegression()),
    ('LAR', Lasso()),
    ('RIR', Ridge()),
    ('EN', ElasticNet()),
    ('RMR', RandomForestRegressor()),
]

#evaluate each model in turn
results = []
names = []
scoring = 'neg_mean_squared_error'
# One splitter shared by all models so they are scored on identical folds.
# random_state only has an effect when shuffle=True; passing it without
# shuffling is rejected by current scikit-learn versions.
kfold = KFold(n_splits=15, shuffle=True, random_state=7)
for name, model in models:
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # The scorer returns negated MSE values, hence the sign flip before the root
    msg = "%s: %f (%f)" % (name, np.sqrt((-1)*cv_results.mean()), np.sqrt(cv_results.std()))
    print(msg)
    # Out-of-fold predictions for every sample, plotted against the true values
    predicted = cross_val_predict(model, X, Y, cv=kfold)
    fig, ax = plt.subplots()
    ax.scatter(Y, predicted, edgecolors=(0, 0, 0))
    ax.plot([Y.min(), Y.max()], [Y.min(), Y.max()], 'k--', lw=4)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    plt.show()
    del ax

# boxplot algorithm comparison (one box of per-fold scores per model)
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()


### Non linear Regression

The non-linear regression algorithms are: KNeighborsRegressor, DecisionTreeRegressor, SVR and RandomForestRegressor.

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [None]:
#load dataset
filename = '../MuonPOGAnalysisTemplate/output/bxcut_org.csv'
dataframe = read_csv(filename)
array = dataframe.values
# Every column except the last one is a predictor; the last column is the target
X = array[:, :-1]
Y = array[:, -1]

#prepare models: (short label, estimator) pairs
models = [
    ('KNR', KNeighborsRegressor()),
    ('DTR', DecisionTreeRegressor()),
    ('SVR', SVR()),
    ('RFR', RandomForestRegressor()),
]

#evaluate each model in turn
results = []
names = []
# The scorer is named 'neg_mean_squared_error' (it returns negated MSE, which
# is why the mean is multiplied by -1 below); 'mean_squared_error' is not a
# valid scoring name
scoring = 'neg_mean_squared_error'
# One splitter shared by all models so they are scored on identical folds.
# random_state only has an effect when shuffle=True; passing it without
# shuffling is rejected by current scikit-learn versions.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
for name, model in models:
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    rmse = np.sqrt((-1)*cv_results.mean())
    msg = "%s: %f (%f)" % (name, rmse, np.sqrt(cv_results.std()))
    print(msg)
    # Out-of-fold predictions for every sample, plotted against the true values
    predicted = cross_val_predict(model, X, Y, cv=kfold)
    fig, ax = plt.subplots()
    ax.scatter(Y, predicted, edgecolors=(0, 0, 0))
    ax.plot([Y.min(), Y.max()], [Y.min(), Y.max()], 'k--', lw=4)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    plt.show()
    del ax

# boxplot algorithm comparison (one box of per-fold scores per model)
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()



As we can see, there is no clear prediction of the target label.

## Try with stratified shuffle and split cross-validation

Another technique that we can use to split train/test dataset is the so called *Stratified Shuffle split* cross-validation.

The shuffle-split iterator will generate a user defined number of independent train / test dataset splits. Samples are first shuffled and then split into a pair of train and test sets.
The word **stratified** means that the samples are first grouped into categories, and every train/test split preserves the proportion of each category.

* The first idea is to merge the 5-primitive and the 1-primitive muons into the neighbouring groups (4 and 2 respectively).

In [None]:
# Build the helper column used for stratification from the primitive count.
# Series.where keeps the values where the condition is True and substitutes
# the given value elsewhere:
#   - rows with n_Primitive >= 5 are labelled 4
#   - rows with n_Primitive <= 1 are labelled 2
# The result is assigned to the frame in a single step: calling
# where(..., inplace=True) on dataframe[col] acts on an intermediate object
# and is not guaranteed to update the DataFrame itself.
n_primitive = dataframe["n_Primitive"]
category = np.ceil(n_primitive)
category = category.where(n_primitive < 5, 4.0)
category = category.where(n_primitive > 1, 2.0)
dataframe["n_Primitive_category"] = category

The next step is to perform the stratified shuffle split

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Display the category column passed to the stratified split below
dataframe["n_Primitive_category"]

In [None]:
# n_splits=8 generates eight stratified 70/30 splits, but every iteration
# overwrites the previous result, so only the last split is kept.
split = StratifiedShuffleSplit(n_splits=8, test_size=0.3, random_state=42)
for train_index, test_index in split.split(dataframe, dataframe["n_Primitive_category"]):
    # split() yields positional row indices, so rows are selected by
    # position (.iloc) rather than by index label (.loc)
    strat_train_set = dataframe.iloc[train_index]
    strat_test_set = dataframe.iloc[test_index]

It is necessary to remove the temporary column, which is not useful for training.

In [None]:
# Remove the stratification helper column, in place, from both subsets
for subset in (strat_train_set, strat_test_set):
    subset.drop(["n_Primitive_category"], axis=1, inplace=True)

Here we can see how the new dataset is formed (both for training and test)

In [None]:
# Preview the first rows of the stratified training set
strat_train_set.head()

In [None]:
# Preview the first rows of the stratified test set
strat_test_set.head()

Now we divide (both training and test) into predictors and labels (supervised ML)

In [None]:
# Labels: an independent copy of the "genParticle.pt" column
y_train = strat_train_set["genParticle.pt"].copy()
# Predictors: every remaining column of the training set
X_train = strat_train_set.drop(["genParticle.pt"], axis=1)

In [None]:
# Labels: an independent copy of the "genParticle.pt" column
y_test = strat_test_set["genParticle.pt"].copy()
# Predictors: every remaining column of the test set
X_test = strat_test_set.drop(["genParticle.pt"], axis=1)

And we apply the ML regression algorithms in sequence on a loop:

In [None]:

# (short label, estimator) pairs, evaluated in this order
models = [
    ('LR', LinearRegression()),
    ('LAR', Lasso()),
    ('RIR', Ridge()),
    ('EN', ElasticNet()),
    ('DTR', DecisionTreeRegressor(random_state=42)),
    ('KNR', KNeighborsRegressor()),
    ('SVR', SVR()),
    ('RFR', RandomForestRegressor()),
]

#evaluate each model in turn
results = []
names = []
scoring = 'neg_mean_squared_error'
for name, model in models:

    # Train on the stratified training set and predict the held-out test set
    M_model = model.fit(X_train, y_train)
    prediction = model.predict(X_test)

    # Root mean squared error of the test-set predictions
    lin_mse = mean_squared_error(y_test, prediction)
    lin_rmse = np.sqrt(lin_mse)
    msg = "{}: {:f}".format(name, lin_rmse)
    print(msg)

    # Predicted vs. measured scatter; the dashed diagonal spans the range of Y
    diagonal = [Y.min(), Y.max()]
    fig, ax = plt.subplots()
    ax.scatter(y_test, prediction, edgecolors=(0, 0, 0))
    ax.plot(diagonal, diagonal, 'k--', lw=4)
    ax.set_xlabel('Measured P_t(GeV)')
    ax.set_ylabel('Predicted P_t(GeV)')
    ax.set_title(name)
    plt.show()
    del ax

    # Keep a side-by-side table of true and predicted values for the
    # random forest only
    if name == 'RFR':
        d = {'col1': y_test, 'col2': prediction}
        compare = pd.DataFrame(data=d)

In [None]:
compare