# MODULES AND CONSTANTS

### MODULES, LIBRARIES AND OTHER IMPORTS

In [85]:
# General imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Dataset
from sklearn.datasets import make_regression


# Regressors
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# Learning
from sklearn.model_selection import train_test_split

# Testing and validation
from sklearn.metrics import explained_variance_score, max_error, mean_absolute_percentage_error, mean_squared_error, r2_score, root_mean_squared_error

### CONSTANTS

In [86]:
REGRESSORS = [  LinearRegression(n_jobs=-1),RandomForestRegressor(n_jobs=-1), MLPRegressor(), 
                DecisionTreeRegressor(), KNeighborsRegressor(n_jobs=-1), Ridge(), Lasso(), ElasticNet()]


METRICS = [explained_variance_score, max_error, mean_absolute_percentage_error, mean_squared_error, r2_score, root_mean_squared_error]

# 1 - DATA CREATION

Generate a random synthetic dataset for a regression problem using the scikit-learn’s make_regression
function. Take your time to understand the construct parameters and their default values. Start with
at least 2000 samples and a fixed random state. <br>
X, y = make_regression(n_samples=2000, random_state=42)<br>
<b>Info:</b> a fixed random state helps you to reproduce the outcome. You have to use it whenever you
want your results, obtained from any random initialization, to be the same for different runs.
This functionality is available for every scikit-learn object that makes use of a random state.


In [None]:
def createDataset(nSamples:int, randomState:int, devStd:float=20):
    x, y = make_regression(n_samples=nSamples, random_state=randomState)
    return (x, y) if devStd<0 else (x + np.array([np.random.normal(0, devStd, x.shape[1]) for _ in range(nSamples)]), y)

# 2 - REGRESSION PIPELINE

Test the regression pipeline that you developed in Exercise 2.1. Pay enough attention to the differences with the previous exercise:
- How does your model handle the presence of multiple features?
- Is there any correlation among features? How does this impact the model performance?

In [88]:
def testMetrics(yPred:np, yTrue:np, metrics:list, model:str)->pd.DataFrame:
    return pd.DataFrame([metric(yTrue, yPred) for metric in metrics], columns=[model.split('(')[0]],
                        index=list(map(lambda x: str(x).split()[1], metrics))).T


def testPipeline(xTrain:np, xTest:np, yTrain:np, yTest:np, models:list[DecisionTreeRegressor]=REGRESSORS, 
                    metrics:list=METRICS)->pd.DataFrame:    
    df = pd.DataFrame()
    
    for model in models:
        if df.empty:
            df = testMetrics(model.fit(xTrain, yTrain).predict(xTest), yTest, metrics, str(model))
        else:
            df = pd.concat([ testMetrics(model.fit(xTrain, yTrain).predict(xTest), yTest, metrics, str(model)), df])
    
    return df

# 3 - NOISE ADDITION

Using the function constructor, make the problem harder for your regression model. Try to regenerate
the dataset adding some noise using the noise parameter and to increase or reduce the gap between
n_features and n_informative features.<br>
- How does the model behave in this case?
- Train a Linear Regressor and inspect the coefficients learned for the non-informative features.<br>
What do these values mean?

# MAIN FUNCTION

this is the main function of the program, it does:
<ol>
<li>Creates the initial dataset and partitions it</li>
<li>Tests the regression pipeline created</li>
<li>Adds noise to the data</li>
</ol>

In [92]:
def main()->None:
    x, y = createDataset(2000, 42) # 1
    
    xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.20) # 1    
    
    display(testPipeline(xTrain, xTest, yTrain, yTest)) # 2
    
    
    x, y = createDataset(2000, 42, devStd=2) # 3
    
    xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.20) # 3  
    
    display(testPipeline(xTrain, xTest, yTrain, yTest)) # 3

    
main()



Unnamed: 0,explained_variance_score,max_error,mean_absolute_percentage_error,mean_squared_error,r2_score,root_mean_squared_error
ElasticNet,0.879079,201.04,0.381349,4387.698,0.878997,66.2397
Lasso,0.999751,8.409493,0.02984306,9.027403,0.999751,3.004564
Ridge,1.0,0.3813445,0.0008312749,0.01705384,1.0,0.1305903
KNeighborsRegressor,0.253744,458.1725,1.465626,27066.39,0.25357,164.5187
DecisionTreeRegressor,0.291522,558.9876,1.962217,25786.07,0.288878,160.5804
MLPRegressor,0.987201,103.9017,0.1756924,467.3376,0.987112,21.61799
RandomForestRegressor,0.762263,310.5622,0.9962503,8620.62,0.762263,92.84729
LinearRegression,1.0,1.818989e-12,7.091241e-15,2.144279e-25,1.0,4.630636e-13




Unnamed: 0,explained_variance_score,max_error,mean_absolute_percentage_error,mean_squared_error,r2_score,root_mean_squared_error
ElasticNet,0.871261,238.5859,0.4707097,5512.999,0.871255,74.24957
Lasso,0.999719,10.47129,0.03460475,12.02966,0.999719,3.468381
Ridge,0.999999,0.4710403,0.001153265,0.02203966,0.999999,0.1484576
KNeighborsRegressor,0.215181,553.8134,2.021804,33640.81,0.214387,183.4143
DecisionTreeRegressor,0.408984,578.3239,2.586189,25330.46,0.408458,159.1554
MLPRegressor,0.981569,149.8926,0.2432414,805.6809,0.981185,28.38452
RandomForestRegressor,0.753146,410.4862,1.109461,10598.35,0.752497,102.9483
LinearRegression,1.0,1.648459e-12,8.031053e-15,2.370869e-25,1.0,4.869157e-13
