# 0 - MODULES AND CONSTANTS

### MODULES, IMPORTS AND LIBRARIES

In [69]:
# General imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Mathematical imports
from math import sin

# Testing and validation
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, max_error, mean_absolute_percentage_error, mean_squared_error, r2_score, root_mean_squared_error


# Regressors
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet



### CONSTANTS

This are the functions to be generated
<br>
- $f1(x) = x · sin(x) + 2x $
<br>
- $f2(x) = 10 sin(x) + x^2$
<br>
- $f3(x) = sign(x)(x^2 + 300) + 20 sin(x)$

In [70]:
# Mathematical function and number generating function
FUNCTIONS = {   'f1':(lambda x: x*sin(x)+2*x), 
                'f2':(lambda x: 10*sin(x)+x**2), 
                'f3':(lambda x: (1 if x >= 0 else -1)*(x**2+300) + 20*sin(x))}

OTHER_FUNCTIONS = { 'sin(x)':(lambda x: sin(x)), 
                    'x*sin(x)' : (lambda x: x*sin(x)), 
                    'x**2':(lambda x : x**2), 
                    'sign(x)*x**2':(lambda x: x**2 if x>=0 else -x**2)}

REGRESSORS = [  LinearRegression(),RandomForestRegressor(), MLPRegressor(), 
                DecisionTreeRegressor(), KNeighborsRegressor(), Ridge(), Lasso(), ElasticNet()]


METRICS = [explained_variance_score, max_error, mean_absolute_percentage_error, mean_squared_error, r2_score, root_mean_squared_error]

ADD_NOISE = True

# DATA GENERATION

generates the data for each function and returns a dataframe

In [71]:
def generateData(Xmin:int, Xmax:int, totN:int, functionsDict:dict=FUNCTIONS)->pd.DataFrame:
    x = np.linspace(Xmin, Xmax, totN) 
    if ADD_NOISE:
        x = x + np.random.normal(0, 50, size=x.size)

    
    return pd.concat([  pd.Series(x, name='x'),
                        pd.DataFrame([[functionsDict[function](point) for function in functionsDict] for point in x], columns=functionsDict.keys())], 
                    axis=1)

adds the behavior of some non linear functions to x df for a better approximation of f(x)

In [72]:
def addOtherNonLinearFunction(df:pd.DataFrame, xMin:int, xMax:int, totN:int, functions:dict=OTHER_FUNCTIONS)->pd.DataFrame:
    return pd.concat([ df, pd.DataFrame([[functions[function](point) for function in functions] for point in np.linspace(xMin, xMax, totN)], columns=functions.keys())], 
                    axis=1)

# PLOT FUNCTIONS

Can either plot a single function or all functions at once <br>
Draw and inspect the shape of the function. Which regression model of those you know could achieve
better performance?

In [73]:
def plotFunction(x:pd.Series, y:pd.DataFrame|pd.Series, names:list[str]=None, title:str=None)->None:
    fig, ax = plt.subplots(1, 1, figsize=(8,8))
    ax.set_title(title or 'functions')
    ax.plot(x, y)
    ax.legend(labels=names or y.columns)
    plt.show()

# REGRESSORS

Frame now a regression task to your generated data. Start from the ordinary least squares Linear
Regression. <br>

Fit each model to the training data and predict the function value for each test point

In [74]:
def testLinearRegressor(df:pd.Series, xMin:int, xMax:int, totN:int, functionTested:str, 
                func, nonLinearFunc:dict=OTHER_FUNCTIONS, 
                regressor:LinearRegression=LinearRegression(n_jobs=-1, fit_intercept=True))->None:
    xTest = np.linspace(xMin, xMax, totN)
    
    yPred = (regressor.fit(df[['x']+list(nonLinearFunc.keys())], np.reshape(df[functionTested], (-1, 1)))
            .predict(addOtherNonLinearFunction(pd.Series(xTest, name='x'), xMin, xMax, totN, OTHER_FUNCTIONS)))
    
    plotFunction(xTest, pd.concat([pd.Series(yPred[:, 0]), pd.Series(list(map(lambda x: func(x), xTest)))],axis=1), 
                 names=['yTrue', 'yPred'], title='Predicted vs True')

Then, choose additional models which you believe could outperform linear regression to approximate the function.

<br> Info: There are many regression models in scikit-learn, other than the ones that you should already be familiar with (e.g. LinearRegression, Ridge, SVR). Part of the models that you adopted
for classification have their regression counterparts, such as MLPRegressor and RandomForestRegressor.
<br>


In [75]:
def computeOtherRegressor(df:pd.DataFrame, xMin:int, xMax:int, totN:int, functionTested:str, 
                nonLinearFunc:dict=OTHER_FUNCTIONS, regressor:DecisionTreeRegressor=DecisionTreeRegressor()) -> None:
    
    xTest = np.linspace(xMin, xMax, totN)

    return  regressor.fit(
        df[['x'] + list(nonLinearFunc.keys())],
        np.reshape(df[functionTested], (-1, 1)).ravel(),
    ).predict(
        addOtherNonLinearFunction(
            pd.Series(xTest, name='x'), xMin, xMax, totN, OTHER_FUNCTIONS
        )
    )        

# METRIC COMPUTATION   

computes a series of known metrics

In [76]:
def computeMetrics(yTrue:pd.Series, yPred:pd.Series, metrics:list, index:str)->pd.DataFrame:
    return pd.DataFrame([metric(yTrue, yPred) for metric in metrics], columns=[index[:-2]], 
                        index=list(map(lambda x: str(x).split()[1], metrics))).T

Tests the regressor given

In [None]:
def testOtherRegressor(df:pd.DataFrame, xMin:int, xMax:int, totN:int, functionTested:str='f1', func=FUNCTIONS['f1'],
                nonLinearFunc:dict=OTHER_FUNCTIONS, regressors:list[DecisionTreeRegressor]=[DecisionTreeRegressor()]) -> pd.DataFrame:
    
    data = pd.DataFrame()
    
    for regr in regressors:
        if len(data):
            data = pd.concat([computeMetrics(pd.Series(list(map(lambda y:func(y), np.linspace(xMin, xMax, totN)))),
                    computeOtherRegressor(df, xMin, xMax, totN, functionTested, nonLinearFunc, regr), METRICS, str(regr)), data], axis=0)
        else:
            data = computeMetrics(pd.Series(list(map(lambda y:func(y), np.linspace(xMin, xMax, totN)))),
                    computeOtherRegressor(df, xMin, xMax, totN, functionTested, nonLinearFunc, regr), METRICS, str(regr))
    
    return data

# MAIN FUNCTION

### Main function of the program
it does:
<ol>
<li>The generation of the data</li>
<li>Plots the functions</li>
<li>Trains and builds the regression model, after adding other non linear functions for a better approximation</li>
</ol>

In [None]:
def main()->None:
    df = generateData(-20, 20, 100, FUNCTIONS) # 1  
    
    # plotFunction(df['x'], df[df.columns.difference(['x'])]) # 2    
    
    df = addOtherNonLinearFunction(df, -20, 20, 100, OTHER_FUNCTIONS)
    
    # testLinearRegressor(df, 20, 40, 100, 'f1', FUNCTIONS['f1'], OTHER_FUNCTIONS) # 3
    
    display(testOtherRegressor(df, 20, 40, 100, 'f1', FUNCTIONS['f1'], OTHER_FUNCTIONS, REGRESSORS))
        
    
main()



Unnamed: 0,explained_variance_score,max_error,mean_absolute_percentage_error,mean_squared_error,r2_score,root_mean_squared_error
ElasticNet,0.881689,24.052292,0.177755,105.755015,0.84151,10.283726
Lasso,0.880991,24.110673,0.178146,106.15519,0.840911,10.303164
Ridge,0.856223,25.693865,0.193776,121.620573,0.817733,11.028172
KNeighborsRegressor,0.0,48.761395,0.54224,776.169602,-0.163206,27.859821
DecisionTreeRegressor,-2.209149,151.9701,0.416924,2284.02899,-2.422958,47.791516
MLPRegressor,0.93484,23.021806,0.123691,123.952913,0.814238,11.133414
RandomForestRegressor,0.543335,47.082442,0.356969,425.673195,0.362065,20.631849
LinearRegression,0.85574,25.721647,0.194064,121.918224,0.817287,11.041659
