### Exercício 5

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')


# Read the dataset and separate the regression target from the features.
file_path = '/home/pedro_loureiro/Aprendizagem/Proj3_Aprendizagem/parkinsons.csv'
data = pd.read_csv(file_path)

y = data['target']
X = data.drop(columns=['target'])

# Both MLPs share the same architecture and seed; only the activation differs.
mlp_settings = dict(hidden_layer_sizes=(10, 10), random_state=0, max_iter=200)
models = {
    'Linear Regression': LinearRegression(),
    'MLP (No Activation)': MLPRegressor(activation='identity', **mlp_settings),
    'MLP (ReLU Activation)': MLPRegressor(activation='relu', **mlp_settings),
}

# One list of test MAEs per model, filled with one value per split.
mae_results = {name: [] for name in models}

# Ten 80/20 train-test splits, seeded 1 through 10.
for seed in range(1, 11):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )

    # Refit every model on this split and record its MAE on the held-out part.
    for name, regressor in models.items():
        predictions = regressor.fit(X_train, y_train).predict(X_test)
        mae_results[name].append(mean_absolute_error(y_test, predictions))

# One column per model, one row per split.
mae_df = pd.DataFrame(mae_results)

# Boxplot of the per-split test MAE, one box per model.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=mae_df, ax=ax)
ax.set_title('Test MAE of Each Model over 10 Runs')
ax.set_ylabel('Mean Absolute Error (MAE)')
plt.show()


### Exercício 6

An MLP without activation functions (i.e., with the identity activation) can only represent linear functions, because a composition of linear layers is itself a linear map. It is therefore equivalent in expressive power to a Linear Regression (as the experimental results suggest), which limits its ability to model non-linear patterns. Activation functions are important in neural networks because they introduce non-linearity, allowing the MLP to learn more complex patterns and, consequently, perform better on non-linear tasks.

When the MLP uses a non-linear activation function such as ReLU, we expect a reduction in error (MAE) whenever the data contain non-linear structure, as the model is then better able to capture the complexity of the data.

### Exercício 7

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')

# Load parkinsons.csv; every column except 'target' is used as a feature.
data = pd.read_csv('/home/pedro_loureiro/Aprendizagem/Proj3_Aprendizagem/parkinsons.csv')
X = data.drop(columns=['target'])
y = data['target']

# Hyperparameter grid: 3 x 3 x 3 = 27 combinations.
param_grid = {
    'alpha': [0.0001, 0.001, 0.01],  # L2 penalty
    'learning_rate_init': [0.001, 0.01, 0.1],  # initial learning rate
    'batch_size': [32, 64, 128],  # minibatch size
}

# Two hidden layers of 10 units each; fixed seed so runs are repeatable.
mlp = MLPRegressor(hidden_layer_sizes=(10, 10), max_iter=200, random_state=0)

# Exhaustive search scored by 5-fold cross-validation on the full dataset.
# The scorer is the *negated* MAE because scikit-learn maximizes scores.
grid_search = GridSearchCV(mlp, param_grid, scoring='neg_mean_absolute_error', cv=5)
grid_search.fit(X, y)

# Flip the sign back so 'mean_test_score' holds a positive MAE.
results = pd.DataFrame(grid_search.cv_results_)
results['mean_test_score'] = -results['mean_test_score']

# One coordinate per hyperparameter; the MAE is encoded as the point colour.
alpha_vals = results['param_alpha'].astype(float)
learning_rate_vals = results['param_learning_rate_init'].astype(float)
batch_size_vals = results['param_batch_size'].astype(float)
mae_vals = results['mean_test_score']

# 3D scatter: one point per hyperparameter combination.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
sc = ax.scatter(alpha_vals, learning_rate_vals, batch_size_vals, c=mae_vals, cmap='viridis', marker='o')
ax.set_xlabel('L2 penalty (Alpha)')
ax.set_ylabel('Learning Rate')
ax.set_zlabel('Batch Size')
ax.set_title('MAE for each combination of hyperparameter')
fig.colorbar(sc, ax=ax, label='MAE')

plt.show()

# Report the winning combination together with its cross-validated MAE
# (best_score_ is the negated MAE, so the sign is flipped back here).
best_params = grid_search.best_params_
best_score = -grid_search.best_score_
print("Best combination of hyperparameter:", best_params)
print("Best cross-validated MAE:", best_score)


1. L2 Penalty (Alpha):

Low (0.0001): Flexible but with a higher risk of overfitting.

Moderate (0.001): A balance between flexibility and regularization.

High (0.01): Reduces overfitting but may lead to underfitting.

2. Learning Rate:

Low (0.001): More precise convergence but slower.

Moderate (0.01): A good balance between speed and stability.

High (0.1): Fast convergence but with a risk of instability.

3. Batch Size:

Small (32): More frequent updates but higher variance in updates.

Moderate (64): A good balance between updates and stability.

Large (128): Stable convergence but with fewer updates.

**Best Combination:**
    
Alpha: 0.01; Batch Size: 32; Learning Rate: 0.1

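This combination pairs the strongest regularization (which limits overfitting) with a high learning rate and small batches (fast convergence through frequent updates); among the tested combinations, this trade-off between regularization, learning speed, and update frequency gave the best result.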