4.1.1 INICIAL DATA VISUALIZATION

In [29]:
import numpy as np
import pandas as pd

df = pd.read_csv('X_train_y_train.csv')
df_np = df.to_numpy()
X_train = df_np[:, :5]
y_train = df_np[:, -1]
df

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Target (y_train)
0,243.603602,320.129329,263.887099,138.347154,187.518604,-2.737384
1,0.807288,-0.036818,627.205787,149.999110,109.295966,-0.410969
2,-89.012773,1.331870,157.805750,145.964094,104.658223,14.833533
3,-300.950338,-13.833076,-83.805788,127.188902,89.762707,3.960804
4,375.153738,158.006206,-272.035822,111.853997,161.561648,10.400393
...,...,...,...,...,...,...
195,242.813582,244.079078,279.983609,188.535738,175.163235,-2.356786
196,97.201919,128.383706,128.146943,169.641872,144.133568,-0.679252
197,399.021837,132.958737,133.830612,135.274452,158.813603,-2.202706
198,232.785364,176.828573,49.615762,148.324708,157.870006,9.637862


4.1.2 OUTLIERS REMOVAL

In [30]:
import numpy as np
from sklearn.linear_model import LinearRegression

num_outliers_to_remove= 50
cleaned_x_train = X_train
cleaned_y_train = y_train
num_outliers_removed = 0

for i in range(num_outliers_to_remove):

    lin = LinearRegression().fit(cleaned_x_train, cleaned_y_train)
    
    y_pred = lin.predict(cleaned_x_train)
    squared_errors = (y_pred - cleaned_y_train) ** 2

    idx = np.argmax(squared_errors)
    
    cleaned_x_train = np.delete(cleaned_x_train, idx, axis=0)
    cleaned_y_train = np.delete(cleaned_y_train, idx, axis=0)
    
    num_outliers_removed += 1
    
print(f"Número de outliers removidos: {num_outliers_removed}")
print(f"Dimensão dos dados limpos: {cleaned_x_train.shape}")
print(f"Dimensão dos dados limpos: {cleaned_y_train.shape}")


Número de outliers removidos: 50
Dimensão dos dados limpos: (150, 5)
Dimensão dos dados limpos: (150,)


4.1.3 DATA EVALUATION

In [31]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_validate
import pandas as pd
import numpy as np

def evaluate_model(model, cleaned_x_train, cleaned_y_train, y_pred, k, cv_flag, model_name, evaluation_df=None):

    if evaluation_df is None:
        evaluation_df = pd.DataFrame(columns=['Model', 'MSE', 'MAE', 'R²', 'SSE'])

    if not cv_flag:
        cross_validation = cross_validate(
            model, 
            cleaned_x_train, 
            cleaned_y_train, 
            cv=k, 
            scoring=('neg_mean_squared_error', 'r2', 'neg_mean_absolute_error'), 
            return_train_score=True
        )
        
        mae = abs(cross_validation['test_neg_mean_absolute_error'].mean())
        mse = abs(cross_validation['test_neg_mean_squared_error'].mean()) 
        r2 = cross_validation['train_r2'].mean()
        
        n_samples = len(cleaned_x_train) // k
        sse = mse * n_samples * k

    else:
        mse = mean_squared_error(cleaned_y_train, y_pred)
        mae = mean_absolute_error(cleaned_y_train, y_pred)
        r2 = r2_score(cleaned_y_train, y_pred)
        sse = np.sum((cleaned_y_train - y_pred) ** 2)

    new_row = pd.DataFrame({
        'Model': [model_name],
        'MSE': [mse],
        'MAE': [mae],
        'R²': [r2],
        'SSE': [sse]
    })

    if not cv_flag:
        evaluation_df = new_row
    else:
        evaluation_df = pd.concat([evaluation_df, new_row], ignore_index=True)

    return evaluation_df


4.1.4 MODELS

    4.1.4.1 LINEAR REGRESSION

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

k = 20

regr_best = LinearRegression().fit(cleaned_x_train, cleaned_y_train)

y_pred_regr_best = 0 # not used in this case

df_regr_best = evaluate_model(regr_best, cleaned_x_train, cleaned_y_train.ravel(), y_pred_regr_best, k, False, 'l-regr')

    4.1.3.2 Lasso Cross Validation

In [33]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV

k = 10 
alpha_values = np.arange(0.00001, 1, 0.0001)

lasso_cv_best = LassoCV(alphas=alpha_values, cv=k).fit(cleaned_x_train, cleaned_y_train.ravel())
y_pred_lasso_cv_best = lasso_cv_best.predict(cleaned_x_train)

df_lasso_cv_best = evaluate_model(lasso_cv_best, cleaned_x_train, cleaned_y_train.ravel(), y_pred_lasso_cv_best, k, True, "LassoCV",df_regr_best)

print('Best alpha LASSOCV: ', lasso_cv_best.alpha_)

Best alpha LASSOCV:  1e-05


    4.1.3.3 Lasso Lars Cross Validation

In [34]:
from sklearn.linear_model import LassoLarsCV

k = 10 

lasso_lars_cv_best = LassoLarsCV(cv = k).fit(cleaned_x_train, cleaned_y_train.ravel())
y_pred_lasso_lars_cv_best = lasso_lars_cv_best.predict(cleaned_x_train)

df_lasso_lars_cv_best = evaluate_model(lasso_lars_cv_best, cleaned_x_train, cleaned_y_train.ravel(), y_pred_lasso_lars_cv_best, k, True, "LassoLarsCV",df_lasso_cv_best)

print('Best alpha LASSOLARSCV: ', lasso_lars_cv_best.alpha_)

Best alpha LASSOLARSCV:  0.0


    4.1.3.4 Ridge Cross Validation

In [35]:
from sklearn.linear_model import RidgeCV

k = 10 
alpha_values = np.arange(0.00001, 1, 0.01)

ridge_cv_best = RidgeCV(alphas = alpha_values, cv = k).fit(cleaned_x_train, cleaned_y_train.ravel())
y_pred_ridge_cv_best = ridge_cv_best.predict(cleaned_x_train)

df_ridge_cv_best = evaluate_model(ridge_cv_best, cleaned_x_train, cleaned_y_train.ravel(), y_pred_ridge_cv_best, k, True, "RidgeCV",df_lasso_lars_cv_best)

print('Best alpha RIDGECV: ', ridge_cv_best.alpha_)

Best alpha RIDGECV:  0.9900100000000001


    4.1.3.5 Elastic Net Cross Validation

In [36]:
from sklearn.linear_model import ElasticNetCV

l1_ratios=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
l1_ratios=[0.3, 0.4, 0.5, 0.6, 0.7]
alpha_values = np.arange(0.0001, 1, 0.1)

elastic_net_cv_best = ElasticNetCV(l1_ratio = 0.4 , alphas = alpha_values, cv = k).fit(cleaned_x_train, cleaned_y_train.ravel())
y_pred_elastic_net_cv_best = elastic_net_cv_best.predict(cleaned_x_train)

df_elastic_net_cv_best = evaluate_model(elastic_net_cv_best, cleaned_x_train, cleaned_y_train.ravel(), y_pred_elastic_net_cv_best, k, True, "ElasticNetCV",df_ridge_cv_best)

print('Best alpha ELASTICNETCV: ', elastic_net_cv_best.alpha_)

Best alpha ELASTICNETCV:  0.0001


FINAL RESULTS

In [37]:
print(df_elastic_net_cv_best)

          Model       MSE       MAE        R²       SSE
0        l-regr  0.000262  0.012804  0.999858  0.036645
1       LassoCV  0.000245  0.012436  0.999858  0.036806
2   LassoLarsCV  0.000245  0.012417  0.999858  0.036774
3       RidgeCV  0.000245  0.012417  0.999858  0.036774
4  ElasticNetCV  0.000245  0.012436  0.999858  0.036808


FINAL SOLUTION

In [38]:
x_test = np.load('x_test.npy')

y_test = regr_best.predict(x_test)

np.save('y_test.npy', y_test)

print("Model coefficients (weights):", regr_best.coef_)
print("Model intercept (bias):", regr_best.intercept_)

y_test_loaded = np.load('y_test.npy')

Model coefficients (weights): [-4.77750857e-03 -5.20362315e-03 -1.82833635e-03 -4.13199153e-05
 -2.20369897e-03]
Model intercept (bias): 0.9971905630412329
