In [14]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import add_dummy_feature
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle

In [15]:
# Función para comparar los coeficientes de dos modelos hecha con ayuda de Claude sonnet 3.7
def compare_coefs(W1, W2):
    if len(W1) != len(W2):
        print(f"Arrays have different lengths: {len(W1)} vs {len(W2)}")
        return False
    
    max_diff = 0
    for i in range(len(W1)):
        diff = W1[i] - W2[i]
        max_diff = max(max_diff, abs(diff))
        print(f'{W1[i]:.6f} - {W2[i]:.6f} = {diff:.10f}')
            
    # Check for equality within a small tolerance to account for floating point errors
    are_equal = np.allclose(W1, W2, rtol=1e-10, atol=1e-10)
    print(f"Los coeficientes son iguales: {are_equal}")
    return are_equal

In [16]:
source_file = '../data/processed/Airbnb_Feature_Engineered.csv'
df = pd.read_csv(source_file, low_memory=False)

# Dividir primero en train+valid y test
train_valid, test = train_test_split(df, test_size=0.2, random_state=42)

# Dividir train+valid en train y valid
train, valid = train_test_split(train_valid, test_size=0.2, random_state=42)

train.to_csv('../data/processed/Airbnb_Train.csv', index=False)
valid.to_csv('../data/processed/Airbnb_Valid.csv', index=False)
test.to_csv('../data/processed/Airbnb_Test.csv', index=False)

In [17]:
# Definir las matrices X y Y
Y_train = train['log_price']
X_train = train.drop(columns=['log_price'])

Y_valid = valid['log_price']
X_valid = valid.drop(columns=['log_price'])

# Convertir a numpy arrays
X_train = X_train.to_numpy()
Y_train = Y_train.to_numpy()

X_valid = X_valid.to_numpy()
Y_valid = Y_valid.to_numpy()

# Añadir una columna de unos para el término de intercepción
X_train = add_dummy_feature(X_train)
X_valid = add_dummy_feature(X_valid)

**a) Regresión Lineal con Ecuación Normal**
*   Implementación propia.

In [18]:
# Calcular los pesos usando la ecuación normal
W = np.linalg.inv(X_train.T @ X_train) @ X_train.T @ Y_train
W

array([ 5.06282056,  0.16828848,  0.24649662,  0.1401294 , -0.05570871,
        0.05548442,  0.01600112,  0.08523612, -0.01536713, -0.08214661,
       -0.93346524, -0.50678949])

*   Uso de sklearn (LinearRegression).

In [19]:
from sklearn.linear_model import LinearRegression

model = LinearRegression(fit_intercept=False)
model.fit(X_train, Y_train)
W_sklearn = model.coef_
W_sklearn

array([ 5.06282056,  0.16828848,  0.24649662,  0.1401294 , -0.05570871,
        0.05548442,  0.01600112,  0.08523612, -0.01536713, -0.08214661,
       -0.93346524, -0.50678949])

In [20]:
compare_coefs(W, W_sklearn)

# Guardar los pesos en un archivo .pkl
with open('../models/ecuacion_normal.pkl', 'wb') as file:
    pickle.dump(W_sklearn, file)

5.062821 - 5.062821 = -0.0000000000
0.168288 - 0.168288 = -0.0000000000
0.246497 - 0.246497 = 0.0000000000
0.140129 - 0.140129 = 0.0000000000
-0.055709 - -0.055709 = 0.0000000000
0.055484 - 0.055484 = -0.0000000000
0.016001 - 0.016001 = -0.0000000000
0.085236 - 0.085236 = 0.0000000000
-0.015367 - -0.015367 = 0.0000000000
-0.082147 - -0.082147 = 0.0000000000
-0.933465 - -0.933465 = -0.0000000000
-0.506789 - -0.506789 = -0.0000000000
Los coeficientes son iguales: True


**b) Regresión Lineal con Singular Value Decomposition (SVD)**
*   Implementación con librerías de álgebra lineal.

In [21]:
U, sigma, Vt = np.linalg.svd(X_train, full_matrices=False)
sigma_plus = np.diag(1/sigma)
W_svd = Vt.T @ sigma_plus @ U.T @ Y_train
W_svd

array([ 5.06282056,  0.16828848,  0.24649662,  0.1401294 , -0.05570871,
        0.05548442,  0.01600112,  0.08523612, -0.01536713, -0.08214661,
       -0.93346524, -0.50678949])

*   Uso de sklearn (LinearRegression).

In [22]:
model = LinearRegression(fit_intercept=False)
model.fit(X_train, Y_train)  # Automáticamente usa SVD para matrices singulares
W_sklearn_svd = model.coef_
W_sklearn_svd

array([ 5.06282056,  0.16828848,  0.24649662,  0.1401294 , -0.05570871,
        0.05548442,  0.01600112,  0.08523612, -0.01536713, -0.08214661,
       -0.93346524, -0.50678949])

In [23]:
compare_coefs(W_svd, W_sklearn_svd)
# Guardar los pesos en un archivo .pkl
with open('../models/svd.pkl', 'wb') as file:
    pickle.dump(W_sklearn_svd, file)

5.062821 - 5.062821 = 0.0000000000
0.168288 - 0.168288 = -0.0000000000
0.246497 - 0.246497 = 0.0000000000
0.140129 - 0.140129 = -0.0000000000
-0.055709 - -0.055709 = 0.0000000000
0.055484 - 0.055484 = -0.0000000000
0.016001 - 0.016001 = -0.0000000000
0.085236 - 0.085236 = 0.0000000000
-0.015367 - -0.015367 = 0.0000000000
-0.082147 - -0.082147 = 0.0000000000
-0.933465 - -0.933465 = -0.0000000000
-0.506789 - -0.506789 = 0.0000000000
Los coeficientes son iguales: True


**c) Regresión Polinomial**
*   Uso de sklearn (PolynomialFeatures + LinearRegression).

In [30]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Crear características polinomiales de grado 2
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(X_train)

model_poly = LinearRegression()
model_poly.fit(X_poly, Y_train)
W_poly = model_poly.coef_
print(W_poly)

# Guardar los pesos en un archivo .pkl
with open('../models/polynomial.pkl', 'wb') as file:
    pickle.dump(W_poly, file)

[ 1.04972724e-17  8.85282533e-02  1.28324399e-01  5.94007365e-02
 -1.24729618e-02  7.07460756e-02  7.33616443e-03  2.88625294e-02
 -7.14636552e-03 -2.63522733e-02 -2.90567872e-01 -1.69157308e-01
  1.66533454e-16  8.85282533e-02  1.28324399e-01  5.94007365e-02
 -1.24729618e-02  7.07460756e-02  7.33616443e-03  2.88625294e-02
 -7.14636552e-03 -2.63522733e-02 -2.90567872e-01 -1.69157308e-01
 -4.99060016e-03  1.76503720e-02  2.17041605e-02 -2.10435804e-02
 -4.62493720e-03 -1.38923272e-02  4.23831309e-02 -1.51876678e-02
 -6.26921410e-02  1.39483661e-01  1.04829003e-01  2.82646279e-03
 -1.95892488e-03 -1.45683913e-02 -6.82608868e-03 -6.80023891e-04
  3.77418639e-03  6.72937642e-03 -2.46405819e-02  1.54560688e-02
  1.03536702e-03  3.55763229e-04 -1.85022955e-02 -9.54812897e-03
  3.87107402e-03 -4.14255370e-03  6.52335284e-03 -1.66630656e-04
  8.62245039e-02 -8.95073146e-02  1.33825319e-02  1.64786880e-03
 -2.31452555e-03  3.89395422e-03  5.08226538e-03  2.61596459e-02
 -1.92746070e-01 -2.94611

*   Análisis del grado del polinomio y su efecto en el sobreajuste.

In [25]:
# Análisis de sobreajuste 
for g in range(1,6):
    model = make_pipeline(PolynomialFeatures(g), LinearRegression())
    model.fit(X_train, Y_train)
    train_score = model.score(X_train, Y_train)
    test_score = model.score(X_valid, Y_valid)
    print(f"Grado {g}: Train R2={train_score:.2f}, Test R2={test_score:.2f}")

Grado 1: Train R2=0.62, Test R2=0.63
Grado 2: Train R2=0.64, Test R2=0.64
Grado 3: Train R2=0.65, Test R2=0.63
Grado 4: Train R2=0.66, Test R2=0.56
Grado 5: Train R2=0.68, Test R2=-92.98


Al ver como mejora la "puntuación" en el conjunto de Train pero no en el de Test a medida que incrementamos los grados del polinomio, entonces podemos observar que se esta produciendo overfitting.


**d) Regresión Lineal con Batch Gradient Descent (BGD)**
*   Implementación propia de BGD.

In [42]:
lr = 0.1  
n_epocas = 1000
m = len(X_train)

np.random.seed(42)
W_bgd = np.random.randn(X_train.shape[1], 1)
print(f"Pesos iniciales: {W_bgd.T}")

for epoch in range(n_epocas):
    gradient = (2/m) * X_train.T @ (X_train @ W_bgd - Y_train.reshape(-1, 1))
    W_bgd = W_bgd - lr * gradient

W_bgd = W_bgd.ravel()
print(f"Pesos finales: {W_bgd}")

Pesos iniciales: [[ 0.49671415 -0.1382643   0.64768854  1.52302986 -0.23415337 -0.23413696
   1.57921282  0.76743473 -0.46947439  0.54256004 -0.46341769 -0.46572975]]
Pesos finales: [ 5.06179991  0.16919116  0.24667375  0.14009491 -0.05621795  0.05533052
  0.01601913  0.08517485 -0.01537866 -0.08175729 -0.92538415 -0.50564213]


*   Uso de sklearn con SGDRegressor (ajustado a modo batch).

In [51]:
from sklearn.linear_model import SGDRegressor

model_bgd = SGDRegressor(
    learning_rate='constant',
    eta0=0.01,
    tol=1e-5,
    penalty=None,
    shuffle=True,
)

# Entrenar el modelo en modo batch
for epoch in range(1000):  
    model_bgd.partial_fit(X_train, Y_train)

W_bgd_sklearn = model_bgd.coef_
print(W_bgd_sklearn)

[ 2.53123287  0.1386333   0.20536918  0.08245897 -0.00341401  0.17376344
  0.02812791  0.04840064 -0.03071304 -0.12786845 -0.94421795 -0.5281051 ]


In [49]:
compare_coefs(W_bgd, W_bgd_sklearn)

with open('../models/bgd.pkl', 'wb') as file:
    pickle.dump(W_bgd, file)

with open('../models/bgd_sklearn.pkl', 'wb') as file:
    pickle.dump(W_bgd_sklearn, file)

5.061800 - 2.574661 = 2.4871388161
0.169191 - -0.050088 = 0.2192788452
0.246674 - 0.201256 = 0.0454175165
0.140095 - -0.268252 = 0.4083466857
-0.056218 - 0.163318 = -0.2195357209
0.055331 - 0.100234 = -0.0449035297
0.016019 - -0.022534 = 0.0385536119
0.085175 - -0.163165 = 0.2483399169
-0.015379 - 0.007209 = -0.0225872215
-0.081757 - -0.362082 = 0.2803245188
-0.925384 - -0.802704 = -0.1226796979
-0.505642 - -0.580170 = 0.0745280892
Los coeficientes son iguales: False


**e) Regresión Lineal con Stochastic Gradient Descent (SGD)**
*   Implementación propia de SGD.

In [None]:
def programa_aprendizaje(t):
    return lr_init_sch / (t + lr_end_sch)

n_epocas = 500
lr_init_sch = 5
lr_end_sch = 50
m = len(X_train)

np.random.seed(42)
W_sgd = np.random.randn(X_train.shape[1], 1)
print(f"Pesos iniciales: {W_sgd}")

for epoca in range(n_epocas):
    for iteracion in range(m):
        indice_aleatorio = np.random.randint(m)
        xi = X_train[indice_aleatorio : indice_aleatorio + 1]
        yi = Y_train[indice_aleatorio : indice_aleatorio + 1]

        gradient = 2 * xi.T @ (xi @ W_sgd - yi)

        lr = programa_aprendizaje(epoca * m + iteracion)
        W_sgd = W_sgd - lr * gradient

W_sgd = W_sgd.ravel()
print(f"Pesos finales: {W_sgd}")

Pesos iniciales: [[ 0.49671415]
 [-0.1382643 ]
 [ 0.64768854]
 [ 1.52302986]
 [-0.23415337]
 [-0.23413696]
 [ 1.57921282]
 [ 0.76743473]
 [-0.46947439]
 [ 0.54256004]
 [-0.46341769]
 [-0.46572975]]


*   Uso de sklearn (SGDRegressor).

In [52]:
model_sgd = SGDRegressor(max_iter=1000, penalty=None, eta0=0.1)
model_sgd.fit(X_train, Y_train)
W_sgd_sklearn = model_sgd.coef_
W_sgd_sklearn

array([ 2.48679514,  0.19311269,  0.21810211,  0.10402339, -0.06250622,
        0.03673825,  0.04482219,  0.0502914 ,  0.00780711, -0.10905209,
       -0.93441951, -0.4973767 ])

In [57]:
compare_coefs(W_sgd, W_sgd_sklearn)

with open('../models/sgd.pkl', 'wb') as file:
    pickle.dump(W_sgd, file)

with open('../models/sgd_sklearn.pkl', 'wb') as file:
    pickle.dump(W_sgd_sklearn, file)

5.054705 - 2.486795 = 2.5679096212
0.174551 - 0.193113 = -0.0185621691
0.247177 - 0.218102 = 0.0290745508
0.139764 - 0.104023 = 0.0357403548
-0.058617 - -0.062506 = 0.0038891294
0.055883 - 0.036738 = 0.0191451561
0.015467 - 0.044822 = -0.0293551102
0.083975 - 0.050291 = 0.0336832034
-0.015817 - 0.007807 = -0.0236238618
-0.078652 - -0.109052 = 0.0303995891
-0.875322 - -0.934420 = 0.0590976279
-0.497798 - -0.497377 = -0.0004217089
Los coeficientes son iguales: False


**f) Lasso Regression (sólo librería)**

In [None]:
from sklearn.linear_model import Lasso

model_lasso = Lasso(alpha=0.1)
model_lasso.fit(X_train, Y_train)
W_lasso = model_lasso.coef_
print(W_lasso)

with open('../models/lasso.pkl', 'wb') as file:
    pickle.dump(W_lasso, file)

**g) Ridge Regression (sólo librería)**

In [None]:
from sklearn.linear_model import Ridge

model_ridge = Ridge(alpha=0.1)
model_ridge.fit(X_train, Y_train)
W_ridge = model_ridge.coef_
print(W_ridge)

with open('../models/ridge.pkl', 'wb') as file:
    pickle.dump(W_ridge, file)