In [31]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import add_dummy_feature
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle

In [32]:
# Función para comparar los coeficientes de dos modelos hecha con ayuda de Claude sonnet 3.7
def compare_coefs(W1, W2):
    if len(W1) != len(W2):
        print(f"Arrays have different lengths: {len(W1)} vs {len(W2)}")
        return False
    
    max_diff = 0
    for i in range(len(W1)):
        diff = W1[i] - W2[i]
        max_diff = max(max_diff, abs(diff))
        print(f'{W1[i]:.6f} - {W2[i]:.6f} = {diff:.10f}')
            
    # Check for equality within a small tolerance to account for floating point errors
    are_equal = np.allclose(W1, W2, rtol=1e-10, atol=1e-10)
    print(f"Los coeficientes son iguales: {are_equal}")
    return are_equal

In [33]:
source_file = '../data/processed/Airbnb_Feature_Engineered.csv'
df = pd.read_csv(source_file, low_memory=False)

# Dividir primero en train+valid y test
train_valid, test = train_test_split(df, test_size=0.2, random_state=42)

# Dividir train+valid en train y valid
train, valid = train_test_split(train_valid, test_size=0.2, random_state=42)

train.to_csv('../data/processed/Airbnb_Train.csv', index=False)
valid.to_csv('../data/processed/Airbnb_Valid.csv', index=False)
test.to_csv('../data/processed/Airbnb_Test.csv', index=False)

In [34]:
# Definir las matrices X y Y
Y_train = train['log_price']
X_train = train.drop(columns=['log_price'])

Y_valid = valid['log_price']
X_valid = valid.drop(columns=['log_price'])

# Convertir a numpy arrays
X_train = X_train.to_numpy()
Y_train = Y_train.to_numpy()

X_valid = X_valid.to_numpy()
Y_valid = Y_valid.to_numpy()

# Añadir una columna de unos para el término de intercepción
X_train = add_dummy_feature(X_train)
X_valid = add_dummy_feature(X_valid)

**a) Regresión Lineal con Ecuación Normal**
*   Implementación propia.

In [35]:
# Calcular los pesos usando la ecuación normal
W = np.linalg.inv(X_train.T @ X_train) @ X_train.T @ Y_train
W

array([ 5.06464045e+00,  1.68135361e-01,  2.49744206e-01,  1.40129705e-01,
       -5.58146368e-02,  5.54764751e-02, -1.06966420e-02,  1.63695738e-02,
        8.51162873e-02, -1.53606121e-02, -8.15967584e-02,  3.90669119e-03,
       -9.33333932e-01, -5.07140448e-01])

*   Uso de sklearn (LinearRegression).

In [36]:
from sklearn.linear_model import LinearRegression

model = LinearRegression(fit_intercept=False)
model.fit(X_train, Y_train)
W_sklearn = model.coef_
W_sklearn

array([ 5.06464045e+00,  1.68135361e-01,  2.49744206e-01,  1.40129705e-01,
       -5.58146368e-02,  5.54764751e-02, -1.06966420e-02,  1.63695738e-02,
        8.51162873e-02, -1.53606121e-02, -8.15967584e-02,  3.90669119e-03,
       -9.33333932e-01, -5.07140448e-01])

In [37]:
compare_coefs(W, W_sklearn)

# Guardar los pesos en un archivo .pkl
with open('../models/ecuacion_normal.pkl', 'wb') as file:
    pickle.dump(W_sklearn, file)

5.064640 - 5.064640 = -0.0000000000
0.168135 - 0.168135 = -0.0000000000
0.249744 - 0.249744 = 0.0000000000
0.140130 - 0.140130 = 0.0000000000
-0.055815 - -0.055815 = 0.0000000000
0.055476 - 0.055476 = -0.0000000000
-0.010697 - -0.010697 = -0.0000000000
0.016370 - 0.016370 = 0.0000000000
0.085116 - 0.085116 = 0.0000000000
-0.015361 - -0.015361 = 0.0000000000
-0.081597 - -0.081597 = -0.0000000000
0.003907 - 0.003907 = 0.0000000000
-0.933334 - -0.933334 = -0.0000000000
-0.507140 - -0.507140 = -0.0000000000
Los coeficientes son iguales: True


**b) Regresión Lineal con Singular Value Decomposition (SVD)**
*   Implementación con librerías de álgebra lineal.

In [38]:
U, sigma, Vt = np.linalg.svd(X_train, full_matrices=False)
sigma_plus = np.diag(1/sigma)
W_svd = Vt.T @ sigma_plus @ U.T @ Y_train
W_svd

array([ 5.06464045e+00,  1.68135361e-01,  2.49744206e-01,  1.40129705e-01,
       -5.58146368e-02,  5.54764751e-02, -1.06966420e-02,  1.63695738e-02,
        8.51162873e-02, -1.53606121e-02, -8.15967584e-02,  3.90669119e-03,
       -9.33333932e-01, -5.07140448e-01])

*   Uso de sklearn (LinearRegression).

In [39]:
model = LinearRegression(fit_intercept=False)
model.fit(X_train, Y_train)  # Automáticamente usa SVD para matrices singulares
W_sklearn_svd = model.coef_
W_sklearn_svd

array([ 5.06464045e+00,  1.68135361e-01,  2.49744206e-01,  1.40129705e-01,
       -5.58146368e-02,  5.54764751e-02, -1.06966420e-02,  1.63695738e-02,
        8.51162873e-02, -1.53606121e-02, -8.15967584e-02,  3.90669119e-03,
       -9.33333932e-01, -5.07140448e-01])

In [40]:
compare_coefs(W_svd, W_sklearn_svd)
# Guardar los pesos en un archivo .pkl
with open('../models/svd.pkl', 'wb') as file:
    pickle.dump(W_sklearn_svd, file)

5.064640 - 5.064640 = -0.0000000000
0.168135 - 0.168135 = -0.0000000000
0.249744 - 0.249744 = -0.0000000000
0.140130 - 0.140130 = 0.0000000000
-0.055815 - -0.055815 = 0.0000000000
0.055476 - 0.055476 = 0.0000000000
-0.010697 - -0.010697 = -0.0000000000
0.016370 - 0.016370 = -0.0000000000
0.085116 - 0.085116 = -0.0000000000
-0.015361 - -0.015361 = -0.0000000000
-0.081597 - -0.081597 = -0.0000000000
0.003907 - 0.003907 = -0.0000000000
-0.933334 - -0.933334 = -0.0000000000
-0.507140 - -0.507140 = -0.0000000000
Los coeficientes son iguales: True


**c) Regresión Polinomial**
*   Uso de sklearn (PolynomialFeatures + LinearRegression).

In [41]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Crear características polinomiales de grado 2
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(X_train)

model_poly = LinearRegression()
model_poly.fit(X_poly, Y_train)
W_poly = model_poly.coef_
print(W_poly)

# Guardar los pesos en un archivo .pkl
with open('../models/polynomial.pkl', 'wb') as file:
    pickle.dump(W_poly, file)

[ 1.09964866e-17  9.02659752e-02  1.41999789e-01  5.89747630e-02
 -1.47610911e-02  7.00889237e-02 -2.47446386e-02  9.89461229e-03
  2.80948351e-02 -9.75302819e-03 -3.14652475e-02 -3.04317719e-05
 -2.90553658e-01 -1.70719611e-01  2.01227923e-16  9.02659752e-02
  1.41999789e-01  5.89747630e-02 -1.47610911e-02  7.00889237e-02
 -2.47446386e-02  9.89461229e-03  2.80948351e-02 -9.75302819e-03
 -3.14652475e-02 -3.04317719e-05 -2.90553658e-01 -1.70719611e-01
 -5.98274425e-03  2.43915689e-02  2.22243513e-02 -2.04326688e-02
 -4.49865220e-03 -2.60696809e-02 -1.28150201e-02  4.16947242e-02
 -1.51083819e-02 -6.01167907e-02  7.50032194e-03  1.34661415e-01
  1.05724628e-01 -1.28086987e-03 -5.07253305e-04 -2.11043735e-02
 -6.09600576e-03 -5.22202585e-02 -7.93517746e-04 -1.79657010e-03
 -1.25470890e-03 -3.88317782e-02 -5.81199236e-03  4.29738421e-02
 -1.38665120e-03 -3.74515482e-04 -1.87554418e-02 -8.44389866e-03
 -1.29314335e-02  3.91436058e-03 -2.19534556e-03  7.79497001e-03
  2.91633580e-03 -3.62774

*   Análisis del grado del polinomio y su efecto en el sobreajuste.

In [42]:
# Análisis de sobreajuste 
for g in range(1,5):
    model = make_pipeline(PolynomialFeatures(g), LinearRegression())
    model.fit(X_train, Y_train)
    train_score = model.score(X_train, Y_train)
    test_score = model.score(X_valid, Y_valid)
    print(f"Grado {g}: Train R2={train_score:.2f}, Test R2={test_score:.2f}")

Grado 1: Train R2=0.62, Test R2=0.63
Grado 2: Train R2=0.64, Test R2=0.64
Grado 3: Train R2=0.65, Test R2=0.64
Grado 4: Train R2=0.67, Test R2=-0.02


Al ver como mejora la "puntuación" en el conjunto de Train pero no en el de Test a medida que incrementamos los grados del polinomio, entonces podemos observar que se esta produciendo overfitting.


**d) Regresión Lineal con Batch Gradient Descent (BGD)**
*   Implementación propia de BGD.

In [43]:
lr = 0.1  
n_epocas = 1000
m = len(X_train)

np.random.seed(42)
W_bgd = np.random.randn(X_train.shape[1], 1)
print(f"Pesos iniciales: {W_bgd.T}")

for epoch in range(n_epocas):
    gradient = (2/m) * X_train.T @ (X_train @ W_bgd - Y_train.reshape(-1, 1))
    W_bgd = W_bgd - lr * gradient

W_bgd = W_bgd.ravel()
print(f"Pesos finales: {W_bgd}")

Pesos iniciales: [[ 0.49671415 -0.1382643   0.64768854  1.52302986 -0.23415337 -0.23413696
   1.57921282  0.76743473 -0.46947439  0.54256004 -0.46341769 -0.46572975
   0.24196227 -1.91328024]]
Pesos finales: [ 5.06315354e+00  1.69441255e-01  2.49991851e-01  1.40080227e-01
 -5.65509334e-02  5.52538856e-02 -1.06187768e-02  1.63925557e-02
  8.50039438e-02 -1.53790244e-02 -8.10354139e-02  3.92355863e-03
 -9.21660335e-01 -5.05482998e-01]


*   Uso de sklearn con SGDRegressor (ajustado a modo batch).

In [44]:
from sklearn.linear_model import SGDRegressor

model_bgd = SGDRegressor(
    learning_rate='constant',
    eta0=0.01,
    tol=1e-5,
    penalty=None,
    shuffle=True,
)

# Entrenar el modelo en modo batch
for epoch in range(1000):  
    model_bgd.partial_fit(X_train, Y_train)

W_bgd_sklearn = model_bgd.coef_
print(W_bgd_sklearn)

[ 2.56129868  0.17858196  0.25906001  0.13790172 -0.1012191   0.04230188
 -0.03462789  0.04387672  0.06038697  0.00459016 -0.13425867  0.02461154
 -0.90674369 -0.55497279]


In [45]:
compare_coefs(W_bgd, W_bgd_sklearn)

with open('../models/bgd.pkl', 'wb') as file:
    pickle.dump(W_bgd, file)

with open('../models/bgd_sklearn.pkl', 'wb') as file:
    pickle.dump(W_bgd_sklearn, file)

5.063154 - 2.561299 = 2.5018548602
0.169441 - 0.178582 = -0.0091407026
0.249992 - 0.259060 = -0.0090681594
0.140080 - 0.137902 = 0.0021785031
-0.056551 - -0.101219 = 0.0446681646
0.055254 - 0.042302 = 0.0129520041
-0.010619 - -0.034628 = 0.0240091108
0.016393 - 0.043877 = -0.0274841646
0.085004 - 0.060387 = 0.0246169763
-0.015379 - 0.004590 = -0.0199691850
-0.081035 - -0.134259 = 0.0532232592
0.003924 - 0.024612 = -0.0206879786
-0.921660 - -0.906744 = -0.0149166441
-0.505483 - -0.554973 = 0.0494897882
Los coeficientes son iguales: False


**e) Regresión Lineal con Stochastic Gradient Descent (SGD)**
*   Implementación propia de SGD.

In [46]:
def programa_aprendizaje(t):
    return lr_init_sch / (t + lr_end_sch)

n_epocas = 50
lr_init_sch = 5
lr_end_sch = 50
m = len(X_train)

np.random.seed(42)
W_sgd = np.random.randn(X_train.shape[1], 1)
print(f"Pesos iniciales: {W_sgd}")

for epoca in range(n_epocas):
    for iteracion in range(m):
        indice_aleatorio = np.random.randint(m)
        xi = X_train[indice_aleatorio : indice_aleatorio + 1]
        yi = Y_train[indice_aleatorio : indice_aleatorio + 1]

        gradient = 2 * xi.T @ (xi @ W_sgd - yi)

        lr = programa_aprendizaje(epoca * m + iteracion)
        W_sgd = W_sgd - lr * gradient

W_sgd = W_sgd.ravel()
print(f"Pesos finales: {W_sgd}")

Pesos iniciales: [[ 0.49671415]
 [-0.1382643 ]
 [ 0.64768854]
 [ 1.52302986]
 [-0.23415337]
 [-0.23413696]
 [ 1.57921282]
 [ 0.76743473]
 [-0.46947439]
 [ 0.54256004]
 [-0.46341769]
 [-0.46572975]
 [ 0.24196227]
 [-1.91328024]]


Pesos finales: [ 5.05189347e+00  1.76986975e-01  2.52865430e-01  1.37993566e-01
 -6.02262243e-02  5.38593050e-02 -8.79886716e-03  1.60545472e-02
  8.77003979e-02 -1.54315776e-02 -7.60350310e-02  4.43667207e-03
 -8.47890252e-01 -4.94618026e-01]


*   Uso de sklearn (SGDRegressor).

In [90]:
model_sgd = SGDRegressor(max_iter=1000, penalty=None, eta0=0.1)
model_sgd.fit(X_train, Y_train)
W_sgd_sklearn = model_sgd.coef_
W_sgd_sklearn

array([ 2.52151866,  0.18284067,  0.26753727,  0.14040043, -0.02192094,
        0.02719179,  0.00521287, -0.02753908,  0.08511565, -0.00754343,
       -0.06074031,  0.02656101, -0.91418956, -0.54241738])

In [48]:
compare_coefs(W_sgd, W_sgd_sklearn)

with open('../models/sgd.pkl', 'wb') as file:
    pickle.dump(W_sgd, file)

with open('../models/sgd_sklearn.pkl', 'wb') as file:
    pickle.dump(W_sgd_sklearn, file)

5.051893 - 2.517431 = 2.5344627891
0.176987 - 0.127562 = 0.0494250726
0.252865 - 0.244902 = 0.0079638997
0.137994 - 0.105546 = 0.0324478913
-0.060226 - -0.029602 = -0.0306245274
0.053859 - 0.042002 = 0.0118574463
-0.008799 - -0.026102 = 0.0173028874
0.016055 - 0.002849 = 0.0132056616
0.087700 - 0.091160 = -0.0034595145
-0.015432 - 0.016589 = -0.0320204937
-0.076035 - -0.077346 = 0.0013111471
0.004437 - 0.015446 = -0.0110097895
-0.847890 - -0.943854 = 0.0959634026
-0.494618 - -0.479647 = -0.0149707665
Los coeficientes son iguales: False


**f) Lasso Regression (sólo librería)**

In [49]:
from sklearn.linear_model import Lasso

model_lasso = Lasso(alpha=0.1, fit_intercept=False)
model_lasso.fit(X_train, Y_train)
W_lasso = model_lasso.coef_
print(W_lasso)

with open('../models/lasso.pkl', 'wb') as file:
    pickle.dump(W_lasso, file)

[ 4.67073082  0.25476064  0.21903217  0.03341096  0.          0.
  0.          0.          0.          0.          0.         -0.
 -0.         -0.        ]


**g) Ridge Regression (sólo librería)**

In [50]:
from sklearn.linear_model import Ridge

model_ridge = Ridge(alpha=0.1, fit_intercept=False)
model_ridge.fit(X_train, Y_train)
W_ridge = model_ridge.coef_
print(W_ridge)

with open('../models/ridge.pkl', 'wb') as file:
    pickle.dump(W_ridge, file)

[ 5.06456261e+00  1.68151625e-01  2.49745148e-01  1.40126618e-01
 -5.58178320e-02  5.54737819e-02 -1.06801314e-02  1.63660653e-02
  8.51223699e-02 -1.53658589e-02 -8.15427690e-02  3.90645583e-03
 -9.33210890e-01 -5.07092222e-01]
