In [70]:
import pandas as pd
import numpy as np
from collections import Counter
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
from tqdm import tqdm

In [71]:
test_df = pd.read_json("data/train.json")

level_map = {'low': 1, 'medium': 2, 'high': 3}
test_df['interest_level'] = test_df['interest_level'].map(level_map)
X = test_df
y = test_df['price']

In [72]:
def generate_feature_list():
    result_feature = []
    feature_list = ['golden toilet', 'platinum toilet', 'moscow city picture', 'bill clinton was here', 
                    'silver wolf hacked', 'directed by robert b. weide', 'cybersport enjoyer',
                    'school 21 penaltist'
                    ]
    feature_num = np.random.randint(1, len(feature_list))
    for _ in range(0, feature_num):
        result_feature.append(feature_list[np.random.randint(1, len(feature_list))])
    return result_feature

In [73]:
X['Features'] = test_df.apply(lambda _: generate_feature_list(), axis=1)
X['Features'] = X['Features'].replace(',', '')

In [74]:
X.columns

Index(['bathrooms', 'bedrooms', 'building_id', 'created', 'description',
       'display_address', 'features', 'latitude', 'listing_id', 'longitude',
       'manager_id', 'photos', 'price', 'street_address', 'interest_level',
       'Features'],
      dtype='object')

In [75]:
features = []
for i, r in X.iterrows():
    for j in r['features']:
        features.append(j)

feature_set = set(features)
print(f'unique values - {len(feature_set)}.\nFeatures  -  {feature_set}')

unique values - 1556.
Features  -  {'Recessed Lighting.', 'Central Heat', 'Available 06/04/16     Firepalce', 'The Most Sought after Location', '24/7 Doorman Concierge', 'Green Building', '2 year lease', 'Mins From Train', '** OVERSIZED 3BR HOME * SPARKLING CLEAN * TONS OF NATURAL LIGHT * 2 BLKS TO THE WATERFRONT & BEDFORD L STOP **', 'Swimming Pool', 'near NQRMG Trains', 'Herringbone Wood Floors', 'Library', 'Stainless Steel appliances', 'Package service', 'Air Conditioning: Unknown Type', 'Reduced Fee', 'Laundry in Building', '** SPRAWLING TRUE 3BR SUPER SHARE * FIT FOR KINGS! * UNIQUE RENOVATIONS * STEPS TO THE WATERFRONT * 1 BLK TO BEDFORD L STOP **', 'specific dog breeds up to 40lbs', '** CHIC CHELSEA FIND! * MASSIVE 4BR HOME * 2 FULL BATHS * CHEF INSPIRED KITCHEN * ELEV THAT OPENS INTO APT * CATS OK **', 'One Month Free', 'private roof deck', 'Penthouse', 'Madison Square Park', 'Flat screen TV and sound system', 'outdoor space', 'basketball court', 'Voice Intercom', 'Large Closet

In [76]:
top_20 = Counter(features).most_common(20)
top_20

[('Elevator', 25915),
 ('Cats Allowed', 23540),
 ('Hardwood Floors', 23527),
 ('Dogs Allowed', 22035),
 ('Doorman', 20898),
 ('Dishwasher', 20426),
 ('No Fee', 18062),
 ('Laundry in Building', 16344),
 ('Fitness Center', 13252),
 ('Pre-War', 9148),
 ('Laundry in Unit', 8738),
 ('Roof Deck', 6542),
 ('Outdoor Space', 5268),
 ('Dining Room', 5136),
 ('High Speed Internet', 4299),
 ('Balcony', 2992),
 ('Swimming Pool', 2730),
 ('Laundry In Building', 2593),
 ('New Construction', 2559),
 ('Terrace', 2283)]

In [77]:
top_20_feature_names = []
for f, c in top_20:
    top_20_feature_names.append(f)
    X[f] = 0

In [137]:
X.columns

Index(['bathrooms', 'bedrooms', 'building_id', 'created', 'description',
       'display_address', 'features', 'latitude', 'listing_id', 'longitude',
       'manager_id', 'photos', 'price', 'street_address', 'interest_level',
       'Features', 'Elevator', 'Cats Allowed', 'Hardwood Floors',
       'Dogs Allowed', 'Doorman', 'Dishwasher', 'No Fee',
       'Laundry in Building', 'Fitness Center', 'Pre-War', 'Laundry in Unit',
       'Roof Deck', 'Outdoor Space', 'Dining Room', 'High Speed Internet',
       'Balcony', 'Swimming Pool', 'Laundry In Building', 'New Construction',
       'Terrace'],
      dtype='object')

In [79]:
for index, row in X.iterrows():
    for feat in row['features']:
        if feat in top_20_feature_names:
            X.at[index, feat] = 1

In [80]:
X_feat = test_df[['bathrooms', 'bedrooms', 'interest_level'] + list(top_20_feature_names)]
X_feat

Unnamed: 0,bathrooms,bedrooms,interest_level,Elevator,Cats Allowed,Hardwood Floors,Dogs Allowed,Doorman,Dishwasher,No Fee,...,Laundry in Unit,Roof Deck,Outdoor Space,Dining Room,High Speed Internet,Balcony,Swimming Pool,Laundry In Building,New Construction,Terrace
4,1.0,1,2,0,1,1,1,0,1,0,...,0,0,0,1,0,0,0,0,0,0
6,1.0,2,1,1,0,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
9,1.0,2,2,1,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,0
10,1.5,3,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,1.0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124000,1.0,3,1,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
124002,1.0,2,2,1,1,0,1,1,0,1,...,0,0,0,0,0,0,0,1,0,0
124004,1.0,1,2,1,1,1,1,0,1,1,...,1,0,0,1,0,0,0,0,0,0
124008,1.0,2,2,0,0,0,0,0,1,1,...,1,0,1,0,0,0,0,0,0,0


In [81]:
X_train, X_test, y_train, y_test = train_test_split(X_feat, y, test_size=0.21)

In [82]:
class LinearRegressionSDG:
    def __init__(self, learning_rate=0.01, epochs=1000):
        self.coefficients = None
        self.learning_rate = learning_rate
        self.epochs = epochs

    def fit(self, X, y):
        """Trains the model using the normal equation method."""
        X = np.insert(X, 0, 1, axis=1)  # Add bias term (column of ones)
        self.coefficients = np.linalg.inv(X.T @ X) @ X.T @ y  # Compute weights

    def fit_sgd(self, X, y):
        """Trains the model using stochastic gradient descent."""
        X = np.insert(X, 0, 1, axis=1)  # Add bias term
        m, n = X.shape
        self.coefficients = np.zeros(n)  # Initialize weights
        y = y.to_numpy()
        for epoch in tqdm(range(self.epochs)):
            for i in range(m):
                xi = X[i, :].reshape(1, -1)
                yi = y[i]
                gradient = (xi @ self.coefficients - yi) * xi
                self.coefficients -= self.learning_rate * gradient.flatten()

    def predict(self, X):
        """Predicts target values based on trained model."""
        X = np.insert(X, 0, 1, axis=1)  # Add bias term
        return X @ self.coefficients  # Compute predictions
    
    def R2(self, y_test, y_pred):
        y_mean = np.mean(y_test)
        ss_res = np.sum((y_test - y_pred) ** 2)
        ss_tot = np.sum((y_test - y_mean) ** 2)
        r2 = 1 - (ss_res / ss_tot)
        self.r2 = r2
        return r2

#### LR vs SDG LR RMSE showcase

In [83]:
lr = LinearRegressionSDG()
lr.fit(X_train, y_train)
lrSDG = LinearRegressionSDG(epochs=500)
lrSDG.fit_sgd(X_train, y_train)
y_pred_lr = lr.predict(X_test)
y_pred_lrSDG = lrSDG.predict(X_test)
rmse_lr = root_mean_squared_error(y_test, y_pred_lr)
rmse_lrsdg = root_mean_squared_error(y_test, y_pred_lrSDG)
print(f"{rmse_lr} for LR and {rmse_lrsdg} for LR SDG")

100%|██████████| 500/500 [01:37<00:00,  5.15it/s]


2096.721381237224 for LR and 24949.055885684575 for LR SDG


In [84]:
lrSDG = LinearRegressionSDG()
lrSDG.fit(X_train, y_train)
y_pred = lrSDG.predict(X_test)
rmse_lrsdg = root_mean_squared_error(y_test, y_pred)
mae_lrsdg = mean_squared_error(y_test, y_pred)
r2_lrsdg = lrSDG.R2(y_test, y_pred)
print(r2_lrsdg, mae_lrsdg, rmse_lrsdg)

0.35499792559166676 4396240.550537332 2096.721381237224


In [85]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
rmse_lr = root_mean_squared_error(y_test, y_pred)
mae_lr = mean_squared_error(y_test, y_pred)
r2_lr = r2_score(y_test, y_pred)
print(r2_lr, mae_lr, rmse_lr)

0.3549979255916763 4396240.550537266 2096.7213812372083


In [86]:
X.shape

(49352, 36)

In [87]:
np.linalg.inv(X_feat.T @ X_feat) @ X_feat.T @ y

0     2312.567356
1      584.131631
2     -363.206221
3      581.311497
4      -62.989817
5     -172.035666
6      503.165757
7     1507.854492
8     -143.896245
9     -289.533302
10    -598.614240
11    -492.785046
12      25.771912
13     515.963174
14    -105.474428
15    -175.064682
16     161.330184
17    -336.135122
18    -206.974552
19      71.344221
20    -670.921630
21      44.644812
22     413.451935
dtype: float64

In [88]:
metrics_df = pd.DataFrame(columns=["Model", "MAE", "RMSE", "R2"]).astype(
    {"Model": str, "MAE": float, "RMSE": float, "R2": float}
)

In [89]:
new_rows = pd.DataFrame([
    {"Model": "LR scikit", "MAE": mae_lr, "RMSE": rmse_lr, "R2": r2_lr},
    {"Model": "LR implementation", "MAE": mae_lrsdg, "RMSE": rmse_lrsdg, "R2": r2_lrsdg}
])

metrics_df = pd.concat([metrics_df, new_rows], ignore_index=True)

In [90]:
metrics_df

Unnamed: 0,Model,MAE,RMSE,R2
0,LR scikit,4396241.0,2096.721381,0.354998
1,LR implementation,4396241.0,2096.721381,0.354998


In [91]:
class RidgeRegression:
    def __init__(self, alpha=1.0):  # Alpha is the regularization strength (λ)
        self.alpha = alpha
        self.theta = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        
        # Add bias (intercept) term: X -> [1, X]
        X_bias = np.c_[np.ones((n_samples, 1)), X]  
        
        # Compute Ridge solution: (X^T X + λI)^(-1) X^T y
        I = np.eye(X_bias.shape[1])  # Identity matrix
        I[0, 0] = 0  # Do not regularize bias term
        self.theta = np.linalg.inv(X_bias.T @ X_bias + self.alpha * I) @ X_bias.T @ y

    def predict(self, X):
        n_samples = X.shape[0]
        X_bias = np.c_[np.ones((n_samples, 1)), X]  # Add bias term
        return X_bias @ self.theta

    def get_params(self):
        return self.theta

In [92]:
ridge_model = RidgeRegression()
ridge_model.fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)
mae = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
ridge_model_scikit = Ridge()
ridge_model_scikit.fit(X_train, y_train)
y_pred_sk = ridge_model_scikit.predict(X_test)
mae_sk = mean_squared_error(y_test, y_pred_sk)
rmse_sk = root_mean_squared_error(y_test, y_pred_sk)
r2_sk = r2_score(y_test, y_pred_sk)
print(root_mean_squared_error(y_test, y_pred), root_mean_squared_error(y_test, y_pred_sk))

2096.681606218457 2096.681606218459


In [93]:
new_rows = pd.DataFrame([
    {"Model": "Ridge scikit", "MAE": mae_sk, "RMSE": rmse_sk, "R2": r2_sk},
    {"Model": "Ridge implementation", "MAE": mae, "RMSE": rmse, "R2": r2}
])

metrics_df = pd.concat([metrics_df, new_rows], ignore_index=True)

In [94]:
class LassoRegression:
    def __init__(self, alpha=1.0, learning_rate=0.01, n_iters=1000):
        self.alpha = alpha  # Regularization strength (λ)
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.theta = None

    def soft_threshold(self, rho, alpha):
        """Soft thresholding function for L1 penalty."""
        if rho < -alpha:
            return rho + alpha
        elif rho > alpha:
            return rho - alpha
        else:
            return 0

    def fit(self, X, y):
        n_samples, n_features = X.shape
        
        # Add bias (intercept) term: X -> [1, X]
        X_bias = np.c_[np.ones((n_samples, 1)), X]  
        self.theta = np.zeros(X_bias.shape[1])  # Initialize weights
        
        for _ in range(self.n_iters):
            for j in range(len(self.theta)):  # Coordinate Descent
                X_j = X_bias[:, j]
                residual = y - X_bias @ self.theta + self.theta[j] * X_j
                rho_j = X_j.T @ residual  # Correlation
                if j == 0:  # Do not regularize bias term
                    self.theta[j] = rho_j / np.sum(X_j ** 2)
                else:
                    self.theta[j] = self.soft_threshold(rho_j, self.alpha) / np.sum(X_j ** 2)

    def predict(self, X):
        n_samples = X.shape[0]
        X_bias = np.c_[np.ones((n_samples, 1)), X]  # Add bias term
        return X_bias @ self.theta

    def get_params(self):
        return self.theta

In [95]:
lasso = LassoRegression()
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
mae = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
lasso_sk = Lasso()
lasso_sk.fit(X_train, y_train)
y_pred_sk = lasso_sk.predict(X_test)
mae_sk = mean_squared_error(y_test, y_pred_sk)
rmse_sk = root_mean_squared_error(y_test, y_pred_sk)
r2_sk = r2_score(y_test, y_pred_sk)
print(root_mean_squared_error(y_test, y_pred), root_mean_squared_error(y_test, y_pred_sk))

2096.721335743905 2095.05163440091


In [96]:
new_rows = pd.DataFrame([
    {"Model": "Lasso scikit", "MAE": mae_sk, "RMSE": rmse_sk, "R2": r2_sk},
    {"Model": "Lasso implementation", "MAE": mae, "RMSE": rmse, "R2": r2}
])

metrics_df = pd.concat([metrics_df, new_rows], ignore_index=True)

In [97]:
class ElasticNetModel:
    def __init__(self, alpha=1.0, l1_ratio=0.5, learning_rate=0.01, n_iters=1000):
        self.alpha = alpha  # Overall regularization strength (λ)
        self.l1_ratio = l1_ratio  # Ratio of L1 to L2 penalty
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.theta = None

    def soft_threshold(self, rho, alpha, l1_ratio):
        """Soft thresholding function for L1 penalty."""
        return np.sign(rho) * max(0, abs(rho) - l1_ratio * alpha)

    def fit(self, X, y):
        """Train ElasticNet model using Coordinate Descent."""
        n_samples, n_features = X.shape
        
        # Add bias (intercept) term: X -> [1, X]
        X_bias = np.c_[np.ones((n_samples, 1)), X]  
        self.theta = np.zeros(X_bias.shape[1])  # Initialize weights
        
        for _ in range(self.n_iters):
            for j in range(len(self.theta)):  # Coordinate Descent
                X_j = X_bias[:, j]
                residual = y - X_bias @ self.theta + self.theta[j] * X_j
                rho_j = X_j.T @ residual  # Correlation
                
                if j == 0:  # Do not regularize bias term
                    self.theta[j] = rho_j / np.sum(X_j ** 2)
                else:
                    # Combine L1 and L2 regularization
                    l1_penalty = self.soft_threshold(rho_j, self.alpha, self.l1_ratio)
                    l2_penalty = (1 - self.l1_ratio) * self.alpha * self.theta[j]
                    self.theta[j] = (l1_penalty + l2_penalty) / np.sum(X_j ** 2)

    def predict(self, X):
        """Predict using trained ElasticNet model."""
        n_samples = X.shape[0]
        X_bias = np.c_[np.ones((n_samples, 1)), X]  # Add bias term
        return X_bias @ self.theta

    def get_params(self):
        """Return model coefficients (weights)."""
        return self.theta

In [98]:
elnet = ElasticNetModel()
elnet.fit(X_train, y_train)
y_pred = elnet.predict(X_test)
mae = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
elnet_sk = ElasticNet()
elnet_sk.fit(X_train, y_train)
y_pred_sk = elnet_sk.predict(X_test)
mae_sk = mean_squared_error(y_test, y_pred_sk)
rmse_sk = root_mean_squared_error(y_test, y_pred_sk)
r2_sk = r2_score(y_test, y_pred_sk)
print(root_mean_squared_error(y_test, y_pred), root_mean_squared_error(y_test, y_pred_sk))

2096.741266307018 2164.9150336468524


In [99]:
new_rows = pd.DataFrame([
    {"Model": "ElasticNet scikit", "MAE": mae_sk, "RMSE": rmse_sk, "R2": r2_sk},
    {"Model": "ElasticNet implementation", "MAE": mae, "RMSE": rmse, "R2": r2}
])

metrics_df = pd.concat([metrics_df, new_rows], ignore_index=True)

In [100]:
metrics_df

Unnamed: 0,Model,MAE,RMSE,R2
0,LR scikit,4396241.0,2096.721381,0.354998
1,LR implementation,4396241.0,2096.721381,0.354998
2,Ridge scikit,4396074.0,2096.681606,0.355022
3,Ridge implementation,4396074.0,2096.681606,0.355022
4,Lasso scikit,4389241.0,2095.051634,0.356025
5,Lasso implementation,4396240.0,2096.721336,0.354998
6,ElasticNet scikit,4686857.0,2164.915034,0.31236
7,ElasticNet implementation,4396324.0,2096.741266,0.354986


##### Math formula for MinMaxScaler

x scaled = (x - xmin) / (xmax - xmin)

if we scale to a custom range [a, b]

x scaled = a + ((x - xmin)(b-a)) / (xmax - xmin)

In [101]:
def min_max_scaler(X, feature_range=(0,1)):
    X = np.array(X, dtype=np.float64)
    min_val, max_val = feature_range

    X_min = X.min(axis=0)
    X_max = X.max(axis=0)

    X_scaled = (X - X_min) / (X_max - X_min) * (max_val - min_val) + min_val

    return X_scaled

In [102]:
scaler_sk = MinMaxScaler()
scaled_sk = scaler_sk.fit_transform(X_feat)
scaled = min_max_scaler(X_feat)
scaled == scaled_sk

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

##### Math formula for StandartScaler

x scaled = (x - μ) / σ

μ - mean

σ - std

In [103]:
def standard_scaler(X):
    X = np.array(X, dtype=np.float64)
    mean = X.mean(axis=0)
    std = X.std(axis=0)

    X_scaled = (X - mean) / std
    return X_scaled

#### example of custom std and scaler

In [104]:
scaled = standard_scaler(X_feat)
scaler = StandardScaler()
scaled_sk = scaler.fit_transform(X_feat)
scaled_sk == scaled

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [105]:
st_scaler = StandardScaler()
st_scaled_sk_train = scaler.fit_transform(X_train)
st_scaled_sk_test = scaler.transform(X_test)

In [106]:
minmax_scaler = MinMaxScaler()
minmax_sk_train = minmax_scaler.fit_transform(X_train)
minmax_sk_test = minmax_scaler.transform(X_test)

In [107]:
st_lr = LinearRegression()
st_lr.fit(st_scaled_sk_train, y_train)
y_pred = st_lr.predict(st_scaled_sk_test)
mae_st = mean_squared_error(y_test, y_pred)
rmse_st = root_mean_squared_error(y_test, y_pred)
r2_st = r2_score(y_test, y_pred)

In [108]:
minmax_lr = LinearRegression()
minmax_lr.fit(minmax_sk_train, y_train)
y_pred = minmax_lr.predict(minmax_sk_test)
mae = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [109]:
new_rows = pd.DataFrame([
    {"Model": "LR MinMax", "MAE": mae, "RMSE": rmse, "R2": r2},
    {"Model": "LR Std", "MAE": mae_st, "RMSE": rmse_st, "R2": r2_st}
])

metrics_df = pd.concat([metrics_df, new_rows], ignore_index=True)

In [110]:
st_lr = Ridge()
st_lr.fit(st_scaled_sk_train, y_train)
y_pred = st_lr.predict(st_scaled_sk_test)
mae_st = mean_squared_error(y_test, y_pred)
rmse_st = root_mean_squared_error(y_test, y_pred)
r2_st = r2_score(y_test, y_pred)

In [111]:
minmax_lr = Ridge()
minmax_lr.fit(minmax_sk_train, y_train)
y_pred = minmax_lr.predict(minmax_sk_test)
mae = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [112]:
new_rows = pd.DataFrame([
    {"Model": "Ridge MinMax", "MAE": mae, "RMSE": rmse, "R2": r2},
    {"Model": "Ridge Std", "MAE": mae_st, "RMSE": rmse_st, "R2": r2_st}
])

metrics_df = pd.concat([metrics_df, new_rows], ignore_index=True)

In [113]:
metrics_df

Unnamed: 0,Model,MAE,RMSE,R2
0,LR scikit,4396241.0,2096.721381,0.354998
1,LR implementation,4396241.0,2096.721381,0.354998
2,Ridge scikit,4396074.0,2096.681606,0.355022
3,Ridge implementation,4396074.0,2096.681606,0.355022
4,Lasso scikit,4389241.0,2095.051634,0.356025
5,Lasso implementation,4396240.0,2096.721336,0.354998
6,ElasticNet scikit,4686857.0,2164.915034,0.31236
7,ElasticNet implementation,4396324.0,2096.741266,0.354986
8,LR MinMax,4396241.0,2096.721381,0.354998
9,LR Std,4396241.0,2096.721381,0.354998


In [114]:
st_lr = Lasso()
st_lr.fit(st_scaled_sk_train, y_train)
y_pred = st_lr.predict(st_scaled_sk_test)
mae_st = mean_squared_error(y_test, y_pred)
rmse_st = root_mean_squared_error(y_test, y_pred)
r2_st = r2_score(y_test, y_pred)

In [115]:
minmax_lr = Lasso()
minmax_lr.fit(minmax_sk_train, y_train)
y_pred = minmax_lr.predict(minmax_sk_test)
mae = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [116]:
new_rows = pd.DataFrame([
    {"Model": "Lasso MinMax", "MAE": mae, "RMSE": rmse, "R2": r2},
    {"Model": "Lasso Std", "MAE": mae_st, "RMSE": rmse_st, "R2": r2_st}
])

metrics_df = pd.concat([metrics_df, new_rows], ignore_index=True)

In [117]:
st_lr = ElasticNet()
st_lr.fit(st_scaled_sk_train, y_train)
y_pred = st_lr.predict(st_scaled_sk_test)
mae_st = mean_squared_error(y_test, y_pred)
rmse_st = root_mean_squared_error(y_test, y_pred)
r2_st = r2_score(y_test, y_pred)

In [118]:
minmax_lr = ElasticNet()
minmax_lr.fit(minmax_sk_train, y_train)
y_pred = minmax_lr.predict(minmax_sk_test)
mae = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [119]:
new_rows = pd.DataFrame([
    {"Model": "ElasticNet MinMax", "MAE": mae, "RMSE": rmse, "R2": r2},
    {"Model": "ElasticNet Std", "MAE": mae_st, "RMSE": rmse_st, "R2": r2_st}
])

metrics_df = pd.concat([metrics_df, new_rows], ignore_index=True)

In [120]:
metrics_df

Unnamed: 0,Model,MAE,RMSE,R2
0,LR scikit,4396241.0,2096.721381,0.354998
1,LR implementation,4396241.0,2096.721381,0.354998
2,Ridge scikit,4396074.0,2096.681606,0.355022
3,Ridge implementation,4396074.0,2096.681606,0.355022
4,Lasso scikit,4389241.0,2095.051634,0.356025
5,Lasso implementation,4396240.0,2096.721336,0.354998
6,ElasticNet scikit,4686857.0,2164.915034,0.31236
7,ElasticNet implementation,4396324.0,2096.741266,0.354986
8,LR MinMax,4396241.0,2096.721381,0.354998
9,LR Std,4396241.0,2096.721381,0.354998


In [121]:
X_basic = X[['bathrooms', 'bedrooms', 'interest_level']]
X_train, X_test, y_train, y_test = train_test_split(X_basic, y, test_size=0.21)
poly = PolynomialFeatures(degree=10, include_bias=False)
X_poly = poly.fit_transform(X_train)
X_poly_test = poly.transform(X_test)

In [122]:
lr = LinearRegression()
lr.fit(X_poly, y_train)
y_pred = lr.predict(X_poly_test)
mae = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

new_rows = pd.DataFrame([
    {"Model": "LR Polynomial", "MAE": mae, "RMSE": rmse, "R2": r2}
])

metrics_df = pd.concat([metrics_df, new_rows], ignore_index=True)

In [123]:
lr = Ridge()
lr.fit(X_poly, y_train)
y_pred = lr.predict(X_poly_test)
mae = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

new_rows = pd.DataFrame([
    {"Model": "Ridge Polynomial", "MAE": mae, "RMSE": rmse, "R2": r2}
])

metrics_df = pd.concat([metrics_df, new_rows], ignore_index=True)

In [124]:
lr = Lasso()
lr.fit(X_poly, y_train)
y_pred = lr.predict(X_poly_test)
mae = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

new_rows = pd.DataFrame([
    {"Model": "Lasso Polynomial", "MAE": mae, "RMSE": rmse, "R2": r2}
])

metrics_df = pd.concat([metrics_df, new_rows], ignore_index=True)

  model = cd_fast.enet_coordinate_descent(


In [125]:
lr = ElasticNet()
lr.fit(X_poly, y_train)
y_pred = lr.predict(X_poly_test)
mae = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

new_rows = pd.DataFrame([
    {"Model": "ElasticNet Polynomial", "MAE": mae, "RMSE": rmse, "R2": r2}
])

metrics_df = pd.concat([metrics_df, new_rows], ignore_index=True)

  model = cd_fast.enet_coordinate_descent(


In [126]:
test_mean = y_test.mean()
test_median = y_test.median()

In [127]:
mae = mean_squared_error(y_test, [test_mean] * len(y_test))
rmse = root_mean_squared_error(y_test, [test_mean] * len(y_test))
r2 = r2_score(y_test, [test_mean] * len(y_test))

new_rows = pd.DataFrame([
    {"Model": "Native mean", "MAE": mae, "RMSE": rmse, "R2": r2}
])

metrics_df = pd.concat([metrics_df, new_rows], ignore_index=True)

In [128]:
mae = mean_squared_error(y_test, [test_median] * len(y_test))
rmse = root_mean_squared_error(y_test, [test_median] * len(y_test))
r2 = r2_score(y_test, [test_median] * len(y_test))

new_rows = pd.DataFrame([
    {"Model": "Native median", "MAE": mae, "RMSE": rmse, "R2": r2}
])

metrics_df = pd.concat([metrics_df, new_rows], ignore_index=True)

In [129]:
metrics_df

Unnamed: 0,Model,MAE,RMSE,R2
0,LR scikit,4396241.0,2096.721,0.3549979
1,LR implementation,4396241.0,2096.721,0.3549979
2,Ridge scikit,4396074.0,2096.682,0.3550224
3,Ridge implementation,4396074.0,2096.682,0.3550224
4,Lasso scikit,4389241.0,2095.052,0.3560248
5,Lasso implementation,4396240.0,2096.721,0.354998
6,ElasticNet scikit,4686857.0,2164.915,0.3123596
7,ElasticNet implementation,4396324.0,2096.741,0.3549857
8,LR MinMax,4396241.0,2096.721,0.3549979
9,LR Std,4396241.0,2096.721,0.3549979


In [130]:
metrics_df.sort_values("MAE", ascending=True).head(5)

Unnamed: 0,Model,MAE,RMSE,R2
15,ElasticNet Std,4274007.0,2067.367213,0.372932
4,Lasso scikit,4389241.0,2095.051634,0.356025
12,Lasso MinMax,4391918.0,2095.690251,0.355632
13,Lasso Std,4392439.0,2095.814685,0.355556
3,Ridge implementation,4396074.0,2096.681606,0.355022


In [131]:
metrics_df.sort_values("RMSE", ascending=True).head(5)

Unnamed: 0,Model,MAE,RMSE,R2
15,ElasticNet Std,4274007.0,2067.367213,0.372932
4,Lasso scikit,4389241.0,2095.051634,0.356025
12,Lasso MinMax,4391918.0,2095.690251,0.355632
13,Lasso Std,4392439.0,2095.814685,0.355556
3,Ridge implementation,4396074.0,2096.681606,0.355022


### The Best model the stablest model and  is 15(ElasticNet StandartScaler) 
##### coz this model combines L1(Lasso) and L2(ridge) regulations and StandardScaler standardizes the features to have zero mean and unit variance.

## Bonus part

In [132]:
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

In [133]:
LR_poly = LinearRegression()
LR_poly.fit(X_poly, y_train)
y_pred = LR_poly.predict(X_poly_test)
rmse = root_mean_squared_error(y_test, y_pred)
LR_poly.fit(X_poly, y_train_log)
y_pred_log = LR_poly.predict(X_poly_test)
rmse_log = root_mean_squared_error(y_test, y_pred_log)
print(f"RMSE {rmse} VS Log RMSE {rmse_log}")

RMSE 6854962496.404244 VS Log RMSE 1046558.8048186457


In [134]:
Q1 = np.quantile(X_poly, 0.25)
Q3 = np.quantile(X_poly, 0.75)

IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

mask = ~((X_poly < lower_bound) | (X_poly > upper_bound)).any(axis=1)

X_poly_clean = X_poly[mask]
y_train_clean = y_train[mask]

LR_poly.fit(X_poly_clean, y_train_clean)
y_pred = LR_poly.predict(X_poly_test)
rmse_clean = root_mean_squared_error(y_test, y_pred)
print(f"RMSE {rmse} VS RMSE clean {rmse_clean}")

RMSE 6854962496.404244 VS RMSE clean 1.7737209673386077e+22
