In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

# Load data
df = pd.read_csv('house_price.csv')
X = df[['size', 'bedroom']]
y = df['price']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LINEAR REGRESSION
print("LINEAR REGRESSION")
print("-" * 40)
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Coefficients
coeff_df = pd.DataFrame(lr_model.coef_, X.columns, columns=['Coefficient'])
print("Coefficients:")
print(coeff_df)
print(f"Intercept: {lr_model.intercept_:.2f}")

# Predictions and metrics
y_pred_lr = lr_model.predict(X_test)
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(lr_mse)
lr_mape = mean_absolute_percentage_error(y_test, y_pred_lr)

print(f"\nMetrics:")
print(f"MAE: {lr_mae:.2f}")
print(f"MSE: {lr_mse:.2f}")
print(f"RMSE: {lr_rmse:.2f}")
print(f"MAPE: {lr_mape:.4f} ({lr_mape*100:.2f}%)")

# SGD REGRESSOR
print("\n\nSGD REGRESSOR")
print("-" * 40)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

sgd_model = SGDRegressor(max_iter=1000, random_state=42)
sgd_model.fit(X_train_scaled, y_train)

# Coefficients
sgd_coeff_df = pd.DataFrame(sgd_model.coef_, X.columns, columns=['Coefficient'])
print("Coefficients (scaled features):")
print(sgd_coeff_df)
print(f"Intercept: {sgd_model.intercept_[0]:.2f}")

# Predictions and metrics
y_pred_sgd = sgd_model.predict(X_test_scaled)
sgd_mae = mean_absolute_error(y_test, y_pred_sgd)
sgd_mse = mean_squared_error(y_test, y_pred_sgd)
sgd_rmse = np.sqrt(sgd_mse)
sgd_mape = mean_absolute_percentage_error(y_test, y_pred_sgd)

print(f"\nMetrics:")
print(f"MAE: {sgd_mae:.2f}")
print(f"MSE: {sgd_mse:.2f}")
print(f"RMSE: {sgd_rmse:.2f}")
print(f"MAPE: {sgd_mape:.4f} ({sgd_mape*100:.2f}%)")

# COMPARISON
print("\n\nMODEL COMPARISON")
print("-" * 40)
comparison = pd.DataFrame({
    'Metric': ['MAE', 'MSE', 'RMSE', 'MAPE'],
    'LinearRegression': [lr_mae, lr_mse, lr_rmse, lr_mape],
    'SGDRegressor': [sgd_mae, sgd_mse, sgd_rmse, sgd_mape]
})
print(comparison.round(4))

# METRICS EXPLANATION
print("\n\nMETRICS TRADE-OFFS")
print("-" * 40)
print("MAE: Average absolute error, robust to outliers")
print("MSE: Penalizes large errors heavily, squared units")
print("RMSE: Same units as target, balance between MAE and MSE")
print("MAPE: Scale-independent percentage error, problems with zero values")

print(f"\nRMSE is preferred for regression as it:")
print("- Has same units as target variable")
print("- Penalizes large errors more than MAE")
print("- More interpretable than MSE")

LINEAR REGRESSION
----------------------------------------
Coefficients:
          Coefficient
size       143.218532
bedroom -13512.564426
Intercept: 84763.62

Metrics:
MAE: 72334.75
MSE: 8610424544.78
RMSE: 92792.37
MAPE: 0.1746 (17.46%)


SGD REGRESSOR
----------------------------------------
Coefficients (scaled features):
           Coefficient
size     106535.910237
bedroom  -10274.951289
Intercept: 323155.83

Metrics:
MAE: 72124.61
MSE: 8595003325.39
RMSE: 92709.24
MAPE: 0.1740 (17.40%)


MODEL COMPARISON
----------------------------------------
  Metric  LinearRegression  SGDRegressor
0    MAE      7.233475e+04  7.212461e+04
1    MSE      8.610425e+09  8.595003e+09
2   RMSE      9.279237e+04  9.270924e+04
3   MAPE      1.746000e-01  1.740000e-01


METRICS TRADE-OFFS
----------------------------------------
MAE: Average absolute error, robust to outliers
MSE: Penalizes large errors heavily, squared units
RMSE: Same units as target, balance between MAE and MSE
MAPE: Scale-independ

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Custom MAPE function since sklearn's version is in newer versions
def mean_absolute_percentage_error(y_true, y_pred):
    """
    Calculate Mean Absolute Percentage Error (MAPE)
    """
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# DATA LOADING AND PREPARATION
print("=" * 60)
print("HOUSE PRICE PREDICTION WITH MULTIPLE LINEAR REGRESSION")
print("=" * 60)

# Load the dataset
dataset = pd.read_csv('house_price.csv')
print(f"Dataset shape: {dataset.shape}")
print(f"Dataset info:")
print(dataset.info())
print(f"\nFirst few rows:")
print(dataset.head())
print(f"\nDataset statistics:")
print(dataset.describe())

# Prepare features and target
X = dataset[['size', 'bedroom']]  # Features: size (basement area) and bedroom count
y = dataset['price']              # Target: house price

print(f"\nFeatures (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTrain set shapes:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Test set shapes:")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

# ============================================================================
# MODEL 1: LINEAR REGRESSION
# ============================================================================
print("\n" + "=" * 60)
print("LINEAR REGRESSION MODEL")
print("=" * 60)

# Train Linear Regression model
lr_regressor = LinearRegression()
lr_regressor.fit(X_train, y_train)

# Display coefficients
lr_coeff_df = pd.DataFrame(lr_regressor.coef_, X.columns, columns=['Coefficient'])
print("\nLinear Regression Coefficients:")
print(lr_coeff_df)
print(f"Intercept: {lr_regressor.intercept_:.2f}")

# Make predictions
y_pred_lr = lr_regressor.predict(X_test)

# Calculate metrics
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(lr_mse)
lr_mape = mean_absolute_percentage_error(y_test, y_pred_lr)

print(f"\nLinear Regression Performance:")
print(f"MAE (Mean Absolute Error): {lr_mae:.2f}")
print(f"MSE (Mean Squared Error): {lr_mse:.2f}")
print(f"RMSE (Root Mean Squared Error): {lr_rmse:.2f}")
print(f"MAPE (Mean Absolute Percentage Error): {lr_mape:.2f}%")

# ============================================================================
# MODEL 2: SGD REGRESSOR
# ============================================================================
print("\n" + "=" * 60)
print("SGD REGRESSOR MODEL")
print("=" * 60)

# Scale features for SGD Regressor (important for gradient descent)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train SGD Regressor
sgd_regressor = SGDRegressor(max_iter=1000, random_state=42)
sgd_regressor.fit(X_train_scaled, y_train)

# Display coefficients
sgd_coeff_df = pd.DataFrame(sgd_regressor.coef_, X.columns, columns=['Coefficient'])
print("\nSGD Regressor Coefficients (after scaling):")
print(sgd_coeff_df)
print(f"Intercept: {sgd_regressor.intercept_[0]:.2f}")

# Make predictions
y_pred_sgd = sgd_regressor.predict(X_test_scaled)

# Calculate metrics
sgd_mae = mean_absolute_error(y_test, y_pred_sgd)
sgd_mse = mean_squared_error(y_test, y_pred_sgd)
sgd_rmse = np.sqrt(sgd_mse)
sgd_mape = mean_absolute_percentage_error(y_test, y_pred_sgd)

print(f"\nSGD Regressor Performance:")
print(f"MAE (Mean Absolute Error): {sgd_mae:.2f}")
print(f"MSE (Mean Squared Error): {sgd_mse:.2f}")
print(f"RMSE (Root Mean Squared Error): {sgd_rmse:.2f}")
print(f"MAPE (Mean Absolute Percentage Error): {sgd_mape:.2f}%")

# ============================================================================
# MODEL COMPARISON
# ============================================================================
print("\n" + "=" * 60)
print("MODEL COMPARISON")
print("=" * 60)

comparison_df = pd.DataFrame({
    'Metric': ['MAE', 'MSE', 'RMSE', 'MAPE (%)'],
    'Linear Regression': [lr_mae, lr_mse, lr_rmse, lr_mape],
    'SGD Regressor': [sgd_mae, sgd_mse, sgd_rmse, sgd_mape]
})

print(comparison_df.to_string(index=False, float_format='%.2f'))

# Determine better model
if lr_rmse < sgd_rmse:
    print(f"\nBetter Model: Linear Regression (Lower RMSE: {lr_rmse:.2f} vs {sgd_rmse:.2f})")
else:
    print(f"\nBetter Model: SGD Regressor (Lower RMSE: {sgd_rmse:.2f} vs {lr_rmse:.2f})")

# ============================================================================
# PREDICTIONS COMPARISON
# ============================================================================
print("\n" + "=" * 60)
print("PREDICTIONS COMPARISON")
print("=" * 60)

predictions_df = pd.DataFrame({
    'Actual': y_test.values,
    'Linear Regression': y_pred_lr,
    'SGD Regressor': y_pred_sgd,
    'LR Error': y_test.values - y_pred_lr,
    'SGD Error': y_test.values - y_pred_sgd
})

print("Sample predictions:")
print(predictions_df.head(10).to_string(index=False, float_format='%.2f'))

print("\n" + "=" * 80)
print("EVALUATION METRICS EXPLAINED")
print("=" * 80)

print("""
TRADE-OFFS BETWEEN EVALUATION METRICS:

1. MAE (Mean Absolute Error):
   - Measures average absolute difference between predicted and actual values
   - Units: Same as target variable (dollars in this case)
   - Pros: Easy to interpret, not heavily influenced by outliers
   - Cons: Doesn't penalize large errors heavily
   - Use when: Outliers shouldn't dominate the error assessment

2. MSE (Mean Squared Error):
   - Measures average squared difference between predicted and actual values
   - Units: Squared units of target variable (dollars² in this case)
   - Pros: Heavily penalizes large errors, mathematically convenient
   - Cons: Hard to interpret due to squared units, sensitive to outliers
   - Use when: Large errors are particularly undesirable

3. RMSE (Root Mean Squared Error):
   - Square root of MSE, bringing it back to original units
   - Units: Same as target variable (dollars in this case)
   - Pros: Easy to interpret, penalizes large errors more than MAE
   - Cons: Still sensitive to outliers
   - Use when: You want interpretable units but want to penalize large errors
   - Most commonly used metric for regression problems

4. MAPE (Mean Absolute Percentage Error):
   - Measures average absolute percentage difference
   - Units: Percentage (%)
   - Pros: Scale-independent, easy to interpret across different domains
   - Cons: Problems with zero values, asymmetric (penalizes over-prediction more)
   - Use when: You need scale-independent comparison or percentage-based interpretation

WHEN TO USE EACH METRIC:
- RMSE: Most common choice for regression, good balance of interpretability and error penalty
- MAE: When outliers shouldn't dominate, or when all errors should be weighted equally
- MAPE: When you need percentage-based interpretation or comparing across different scales
- MSE: Primarily used as loss function during training, less for evaluation

RMSE vs MAE Trade-off:
- RMSE ≥ MAE always (due to squaring)
- If RMSE >> MAE: Model has some large errors (outliers present)
- If RMSE ≈ MAE: Errors are relatively uniform
""")

HOUSE PRICE PREDICTION WITH MULTIPLE LINEAR REGRESSION
Dataset shape: (47, 3)
Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   size     47 non-null     int64
 1   bedroom  47 non-null     int64
 2   price    47 non-null     int64
dtypes: int64(3)
memory usage: 1.2 KB
None

First few rows:
   size  bedroom   price
0  2104        3  399900
1  1600        3  329900
2  2400        3  369000
3  1416        2  232000
4  3000        4  539900

Dataset statistics:
              size    bedroom          price
count    47.000000  47.000000      47.000000
mean   2000.680851   3.170213  340412.659574
std     794.702354   0.760982  125039.899586
min     852.000000   1.000000  169900.000000
25%    1432.000000   3.000000  249900.000000
50%    1888.000000   3.000000  299900.000000
75%    2269.000000   4.000000  384450.000000
max    4478.000000   5.000000  6999