<a href="https://colab.research.google.com/github/SarkarPriyanshu/USHousingMarketAnalysis/blob/main/03_Model_Development_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
!pip install -q feature-engine

In [59]:
# Import necessary libraries
import pandas as pd
import numpy as np
import scipy.stats as stats

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from feature_engine.transformation import BoxCoxTransformer
from sklearn.linear_model import LinearRegression,BayesianRidge,LassoLars,SGDRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error,r2_score

# Ignore warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')


In [21]:
df = pd.read_csv('/content/USHousing',parse_dates=['DATE']).iloc[:,2:]

In [22]:
df.shape

(86, 8)

In [23]:
df.sample(5)

Unnamed: 0,CSUSHPISA,EVACANTUSQ176N,GDP,INTDSRUSM193N,MSACSR,PERMIT,UMCSENT,MSPUS
47,136.607,18476.0,16068.805,0.75,5.3,712.0,75.0,238400.0
83,239.559,15602.0,22600.185,0.25,4.0,1866.0,79.0,369800.0
48,139.154,18519.0,16207.115,0.75,4.9,732.0,76.4,238700.0
18,155.75,15361.0,12527.214,2.75,3.9,2097.0,91.7,228800.0
11,128.461,14908.0,11174.129,2.25,4.0,1808.0,82.4,186000.0


# Model Development

In [38]:
# Splitting the dataset into features (X) and target variable (y)

data_boxcox, lambda_boxcox = stats.boxcox(df['CSUSHPISA'])
X, y = df.drop('CSUSHPISA', axis=1), data_boxcox

# Splitting the data into training and testing sets
# The test set will be 30% of the data, using a random state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)


In [39]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((60, 7), (26, 7), (60,), (26,))

In [62]:
models_list = [{'LinearRegression':LinearRegression()},{'BayesianRidge':BayesianRidge()},{'LassoLars':LassoLars()},{'SGDRegressor':SGDRegressor()},{'SVR':SVR()}]

In [63]:
# Initialize an empty list to store results
results_list = list()

# Loop through models_list
for model_info in models_list:
  # Create a pipeline for each model
  pipe = Pipeline([
        # Apply BoxCoxTransformer to variables in df columns
        ('BoxCoxTransformer', BoxCoxTransformer(variables=list(df.columns[1:]))),
        # Standardize the features
        ('StandardScaler', StandardScaler()),
        # Add the specific model to the pipeline
        (list(model_info.keys())[0], list(model_info.values())[0])  # model_info[0] is the model name, model_info[-1] is the model object
  ])

  # Fit the pipeline on training data
  pipe.fit(X_train, y_train)

  # Make predictions on training and test data
  X_train_predictions = pipe.predict(X_train)
  X_test_predictions = pipe.predict(X_test)

  # Store model name and its performance metrics in results_list
  results_list.append({
        'Model Name': list(model_info.keys())[0],
        'Train Data R2': r2_score(y_train, X_train_predictions),
        'Test Data R2': r2_score(y_test, X_test_predictions),
        'R2 Diff': r2_score(y_train, X_train_predictions) - r2_score(y_test, X_test_predictions),
        'Train Data RMSE': mean_squared_error(y_train, X_train_predictions,squared=True),
        'Test Data RMSE': mean_squared_error(y_test, X_test_predictions,squared=True),
        'RMSE Diff': mean_squared_error(y_train, X_train_predictions,squared=True) -  mean_squared_error(y_test, X_test_predictions,squared=True)
  })


In [64]:
# Initializing variables to track the best models and their respective metrics

# Loop through the results_list to analyze model performance
for result in results_list:
    # Display performance metrics for each model
    print(f"Name: {result['Model Name']:{25}} TrainR2: {np.round(result['Train Data R2'],2)}, TestR2: {np.round(result['Test Data R2'],2)}, DiffR2: {np.round(result['R2 Diff'],2)}")
    print(f"Name: {result['Model Name']:{25}} TrainRMSE: {np.round(result['Train Data RMSE'],2)}, TestRMSE: {np.round(result['Test Data RMSE'],2)}, DiffRMSE: {np.round(result['RMSE Diff'],2)}")
    print()
    print()


Name: LinearRegression          TrainR2: 0.97, TestR2: 0.95, DiffR2: 0.02
Name: LinearRegression          TrainRMSE: 0.01, TestRMSE: 0.01, DiffRMSE: -0.0


Name: BayesianRidge             TrainR2: 0.97, TestR2: 0.95, DiffR2: 0.02
Name: BayesianRidge             TrainRMSE: 0.01, TestRMSE: 0.01, DiffRMSE: -0.0


Name: LassoLars                 TrainR2: 0.0, TestR2: -0.07, DiffR2: 0.07
Name: LassoLars                 TrainRMSE: 0.24, TestRMSE: 0.2, DiffRMSE: 0.05


Name: SGDRegressor              TrainR2: 0.94, TestR2: 0.87, DiffR2: 0.08
Name: SGDRegressor              TrainRMSE: 0.01, TestRMSE: 0.02, DiffRMSE: -0.01


Name: SVR                       TrainR2: 0.98, TestR2: 0.96, DiffR2: 0.02
Name: SVR                       TrainRMSE: 0.01, TestRMSE: 0.01, DiffRMSE: -0.0




<blockquote style="background-color: #f7dc6f; padding: 10px; border-radius: 5px;">
  <h2 style="color: #333;">Observation Of Model Development</h2>
  <ul>
    <li><strong>Question: Why not use tree-based or deep neural network?</strong>
      <ul>
        <li>We do not want to draw decision boundaries because, as we saw while analyzing things like GDP, Median Sales, or Vacant, or any other aspect affects our Target columns.</li>
        <li>The point here is in the real world, we can't say things go as we expected. That's why we want our model to learn from real-time data observations and predict outcomes based on that.</li>
        <li>In tree-based models, we draw decision boundaries that are good for scenarios with limited possibilities.</li>
        <li>Then what is wrong with Neural Network? We want to know which factors affect more or less in predicting outcomes, which is not known if we work with neural networks.</li>
        <li>Hence, we go with Linear Models. The advantage is they are less complex, and we say there is a linear correlation between the target and independent columns.</li>
        <li>We are going to experement with `Linear Regressor`, `SVM` and `BayesianRidge`</li>
      </ul>
    </li>
  </ul>
</blockquote>
