In [1]:
### This script is to record the model development for the illustration of the prediction page

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import joblib

In [3]:
data = pd.read_csv('../data/raw/listings.csv')

In [6]:
data["bathroom_adjusted"] = data["bathrooms_text"].str.extract(r'([0-9.]+)', expand = False).astype(float)
data["price_adjusted"] = data["price"].str.extract(r'([0-9.]+)', expand = False).astype(float)

In [7]:
data = data[['longitude','latitude','accommodates','room_type','beds','bathroom_adjusted','price_adjusted']]

In [46]:
# Step 1: Data Preparation
data_copy = data.dropna(subset=['price_adjusted']).copy()  # Drop rows with missing target variable
X = data_copy.drop(columns=['price_adjusted'])  # Independent variables
y = data_copy['price_adjusted']  # Dependent variable

# Step 2: Define the preprocessing steps
# Handle missing values
num_cols = X.select_dtypes(include='number').columns
cat_cols = ['room_type']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), num_cols),
        ('cat', OneHotEncoder(), cat_cols)
    ])

# Step 3: Define the models with preprocessing in a Pipeline
linear_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

ridge_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0))
])

rf_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Model Building and Evaluation
models = {'Linear Regression': linear_model, 'Ridge Regression': ridge_model, 'Random Forest Regression': rf_model}
best_model = None
best_mse = float('inf')

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"{name}:")
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")
    
    if mse < best_mse:
        best_model = model
        best_mse = mse

Linear Regression:
Mean Squared Error: 12414.222560145628
R-squared: 0.3773279680203394
Ridge Regression:
Mean Squared Error: 12440.863966672623
R-squared: 0.37599168951747
Random Forest Regression:
Mean Squared Error: 8105.981344444465
R-squared: 0.5934205424076717


In [42]:
# More cross validation with the best model Random Forest
# Step 3: Define the best model with preprocessing in a Pipeline
model_for_cv = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Step 5: Perform cross-validation with the best model
cv_scores = cross_val_score(model_for_cv, X, y, cv=5, scoring='r2')
cv_r2_scores = cv_scores

print("Cross-Validation R2 Scores:", cv_r2_scores)
print("Mean R2:", cv_r2_scores.mean())

Cross-Validation R2 Scores: [0.53427625 0.53558056 0.69076289 0.70898619 0.66815026]
Mean R2: 0.6275512276591882


In [43]:
# Very simple hyperparamter tuning, my computer does not have that much resources
# Step 3: Define the model pipeline
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Step 5: Define the hyperparameter grid
param_grid = {
    'regressor__n_estimators': [100, 200],  # Number of trees in the forest
    'regressor__max_depth': [None, 10]  # Maximum depth of the trees
}

# Step 6: Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Step 7: Best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print("Best Parameters:", best_params)

# Step 8: Evaluate the best model on the test set
y_pred = best_estimator.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nBest Model Performance:")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Best Parameters: {'regressor__max_depth': None, 'regressor__n_estimators': 200}

Best Model Performance:
Mean Squared Error: 8109.2777470813635
R-squared: 0.5932552015884462


In [45]:
import joblib

# Save the best best_estimator
joblib.dump(best_estimator, '../models/price_model.pkl')  # It seems this model is too big, I will change back to the default RF

['../models/price_model.pkl']

In [47]:
# Save the best model
joblib.dump(best_model, '../models/price_model.pkl')  # It seems this model is too big, I will change back to the default RF

['../models/price_model.pkl']

In [13]:
data.describe(include='all')

Unnamed: 0,longitude,latitude,accommodates,room_type,beds,bathroom_adjusted,price_adjusted
count,19741.0,19741.0,19741.0,19741,19620.0,19698.0,18901.0
unique,,,,4,,,
top,,,,Entire home/apt,,,
freq,,,,16025,,,
mean,-123.112161,49.262765,3.54597,,1.933792,1.351025,201.743664
std,0.037972,0.020825,2.057082,,1.180723,0.689773,141.954122
min,-123.221859,49.20296,1.0,,1.0,0.0,1.0
25%,-123.130981,49.2502,2.0,,1.0,1.0,110.0
50%,-123.115493,49.26906,3.0,,2.0,1.0,160.0
75%,-123.090034,49.279,4.0,,2.0,1.5,250.0


In [19]:
new_df = pd.DataFrame(
    {"longitude": -123.105090,
     "latitude": 49.247730,
     "accommodates": 4,
     "room_type": 'Entire home/apt',
     "beds": 3.0,
     "bathroom_adjusted": 2.0},
     index=[0]
)

In [20]:
new_df

Unnamed: 0,longitude,latitude,accommodates,room_type,beds,bathroom_adjusted
0,-123.10509,49.24773,4,Entire home/apt,3.0,2.0


In [22]:
    new_data = pd.DataFrame({
        'longitude': [-123.105090],
        'latitude': [49.247730],
        'accommodates': [4],
        'room_type': ['Entire home/apt'],
        'beds': [3.0],
        'bathroom_adjusted': [2.0]
    })

In [23]:
new_data

Unnamed: 0,longitude,latitude,accommodates,room_type,beds,bathroom_adjusted
0,-123.10509,49.24773,4,Entire home/apt,3.0,2.0
