<h1 style='color:purple' align='center'>Data Science Regression Project: Predicting Home Prices in Canada</h1>

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib 
matplotlib.rcParams["figure.figsize"] = (20,10)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

<h2 style='color:blue'>Data Load: Load Canada home prices into a dataframe</h2>

In [None]:
df1 = pd.read_csv('house_listings_data.csv',encoding='latin-1')


In [None]:
df1.head()

In [None]:
df1.shape

In [None]:
df1.columns

In [None]:
df1['Province'].unique()

In [None]:
df1['Number_Beds'].value_counts()

In [None]:
df1['Number_Baths'].unique()

**Drop features that are not required to build our model**

In [None]:
# Province is not used as a model feature, so drop it before cleaning.
df2 = df1.drop(['Province'],axis='columns')
df2.shape

<h2 style='color:blue'>Data Cleaning: Handle NA values</h2>

In [None]:
df2.isnull().sum()

In [None]:
df2.shape

In [None]:
df3 = df2.dropna()
df3.isnull().sum()

In [None]:
df3.shape

<h2 style='color:blue'>Feature Engineering</h2>

**There are houses with more than 16 rooms**

In [None]:
df3_filter= df3[df3['Number_Beds']>=6]
df3_filter

In [None]:
# Strict '<': rows with 6 or more beds are already captured in df3_filter (>= 6) and are
# concatenated back into df4 later, so '<=' would duplicate every 6-bed listing.
df4=df3[df3['Number_Beds']<6]

In [None]:
average_beds=df4['Number_Beds'].mean()
average_baths=df4['Number_Baths'].mean()

In [None]:
average_rnd_beds = round(average_beds)
average_rnd_baths = round(average_baths)

In [None]:
print("Average number of beds:", average_rnd_beds)
print("Average number of baths:", average_rnd_baths)

**Replace outlier bed and bath counts with the rounded averages computed above**

In [None]:
# Use the rounded mean computed above instead of a hard-coded 3, and rebind via assign()
# rather than writing into a filtered slice (avoids SettingWithCopyWarning).
df3_filter = df3_filter.assign(Number_Beds=average_rnd_beds)

In [None]:
# Use the rounded mean computed above instead of a hard-coded 3, and rebind via assign()
# rather than writing into a filtered slice (avoids SettingWithCopyWarning).
df3_filter = df3_filter.assign(Number_Baths=average_rnd_baths)

In [None]:
df3_filter

In [None]:
df4= pd.concat([df4,df3_filter],ignore_index= True)

In [None]:
df4.head()

In [None]:
# Replace bathroom-count outliers (>= 6) with the rounded mean computed earlier
df4.loc[df4['Number_Baths'] >= 6, 'Number_Baths'] = average_rnd_baths
print("Replaced Number_Baths >= 6 with:", average_rnd_baths)

# Quick check. Only the last expression of a cell is rendered automatically,
# so the counts are printed explicitly; otherwise they are silently discarded.
print("Number_Baths counts after replacement:")
print(df4['Number_Baths'].value_counts())

# Show any remaining rows with Number_Baths >= 6 (should be none)
df4[df4['Number_Baths'] >= 6]

In [None]:
df5=df4.copy()

In [None]:
df5

In [None]:
df5.to_csv("bhp.csv",index=False)

**Examine locations, which are a categorical variable. We need to apply a dimensionality reduction technique here to reduce the number of locations**

In [None]:
# Trim stray whitespace so the same city is not counted under several spellings.
# The vectorised .str accessor replaces the per-row lambda and passes missing values through
# instead of raising on them.
df5['City'] = df5['City'].str.strip()
# Listings per city, most frequent first.
location_stats = df5['City'].value_counts(ascending=False)
location_stats

In [None]:
location_stats.values.sum()

In [None]:
len(location_stats[location_stats<200])

In [None]:
len(location_stats)

In [None]:
len(location_stats[location_stats<=1000])

<h2 style="color:blue">Dimensionality Reduction</h2>

**Any location having 1000 or fewer data points should be tagged as "other" location. This way the number of categories can be reduced by a huge amount. Later on, when we do one hot encoding, it will help us by producing fewer dummy columns**

In [None]:
location_stats_less_than_1000 = location_stats[location_stats<=1000]
location_stats_less_than_1000

In [None]:
len(df5.City.unique())

In [None]:
# Cities listed in location_stats_less_than_1000 (its index holds the city names)
# are collapsed into a single 'other' bucket; every other city keeps its name.
is_rare_city = df5.City.isin(location_stats_less_than_1000.index)
df5.City = df5.City.where(~is_rare_city, 'other')
len(df5.City.unique())

In [None]:
df5.head(10)

In [None]:
df6=df5.copy()

<h2 style='color:blue'>Outlier Removal Using Mean</h2>

In [None]:
df6.Price.describe()

In [None]:
df6[df6['Price']>20000000.0]

In [None]:
df6_filter = df6[df6['Price']>=20000000.0]
df6_filter


Replace all home prices of 20M or more with the mean price of the remaining data

In [None]:
mean_price = df6[df6['Price']<=20000000.0]['Price'].mean()
mean_price

In [None]:
# Rebind via assign() instead of writing into a filtered slice of df6
# (avoids SettingWithCopyWarning and keeps the cell safe to re-run).
df6_filter = df6_filter.assign(Price=mean_price)
df6_filter

In [None]:
# Strict '<': df6_filter already holds every row with Price >= 20M, so using '<=' here
# would add any listing priced at exactly 20M twice.
df7= pd.concat([df6[df6['Price']<20000000.0],df6_filter],ignore_index=True)
df7.describe()

**Here we find that the min price is 21500 whereas the max is still 19880000.0; this shows a wide variation in property prices. We should remove outliers per location using the mean and one standard deviation**

In [None]:
def remove_pps_outliers(df):
    """Keep, per city, only the rows whose Price lies within one standard deviation of the city mean.

    For every group of ``df.groupby('City')`` the mean ``m`` and the population
    standard deviation ``st`` (ddof=0) of ``Price`` are computed, and rows with
    ``m - st < Price <= m + st`` are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``City`` and ``Price``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, grouped by city in groupby order, with a fresh
        0..n-1 index. If no group is produced (empty input) an empty frame with
        the same columns as ``df`` is returned.
    """
    kept_groups = []
    for _, city_rows in df.groupby('City'):
        prices = city_rows.Price
        m = prices.mean()
        # ddof=0 matches what np.std() computed in the previous implementation.
        st = prices.std(ddof=0)
        within_band = (prices > (m - st)) & (prices <= (m + st))
        kept_groups.append(city_rows[within_band])

    if not kept_groups:
        # pd.concat() refuses an empty list; hand back an empty frame with the input's columns.
        return df.iloc[:0].copy()

    # Concatenate once at the end instead of growing a frame inside the loop (quadratic copying).
    return pd.concat(kept_groups, ignore_index=True)
df8 = remove_pps_outliers(df7)
df8.shape

**Let's check how the 2-bedroom and 3-bedroom property prices look for a given location**

In [None]:
def plot_scatter_chart(df, city):
    """Scatter Price against Number_Baths for the 2- and 3-bedroom listings of one city.

    Rows of ``df`` whose ``City`` equals ``city`` are split by ``Number_Beds``
    (2 vs. 3) and drawn as two series on the current pyplot figure.
    Note: this also sets the global default figure size to (15, 10).
    """
    in_city = df.City == city
    two_bed = df[in_city & (df.Number_Beds == 2)]
    three_bed = df[in_city & (df.Number_Beds == 3)]

    matplotlib.rcParams['figure.figsize'] = (15, 10)

    # Blue dots: 2-bedroom listings; green plus signs: 3-bedroom listings.
    plt.scatter(two_bed.Number_Baths, two_bed.Price,
                color='blue', label='2 BR', s=50)
    plt.scatter(three_bed.Number_Baths, three_bed.Price,
                marker='+', color='green', label='3 BR', s=50)

    plt.title(city)
    plt.xlabel("Number of bathrooms")
    plt.ylabel("Price ($CAD)")
    plt.legend()

plot_scatter_chart(df8, "Toronto")

In [None]:
plot_scatter_chart(df8,"Barrie")

In [None]:
plot_scatter_chart(df8, "Vancouver")

In [None]:
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)
plt.hist(df8.Price,rwidth=0.8)
plt.xlabel("Price")
plt.ylabel("Count")

<h2 style='color:blue'>Outlier Removal Using Bathrooms Feature</h2>

In [None]:
df8.Number_Baths.unique()

In [None]:
plt.hist(df8.Number_Baths,rwidth=0.8)
plt.xlabel("Number of bathrooms")
plt.ylabel("Count")

In [None]:
df8[df8.Number_Baths>10]

In [None]:
df9=df8.copy()
df9

<h2 style='color:blue'>Use One Hot Encoding For Location</h2>

In [None]:
dummies = pd.get_dummies(df9.City)
dummies.head(3)

In [None]:
# Append the city indicator columns. The 'other' column is left out, so a row with
# all city indicators at 0 represents the 'other' bucket.
df10 = pd.concat([df9,dummies.drop('other',axis='columns')],axis='columns')
df10.head()

In [None]:
df11 = df10.drop('City',axis='columns')
df11.head(2)

In [None]:
df12 = df11.drop('Address',axis='columns')
df12.head(2)

In [None]:
# Build df12 from df11 in one step so this cell is idempotent: the previous
# `df12 = df12.drop(...)` raised KeyError when re-run because the columns were already gone.
# 'Address' is included so the result is the same whether or not the preceding cell ran.
df12 = df11.drop(['Address','Latitude','Longitude','Population','Median_Family_Income'],axis='columns')
df12.head(2)

<h2 style='color:blue'>Build a Model Now...</h2>

In [None]:
df12.shape

In [None]:
X = df12.drop(['Price'],axis='columns')
X.head(3)

In [None]:
X

In [None]:
X.shape

In [None]:
y = df12.Price
y.head(3)

In [None]:
len(y)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [None]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the training split, then score it on the held-out test split.
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

<h2 style='color:blue'>Use shuffle-split cross validation to measure accuracy of our LinearRegression model</h2>

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Five random splits, each holding out 20% of the rows; random_state fixes the shuffling.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

# One score per split, from a fresh LinearRegression each time.
cross_val_score(LinearRegression(), X, y, cv=cv)

**We can see that in 5 iterations we get a score above 80% all the time. This is pretty good but we want to test few other algorithms for regression to see if we can get even better score. We will use GridSearchCV for this purpose**

<h2 style='color:blue'>Find best model using GridSearchCV</h2>

In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,y):
    """Grid-search three regressors and report the best score and parameters of each.

    Each candidate (linear regression, lasso, decision tree) is tuned with
    ``GridSearchCV`` over 5 shuffle-split folds that each hold out 20% of the data.

    Parameters
    ----------
    X : feature matrix passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm with the columns ``model``, ``best_score`` and
        ``best_params``.
    """
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                # Only options that change the fitted model are searched.
                # copy_X / n_jobs control memory copying and parallelism, and tol
                # is rejected by older scikit-learn releases, so they only
                # multiplied the number of fits without affecting the score.
                'fit_intercept': [True, False],
                'positive': [True, False]
            }
        },
        'lasso': {
            # Seeded so selection='random' gives repeatable results.
            'model': Lasso(random_state=0),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            # Seeded so splitter='random' gives repeatable results.
            'model': DecisionTreeRegressor(random_state=0),
            'params': {
                # 'mse' was renamed to 'squared_error' and is no longer accepted
                # by current scikit-learn releases.
                'criterion' : ['squared_error','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

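**Based on the above results we can say that the decision tree gives the best score. Hence we will use that.** (Note: the prediction and export cells below still use the fitted linear regression model `lr_clf`.)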

<h2 style='color:blue'>Test the model for few properties</h2>

In [None]:
def predict_price(City,Number_Bed,Number_Baths):
    """Predict the price of a listing with the fitted linear model ``lr_clf``.

    A single feature row is built in the column layout of the training frame
    ``X``: the bed and bath counts go into the ``Number_Beds`` / ``Number_Baths``
    columns and the indicator column named after ``City`` is set to 1.

    Parameters
    ----------
    City : str
        City name. A name with no matching column in ``X`` (for example
        ``'other'``, whose indicator column was dropped during encoding) leaves
        every city indicator at 0 instead of raising ``IndexError``.
    Number_Bed : number of bedrooms.
    Number_Baths : number of bathrooms.

    Returns
    -------
    The first (and only) element of ``lr_clf.predict`` for the constructed row.
    """
    feature_names = X.columns

    x = np.zeros(len(feature_names))
    # Look the positions up by name rather than assuming beds/baths are columns 0 and 1.
    x[feature_names.get_loc('Number_Beds')] = Number_Bed
    x[feature_names.get_loc('Number_Baths')] = Number_Baths
    if City in feature_names:
        x[feature_names.get_loc(City)] = 1

    # Predict from a named one-row frame so the columns line up with the training data.
    row = pd.DataFrame([x], columns=feature_names)
    return lr_clf.predict(row)[0]

In [None]:
predict_price('Toronto',2, 2)

In [None]:
predict_price('Barrie',2, 2)

In [None]:
predict_price('Vancouver',2, 2)

<h2 style='color:blue'>Export the tested model to a pickle file</h2>

In [None]:
import pickle
# Serialize the fitted linear regression model (lr_clf) to disk in binary mode.
with open('canada_home_prices_model.pickle','wb') as f:
    pickle.dump(lr_clf,f)

<h2 style='color:blue'>Export location and column information to a file that will be useful later on in our prediction application</h2>

In [None]:
import json
# Record the lower-cased feature names, in the column order of X, for the prediction app.
columns = {'data_columns': [name.lower() for name in X.columns]}
with open("columns.json","w") as f:
    json.dump(columns, f)

<h2 style='color:blue'>Add log-transformed Population and Median Family Income</h2>
We will automatically detect the population and median income columns (case-insensitive),
convert them to numeric, clip non-positive values to 0, and compute `log1p` features.
These features are then added to the model for improved accuracy, as price often
scales non-linearly with population and income.

In [None]:
# # Detect and create log features for population and median income
# import numpy as np
# import pandas as pd

# # Use the cleaned dataset previously prepared
# df_log = df9.copy()  # df9 exists earlier after cleaning steps

# def find_col(cols, patterns_any, patterns_all=None):
#     patterns_all = patterns_all or []
#     for c in cols:
#         s = c.lower().replace('_', ' ').strip()
#         if any(p in s for p in patterns_any) and all(p in s for p in patterns_all):
#             return c
#     return None

# # Try to find candidate columns (case-insensitive)
# pop_col = find_col(df_log.columns, patterns_any=['popul', 'pop'], patterns_all=[])
# inc_col = (
#     find_col(df_log.columns, patterns_any=['median'], patterns_all=['income'])
#     or find_col(df_log.columns, patterns_any=['income'], patterns_all=['median'])
#     or find_col(df_log.columns, patterns_any=['median'], patterns_all=['family', 'income'])
# )

# detected = {'population_column': pop_col, 'median_income_column': inc_col}
# print('Detected columns:', detected)

# # Create numeric versions and log1p features if available
# if pop_col is not None:
#     df_log[pop_col] = pd.to_numeric(df_log[pop_col], errors='coerce').fillna(0)
#     df_log[pop_col] = df_log[pop_col].clip(lower=0)
#     df_log['log_population'] = np.log1p(df_log[pop_col])
# else:
#     print('Population column not found; skipping log_population.')

# if inc_col is not None:
#     df_log[inc_col] = pd.to_numeric(df_log[inc_col], errors='coerce').fillna(0)
#     df_log[inc_col] = df_log[inc_col].clip(lower=0)
#     df_log['log_median_income'] = np.log1p(df_log[inc_col])
# else:
#     print('Median income column not found; skipping log_median_income.')

# # Quick sanity summary of new features if present
# for c in ['log_population', 'log_median_income']:
#     if c in df_log.columns:
#         print(c, 'summary:')
#         print(df_log[c].describe())

In [None]:
# # Rebuild CV pipeline including log features
# from sklearn.model_selection import KFold, cross_val_score
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
# from sklearn.pipeline import Pipeline
# from sklearn.linear_model import LinearRegression, Ridge
# from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# df_model2 = df_log.copy()

# base_features = ['Number_Beds', 'Number_Baths', 'City']
# log_features = [c for c in ['log_population', 'log_median_income'] if c in df_model2.columns]
# feature_cols2 = base_features + log_features
# print('Using features:', feature_cols2)

# # Drop rows with any missing in selected features/target
# df_model2 = df_model2.dropna(subset=feature_cols2 + ['Price'])
# X2 = df_model2[feature_cols2]
# y2 = df_model2['Price']

# # One-hot for City; pass-through numeric (beds, baths, logs)
# try:
#     ohe2 = OneHotEncoder(handle_unknown='ignore', min_frequency=50)
# except TypeError:
#     ohe2 = OneHotEncoder(handle_unknown='ignore')

# preprocess2 = ColumnTransformer([
#     ('city', ohe2, ['City'])
# ], remainder='passthrough')

# models2 = {
#     'Linear': LinearRegression(),
#     'Ridge': Ridge(alpha=1.0),
#     'RandomForest': RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1),
#     'GBDT': GradientBoostingRegressor(random_state=42)
# }

# results2 = []
# cv2 = KFold(n_splits=5, shuffle=True, random_state=42)

# for name, base_model in models2.items():
#     pipe = Pipeline(steps=[('prep', preprocess2), ('reg', base_model)])
#     model = TransformedTargetRegressor(regressor=pipe, func=np.log1p, inverse_func=np.expm1)
#     scores = cross_val_score(model, X2, y2, scoring='r2', cv=cv2, n_jobs=-1)
#     results2.append({'model': name, 'mean_r2': scores.mean(), 'std_r2': scores.std(), 'all_scores': scores})

# pd.DataFrame(results2).sort_values('mean_r2', ascending=False)