**Project Description**:

The goal of this project is to predict house price based on real estate features extracted for houses in Bengaluru. The major steps in the coding process are listed below:

1.   Load and clean up the data.
2.   Analyze the features (*ordinal, categorical*, and *continuous*) for missingness, correct data type association, and distribution.
3.   Split the data into train and test sets.
4.   Build an ML pipeline (*encoder -> imputer -> regression module*) for the train set by specifying separate encoding and imputing steps (if needed) for ordinal, categorical, and continuous features.
5.   Train the ML pipeline on the train set and apply it for prediction on the test set.
6.   Report performance metrics

---

Import libraries

---

In [None]:
import pandas as pd
import numpy as np
import os
import sys
import math
import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.style.use('dark_background')
%matplotlib inline

## Pipeline module
from sklearn.pipeline import Pipeline

## Scaling, encoding, and imputation libraries
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer

# Column transformation library
from sklearn.compose import ColumnTransformer

## Train-test split module
from sklearn.model_selection import train_test_split

## Cross-validation and grid search modules
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV, cross_val_score

## Regression modules
from sklearn import linear_model

## Performance metrics modules
from sklearn.metrics import mean_squared_error, r2_score

---

Mount Google Drive if running in Colab

---

In [None]:
## Mount Google drive folder if running in Colab
# Outside Colab the data folder is resolved relative to the current working directory.
if 'google.colab' not in sys.modules:
    DATA_DIR = 'Data/'
else:
    # Inside Colab: mount Drive, move into the project folder, and use its Data/ subfolder.
    from google.colab import drive
    drive.mount('/content/drive', force_remount = True)
    DIR = '/content/drive/MyDrive/Colab Notebooks/MAHE/MSIS Coursework/EvenSem2024MAHE'
    os.chdir(DIR)
    DATA_DIR = DIR + '/Data/'

---

Load Bengaluru house price data

---

In [None]:
## Load Bengaluru house price data
# The first row of the CSV is used as the column header.
file = DATA_DIR + 'houseprices.csv'
df = pd.read_csv(file, header = 0)

# Report the raw dataset dimensions (rows = samples, columns = features).
print('Bengaluru house price dataset', '-----------', sep = '\n')
print('Initial number of samples = %d' % df.shape[0])
print('Initial number of features = %d\n' % df.shape[1])

# Preview the first five rows.
df.head(5)

---

Clean up data

----

In [None]:
## Clean up data
# NOTE(review): the `?` tokens in this cell look like template placeholders to be
# filled in; the cell is not valid Python until they are replaced.
# Remove 'title' column (inplace = True modifies df directly instead of returning a copy)
df.drop(?, axis = ?, inplace = True)

# Retain only numerical values in area column:
# keep the first space-separated token, strip comma separators, and convert to float
df['area'] = df[?].apply(lambda x:float(x.split(' ')[0].replace(',', '')))

# Retain only numerical values in rent column
def rent_column_modify(val):
    """Convert a textual rent entry into a plain float.

    Entries containing 'Lacs' are read from their first space-separated
    token and multiplied by 1e5 (one lakh = 100,000). All other entries
    are read as-is. In both cases anything after the first '/' is
    discarded and comma separators are removed before conversion.

    Parameters
    ----------
    val : str
        Raw rent string.

    Returns
    -------
    float
        Numeric rent value.
    """
    in_lakhs = 'Lacs' in val
    # For 'Lacs' entries only the leading token carries the number.
    text = val.split(' ')[0] if in_lakhs else val
    amount = float(text.split('/')[0].replace(',', ''))
    return amount * 1e5 if in_lakhs else amount
# NOTE(review): presumably rent_column_modify (defined above) is the function to apply here — confirm
df['rent'] = df['rent'].apply(?)

# Retain only numerical values in the price_per_sqft column
df[?] = df[?].apply(lambda x:float(?))

# Retain only numerical values in BHK columns
df[?] = df[?].apply(lambda x: int(?))

# Change 'Don't Know' entries in 'facing' column to NaN
df['facing'] = df['facing'].apply(lambda x: x if x != "?" else ?)

# Change 'None' entries in 'parking' column to NaN
df['parking'] = df['parking'].apply(lambda x: ?)

# Preview the first five rows after clean-up
df.head(5)

---

Plot percentage of missing values (NaNs) for each feature

---

In [None]:
## Plot percentage of missing values (NaNs) for each feature
cutoff = 10 # we will remove features missing in more than cutoff% of the samples
fig = plt.figure(figsize=(4, 4))
# Per-feature count of missing entries, scaled to a percentage
percent_missing = (df.?().sum() / df.shape[?]) * 100
# Two colors are sampled from the rainbow colormap; a bar uses index 1 when its
# feature is at or below the cutoff and index 0 when it exceeds the cutoff
percent_missing.plot(kind = 'bar', color = cm.rainbow(np.linspace(0, 1, 2))[(percent_missing <= cutoff).values.astype(int)])
# Dashed green horizontal line marking the cutoff level across all bars
plt.plot(np.arange(df.shape[1]), np.repeat(cutoff, df.shape[1]), 'g--')
fig.suptitle('Percentage Missing Values Across All Features', fontsize = 10)
plt.xlabel('Feature', fontsize = 8)
# Trailing semicolon suppresses the notebook's echo of the returned object
plt.ylabel('% Missing Values', fontsize = 8);

---

Create lists of ordinal, categorical, and continuous features

---

In [None]:
## Create lists of ordinal, categorical, and continuous features
ordinal_features = [?, ?]
categorical_features = [?, ?, ?]
# Continuous features are the column names left after two successive column drops (axis = 1)
# NOTE(review): presumably the ordinal and categorical lists are what get dropped — confirm when filling in
continuous_features = (df.drop(?, axis = 1)).drop(?, axis = 1).columns.tolist()

---

Assign 'category' datatype to ordinal and categorical columns

---

In [None]:
## Assign 'category' datatype to ordinal and categorical columns
# Print the dtypes before the conversion
print(df.dtypes)
df[ordinal_features + categorical_features] = df[? + ?].astype(?)
print('----')
# Show the dtypes after the conversion for comparison
df.dtypes

---

Print unique values in each ordinal and categorical feature

---

In [None]:
## Print unique values in each ordinal and categorical feature
# Number of distinct values per ordinal/categorical column
print(df[ordinal_features + categorical_features].nunique())
print('\nUnique values in ordinal and categorical features')
print('---------------------------------------------------')
# Map each column name to the list of its distinct values
unique_values = {col:list(df[?].unique()) for col in ? + ?}
for key, value in ?.items():
  print(key, ?)

---

Remove the target variable column from the list of continuous features

---

In [None]:
## Remove the target variable column from the list of continuous features
# list.remove deletes the first matching entry and raises ValueError if the name is not in the list
continuous_features.remove(?)

---

Plot the distributions of the features

---

---

Train-test split of the dataset

---

In [None]:
## Train and test split of the data
# Feature matrix: df without the 'price_per_sqft' column
X = df.drop('price_per_sqft', axis = ?)
y = ?
# Hold out 10% of the samples as the test set; random_state = 1 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, ?,
                                                    test_size = 0.1,
                                                    random_state = 1)
print('# training samples = %d, # test samples = %d'%(X_train.shape[?], ?))

---

Build pipeline for ordinal, categorical, and continuous features

---

In [None]:
## Build pipeline for ordinal, categorical, and continuous features
# NOTE(review): the `?` tokens in this cell look like template placeholders to be
# filled in; the cell is not valid Python until they are replaced.

# Pipeline object for ordinal features (encoding only, no imputation step)
ordinal_transformer = Pipeline(steps = [('ordinalenc', OrdinalEncoder())])

# Pipeline object for categorical features: impute missing values first, then encode
# (handle_unknown = 'ignore' keeps unseen categories from raising an error at transform time)
categorical_transformer = Pipeline(steps = [('imputer', SimpleImputer(missing_values = ?, strategy = ?)), ('onehotenc', ?(handle_unknown = 'ignore'))])

# Pipeline object for continuous features
continuous_transformer = Pipeline(steps = [('scaler', ?)])

# Create a preprocessor object for all features:
# each transformer is applied only to its own list of columns, and
# remainder = 'passthrough' keeps any column not listed in the three groups unchanged
preprocessor = ColumnTransformer(transformers = [('continuous', continuous_transformer, continuous_features),
                                                 ('categorical', categorical_transformer, categorical_features),
                                                 ('ordinal', ordinal_transformer, ordinal_features)
                                                ],
                                 remainder = 'passthrough'
                                 )

# Define a regressor object (ordinary least-squares linear regression)
regressor = linear_model.LinearRegression()

# Define the entire regression model pipeline
model_pipeline = Pipeline(steps = [('preprocessor', ?), ('regressor', ?)])

---

Fit the pipeline on the train data and test on the test data

---

In [None]:
## Fit the model pipeline on the train data and test on the test data
model_pipeline.fit(?, ?)
# Predictions for the held-out test features
y_pred = model_pipeline.?(X_test)
# Report the error metrics for the test-set predictions
print('Mean Squared Error : {}'.format(mean_squared_error(?, ?)))
print('r2_score : {}'.format(?(y_test, y_pred)))