# **Importing the necessary Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Importing the Boston Dataset
# Load the housing records from the CSV file (path is relative to the
# notebook's working directory) and preview the first rows.
df = pd.read_csv("BostonHousingData.csv")
df.head()

The Boston Housing Dataset is derived from information collected by the U.S. Census Service concerning housing in the area of Boston, MA. The following describes the dataset columns:

* CRIM - per capita crime rate by town
* ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
* INDUS - proportion of non-retail business acres per town.
* CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
* NOX - nitric oxides concentration (parts per 10 million)
* RM - average number of rooms per dwelling
* AGE - proportion of owner-occupied units built prior to 1940
* DIS - weighted distances to five Boston employment centres
* RAD - index of accessibility to radial highways
* TAX - full-value property-tax rate per $10,000
* PTRATIO - pupil-teacher ratio by town
* B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
* LSTAT - % lower status of the population

* MEDV - Median value of owner-occupied homes in $1000's

# **Business Understanding**
The Boston housing dataset is primarily used for predicting housing prices in Boston. The stakeholders of the Boston dataset could include investors, customers interested in purchasing houses, job seekers, housing competitors, and the government.

**Problem statement**
The goal is to develop a predictive model that accurately estimates housing prices in the Boston area. The problem arises from the complexity of various factors influencing the prices, such as crime rates, property tax, accessibility to highways, average number of rooms, and environmental quality. By building a reliable model, stakeholders can make informed decisions related to real estate investments, policy-making, market competition, and personal home purchases.

**Objectives**
1. Predicting housing prices using various models
2. Identify key factors influencing housing prices
3. Support decision making for stakeholders

**Research Question**
What are the key factors that influence housing prices in Boston, and how accurately can we predict the prices of homes based on these factors?


**Assess Situation**

In [None]:
# Summarise each column's dtype and non-null count to spot missing values.
df.info()

In [None]:
# Percentage of missing values in each column.
df.isnull().mean()*100

In [None]:
# Remove every row that has at least one missing value, modifying df in place.
df.dropna(inplace = True)

In [None]:
# Re-check the column summary after dropping rows with missing values.
df.info()

**Continuous Data Plots**
* Histogram
* Scatter Plot
* Box Plot

In [None]:
# Correlation
# Pairwise correlation between all columns of df.
df.corr()

**Correlation Interpretation**

Correlation ranges from -1 to +1
* -1 = Perfect negative Correlation
* -0.6 to -0.9 = High Negative Correlation
* -0.3 to -0.5 = Moderate negative correlation
* -0.1 to -0.2 = Low negative correlation

* 0 = No correlation

* +1 = Perfect positive Correlation
* +0.6 to +0.9 = High positive Correlation
* +0.3 to +0.5 = Moderate positive correlation
* +0.1 to +0.2 = Low positive correlation





In [None]:
# Annotated heatmap of the pairwise correlation matrix.
correlations = df.corr()
plt.figure(figsize=(15, 8))
sns.heatmap(data=correlations, annot=True, cmap="YlGnBu")

In [None]:
# Scatter plot of the RM column against the MEDV column.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter('RM', 'MEDV', data=df)
ax.set_xlabel("Number of Rooms")
ax.set_ylabel("Median Value of Houses")
ax.set_title("Relationship between Number of Rooms and the Median Value of the Houses")
plt.show()

Y = a + bX



In [None]:
# Scatter plot of the DIS column against the NOX column.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter('DIS', 'NOX', data=df)
ax.set_xlabel("Distance to 5 employment centres")
ax.set_ylabel("Nitric Polution")
ax.set_title("Relationship between the distance to 5 employment centers and the Nitric Polution")
plt.show()

In [None]:
# Box plot of the AGE column to inspect its spread and outliers.
plt.figure(figsize = (10,6))
# NOTE(review): the Series is passed positionally; which parameter it binds to
# (and therefore the box orientation) may depend on the installed seaborn
# version - confirm and consider passing it by keyword.
sns.boxplot(df['AGE'])

In [None]:
# Histogram of every column to inspect the shape of each distribution.
df.hist(figsize = (15,8))

In [None]:
# Keep an untransformed copy of the cleaned data; the later model sections
# restore df from this copy.
df_cleaned = df.copy()

## **Transforming the Data**
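The histograms above show that the data is not normally distributed, so a log transformation (`log1p`, i.e. log(1 + x)) is applied to the columns before fitting the linear model.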

In [None]:
# Columns to transform
columns_to_transform = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT', 'B', 'MEDV',
]

# Apply log(1 + x) to all selected columns in a single vectorised assignment
df[columns_to_transform] = np.log1p(df[columns_to_transform])

In [None]:
# Re-plot the histograms to see the effect of the log transformation.
df.hist(figsize = (15,8))

## **Model Building**
### **Linear Regression Model**
**Import the necessary libraries**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Metrics for testing the model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Linear regression estimator with default settings.
reg = LinearRegression()

In [None]:
# Predictors are every column except MEDV; the target is MEDV.
# df was log1p-transformed earlier, so both are on the log scale here.
df_x = df.drop(['MEDV'], axis = 1)
df_y = df['MEDV']

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so the split, and
# therefore the reported metrics, are the same on every run.
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size = 0.20, random_state = 100)

In [None]:
# Fit the linear model on the training split.
reg.fit(x_train, y_train)

In [None]:
# Keep the raw predictions so the metric cells score the model itself rather
# than rounded values; the target is on the log1p scale, where rounding to one
# decimal is a large relative error. Rounding is applied only for display.
y_predicted = reg.predict(x_test)
np.round(y_predicted, 1)

In [None]:
# Actual test-set targets rounded to one decimal, for a visual comparison
# with the predictions.
np.round(np.array(y_test), 1)

In [None]:
# Fitted coefficients of the linear model.
print(reg.coef_)

In [None]:
# Fitted intercept term of the linear model.
print(reg.intercept_)

In [None]:
# Mean squared error on the test split. MEDV was log1p-transformed for this
# model, so the error is on the log scale.
mean_squared_error(y_test, y_predicted)

In [None]:
# Coefficient of determination (R^2) on the test split.
r2_score(y_test, y_predicted)

In [None]:
# Predict for one hypothetical record: 3 in the first predictor column and 0 in
# every other column. The values are on the log1p scale the model was trained on.
# The row is wrapped in a DataFrame carrying df_x's column names so it matches
# the feature names seen during fit, and its width follows df_x instead of
# being hard-coded.
sample = pd.DataFrame([[3] + [0] * (df_x.shape[1] - 1)], columns = df_x.columns)
reg.predict(sample)

## **Decision Trees**

In [None]:
# Restore the untransformed data so the tree model is trained on original values.
df = df_cleaned.copy()

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

# Know the max_depth and min_samples_leaf needed for the decision tree
from sklearn.model_selection import GridSearchCV

# Checking for the performance metrics of the decision tree
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [None]:
# Predictors are every column except MEDV; the target is MEDV.
df_x = df.drop(['MEDV'], axis = 1)
df_y = df['MEDV']

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so the split, the
# grid-search result and the reported metrics are the same on every run.
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size = 0.20, random_state = 100)

In [None]:
# Candidate tree depths and minimum leaf sizes to evaluate.
param_grid = dict(
    max_depth=[1, 5, 10, 15],
    min_samples_leaf=[1, 2, 4, 8],
)

# 5-fold cross-validated search over every combination in the grid.
grid_search = GridSearchCV(estimator=DecisionTreeRegressor(), param_grid=param_grid, cv=5)
grid_search.fit(x_train, y_train)

print("Best Parameters:", grid_search.best_params_)

In [None]:
# Decision tree regressor with fixed hyperparameters. This rebinds `reg`, which
# previously held the linear model.
# NOTE(review): max_depth and min_samples_leaf are presumably copied from an
# earlier grid-search run - confirm they still match grid_search.best_params_.
reg = DecisionTreeRegressor(criterion = 'squared_error', random_state = 100, max_depth = 10, min_samples_leaf = 4)

In [None]:
# Fit the decision tree on the training split.
reg.fit(x_train, y_train)

In [None]:
# Keep the raw predictions so the metric cells score the model itself rather
# than rounded values; rounding is applied only for display.
y_predicted = reg.predict(x_test)
np.round(y_predicted, 1)

In [None]:
# Actual test-set targets rounded to one decimal, for a visual comparison
# with the predictions.
np.round(np.array(y_test), 1)

In [None]:
# Mean squared error of the decision tree on the test split.
mean_squared_error(y_test, y_predicted)

In [None]:
# Coefficient of determination (R^2) of the decision tree on the test split.
r2_score(y_test, y_predicted)

In [None]:
# Draw the fitted decision tree. max_depth = 3 is passed to the plotting call
# only; the fitted model itself is not changed.
from sklearn.tree import plot_tree
plt.figure(figsize = (20,10))
plot_tree(reg, filled = True, max_depth = 3)
plt.show()

## **Random Forest**

In [None]:
# Start the Random Forest section from the untransformed cleaned data.
df = df_cleaned.copy()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Library to use for the Hyperparameters for the RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Checking for the performance metrics of the random forest
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [None]:
# Predictors are every column except MEDV; the target is MEDV.
df_x = df.drop(['MEDV'], axis = 1)
df_y = df['MEDV']

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so the split, the
# hyperparameter search and the reported metrics are the same on every run.
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size = 0.20, random_state = 100)

In [None]:
# Randomised hyperparameter search for the Random Forest.
# Both the search and the estimator are seeded so the sampled candidates and
# the fitted forests, and therefore the reported best parameters, are reproducible.
hyper_parameters = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=100),
    param_distributions={
        'n_estimators': [100],
        'max_depth': [int(x) for x in np.linspace(10, 120, num = 12)],
        'min_samples_split': [2, 6, 10],
        'min_samples_leaf': [1, 3, 4],
        'bootstrap': [True, False]
    },
    random_state=100,
)

hyper_parameters.fit(x_train, y_train)
print(hyper_parameters.best_params_)

In [None]:
# Random Forest with fixed hyperparameters. random_state is set, as it is for
# the decision tree, so that repeated runs build the same forest.
# NOTE(review): the hyperparameter values are presumably copied from an earlier
# search run - confirm they still match hyper_parameters.best_params_.
forest = RandomForestRegressor(n_estimators = 100, max_depth = 10, min_samples_split = 2, min_samples_leaf = 1, bootstrap = True, random_state = 100)

In [None]:
# Fit the Random Forest on the training split.
forest.fit(x_train, y_train)

In [None]:
# Predict with the Random Forest. The previous code called `reg`, which is the
# decision tree from the earlier section, so the metrics below did not
# describe the forest at all.
# Raw predictions are kept for the metric cells; rounding is for display only.
y_predicted = forest.predict(x_test)
np.round(y_predicted, 1)

In [None]:
# Actual test-set targets rounded to one decimal, for a visual comparison
# with the predictions.
np.round(np.array(y_test), 1)

In [None]:
# Mean squared error between the test targets and y_predicted.
mean_squared_error(y_test, y_predicted)

In [None]:
# Coefficient of determination (R^2) between the test targets and y_predicted.
r2_score(y_test, y_predicted)

In [None]:
# Random Forest Plot
# Draws only the first tree of the fitted forest. Relies on plot_tree having
# been imported in the Decision Trees section.
plt.figure(figsize = (20,10))
plot_tree(forest.estimators_[0], filled = True, max_depth = 3)
plt.show()

## **Support Vector Machine**

In [None]:
# Start the Support Vector Machine section from the untransformed cleaned data.
df = df_cleaned.copy()

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

# Library to use for the Hyperparameters for the Support Vector Regressor
from sklearn.model_selection import RandomizedSearchCV

# Checking for the performance metrics of the support vector regressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [None]:
# Predictors are every column except MEDV; the target is MEDV.
df_x = df.drop(['MEDV'], axis = 1)
df_y = df['MEDV']

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so the split, and
# therefore the reported metrics, are the same on every run.
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size = 0.20, random_state = 100)

In [None]:
# Hyperparameter search space for the Support Vector Regressor
hyper_parameters = RandomizedSearchCV(
    estimator=SVR(), # estimator whose hyperparameters are searched
    param_distributions={ # candidate values the search samples from
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto']
    }
)

# The search itself is currently disabled; uncomment the two lines below to run it.
#hyper_parameters.fit(x_train, y_train)
#print(hyper_parameters.best_params_)

In [None]:
# Support Vector Regressor with fixed hyperparameters; the search that would
# select them is commented out in this notebook.
svm_reg = SVR(kernel = 'linear', C = 10, gamma = 'scale')

In [None]:
# Fit the Support Vector Regressor on the training split.
svm_reg.fit(x_train, y_train)

In [None]:
# Predict with the Support Vector Regressor. The previous code called `reg`,
# which is the decision tree from an earlier section, so the metrics below did
# not describe the SVR at all.
# Raw predictions are kept for the metric cells; rounding is for display only.
y_predicted = svm_reg.predict(x_test)
np.round(y_predicted, 1)

In [None]:
# Actual test-set targets rounded to one decimal, for a visual comparison
# with the predictions.
np.round(np.array(y_test), 1)

In [None]:
# Mean squared error between the test targets and y_predicted.
mean_squared_error(y_test, y_predicted)

In [None]:
# Coefficient of determination (R^2) between the test targets and y_predicted.
r2_score(y_test, y_predicted)