# developed by: Vanshika Maithani
# 4th Semester
# Graphic Era Hill University
# Topic : House Price Prediction

# IMPORTING REQUIRED PACKAGES

In [1]:
import pandas as pd 
import numpy as np  
import seaborn as sb   #for graphs,plots
import matplotlib.pyplot as plt  #data visualization
# sklearn/ scikit-learn : implement various machine learning models for regression, classification, clustering
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler  
from sklearn.linear_model import LinearRegression  
from sklearn.tree import DecisionTreeRegressor 
from sklearn.neighbors import KNeighborsRegressor   
from sklearn.ensemble import RandomForestRegressor  
from sklearn import metrics   # For evaluating model performance

# LOADING DATASET

- The dataset is California Housing Prices Data (5 new features!).  
- Median house prices for California districts derived from the 1990 census.
- It is a modified version of the California Housing Data used in the paper Pace, R. Kelley, and Ronald Barry.
- Loading the dataset with the help of pandas.

In [None]:
# Read the CSV file (resolved relative to the current working directory)
# into a pandas DataFrame named `dataset`.
dataset = pd.read_csv("California_Houses.csv")

# TAKING A LOOK AT THE DATASET

In [None]:
# Display the first rows of the DataFrame (head() defaults to 5 rows).
dataset.head()

In [None]:
# (number of rows, number of columns) of the DataFrame.
dataset.shape

__We can see that each row has 14 different attributes:__

    -> Median_House_Value        :  It has median house value within a block (measured in US Dollars).
    -> Median_Income             :  It describes the median income for households (measured in tens of thousands of US Dollars).
    -> Median_Age                :  It describes the median age of a house within a block.
    -> Tot_Rooms                 :  It describes the total number of rooms in a block.
    -> Tot_Bedrooms              :  It describes the total number of bedrooms in a block.
    -> Population                :  It shows the total number of people residing within a block.
    -> Households                :  It describes the total number of households (a household is a group of people residing within a home unit) in a block.
    -> Latitude                  :  A measure of latitude for a house.
    -> Longitude                 :  A measure of longitude for a house.
    -> Distance_to_coast         :  Distance to the nearest coast point.
    -> Distance_to_LA            :  Distance to the centre of Los Angeles.
    -> Distance_to_SanDiego      :  Distance to the centre of San Diego.
    -> Distance_to_SanJose       :  Distance to the centre of San Jose.
    -> Distance_to_SanFrancisco  :  Distance to the centre of San Francisco.

In [None]:
dataset.info()
# info() prints a summary of the DataFrame: the column names, the data type
# of each column, the number of non-null values per column, and the memory usage.

__We can see that:__
- There are 20640 instances in the dataset.
- There are no missing values.
- All the values are numeric (float or int).

In [None]:
dataset.describe()
# describe() returns summary statistics (count, mean, std, min, quartiles, max)
# for each numeric column.

__This shows the statistical summaries of our dataset.__

In [None]:
# Draw one histogram per attribute on a 6 x 2 grid of subplots.
# Latitude and Longitude are not included in this grid.
histogram_columns = [
    "Median_House_Value",
    "Median_Income",
    "Median_Age",
    "Tot_Rooms",
    "Tot_Bedrooms",
    "Population",
    "Households",
    "Distance_to_coast",
    "Distance_to_LA",
    "Distance_to_SanDiego",
    "Distance_to_SanJose",
    "Distance_to_SanFrancisco",
]

# `fig` is the whole figure (20 in wide, 40 in tall); `axis` is a 2-D array
# holding the 6 x 2 individual subplots.
fig, axis = plt.subplots(6, 2, figsize=[20, 40])

# axis.flat walks the grid row by row, so each column lands in the same cell
# as with explicit indices: (0,0), (0,1), (1,0), (1,1), ...
for column, subplot in zip(histogram_columns, axis.flat):
    sb.histplot(dataset, x=column, ax=subplot)


In [None]:
# Scatter plot of every row's geographical coordinates:
# Longitude on the x-axis, Latitude on the y-axis.
sb.scatterplot(data=dataset, x="Longitude", y="Latitude")










__This is the graphical visualisation of the dataset attributes.__
- We observe that the attributes have very different scales so we need to perform feature scaling.
- Many histograms are tail heavy i.e. they extend much farther to the right of the median than to the left. This may make it a bit harder for some Machine Learning algorithms to detect patterns. We need to transform these to have more bell-shaped distributions.

# CORRELATION MATRIX

In [None]:
# Pairwise Pearson correlation between all columns; values lie in [-1, 1],
# where the sign gives the direction of the linear relationship.
matrix = dataset.corr(method="pearson")

# `fig` is the whole figure and `ax` is the single axis the heatmap is drawn on.
fig, ax = plt.subplots(figsize=(15, 10))

# The heatmap shows how strongly each pair of variables is related.
# A diverging colormap on a fixed, symmetric [-1, 1] range is requested
# explicitly so that positive correlations appear in warm colours (red),
# negative correlations in cool colours (blue), and values near zero stay
# neutral. Without `cmap`, seaborn falls back to a sequential palette in which
# the sign of a correlation cannot be read from the colour.
sb.heatmap(
    matrix,
    annot=True,
    linewidths=0.5,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    ax=ax,
)

# SPLITTING DATA FOR TRAINING / TESTING

In [None]:
# Separate the target (label) column from the input features.
dataset_labels = dataset["Median_House_Value"].copy()
dataset_features = dataset.drop(columns=["Median_House_Value"])

# Keep 70% of the rows for training and the remainder for testing.
# random_state=0 fixes the shuffle so the split is the same on every run.
(x_data_train, x_data_test,
 y_data_train, y_data_test) = train_test_split(
    dataset_features,
    dataset_labels,
    train_size=0.7,
    random_state=0,
)

In [None]:
# Features of the test split (the rows not used for training; train_size=0.7).
x_data_test

In [None]:
# Labels (Median_House_Value) of the training split (train_size=0.7).
y_data_train

In [None]:
# Labels (Median_House_Value) of the test split (the rows not used for training).
y_data_test

# APPLYING FEATURE SCALING

In [None]:
# Standardise the features so that each column has zero mean and unit
# variance. The scaler is fitted on the training split only and then applied
# unchanged to the test split, so no test-set statistics leak into training.
sc = StandardScaler()

# Build new DataFrames from the scaled arrays instead of writing the float
# results back in place with `.iloc[:, :] = ...`. In-place assignment into
# the frames returned by train_test_split depends on pandas silently
# upcasting integer columns, which is deprecated for incompatible dtypes.
# Column names and row index are kept, so later cells see the same layout.
x_data_train = pd.DataFrame(
    sc.fit_transform(x_data_train),
    columns=x_data_train.columns,
    index=x_data_train.index,
)
x_data_test = pd.DataFrame(
    sc.transform(x_data_test),
    columns=x_data_test.columns,
    index=x_data_test.index,
)

In [None]:
# Summary statistics of the training features after scaling.
x_data_train.describe()

# APPLYING ALGORITHMS

## Linear Regression

In [None]:
# Ordinary least-squares linear regression on the scaled training features.
linear_model = LinearRegression()

# Fit the model to the training data.
linear_model.fit(x_data_train, y_data_train)

# Predict house values for the test features; evaluated in the next cell.
# (A stray `s` statement that raised NameError here has been removed.)
predicted_price = linear_model.predict(x_data_test)

__Now our model has been trained on the training dataset.__ <br>
__Predicting and evaluating the performance of the model.__

In [None]:
# Compare the current `predicted_price` with the true test labels.
# The MSE is computed once and reused for the RMSE.
abs_error = metrics.mean_absolute_error(y_data_test, predicted_price)
sq_error = metrics.mean_squared_error(y_data_test, predicted_price)

print('Mean Absolute Error:', abs_error)
print('Mean Squared Error:', sq_error)
print('Root Mean Squared Error:', np.sqrt(sq_error))

__The lower the values of these metrics, the better the performance of the algorithm.__ <br>
__Let us move to other algorithms.__

## Decision Tree Regression

In [None]:
# Decision tree regressor. random_state is fixed (matching the value used for
# the train/test split) so the tree, and therefore the metrics compared in the
# conclusion, are the same on every run.
dt_model = DecisionTreeRegressor(random_state=0)

# Fit the decision tree to the training data.
dt_model.fit(x_data_train, y_data_train)

# Predict target values for the test features with the fitted tree.
predicted_price = dt_model.predict(x_data_test)

In [None]:
# Compare the current `predicted_price` with the true test labels.
# The MSE is computed once and reused for the RMSE.
abs_error = metrics.mean_absolute_error(y_data_test, predicted_price)
sq_error = metrics.mean_squared_error(y_data_test, predicted_price)

print('Mean Absolute Error:', abs_error)
print('Mean Squared Error:', sq_error)
print('Root Mean Squared Error:', np.sqrt(sq_error))

## K-Nearest Neighbors Regression

In [None]:
# K-nearest-neighbours regressor with scikit-learn's default settings.
# fit() returns the estimator itself, so creation and fitting are chained.
knn_model = KNeighborsRegressor().fit(x_data_train, y_data_train)

# Predict target values for the test features.
predicted_price = knn_model.predict(x_data_test)

In [None]:
# Compare the current `predicted_price` with the true test labels.
# The MSE is computed once and reused for the RMSE.
abs_error = metrics.mean_absolute_error(y_data_test, predicted_price)
sq_error = metrics.mean_squared_error(y_data_test, predicted_price)

print('Mean Absolute Error:', abs_error)
print('Mean Squared Error:', sq_error)
print('Root Mean Squared Error:', np.sqrt(sq_error))

__This is better than the models above, but it is still not a great score.__<br>
__The Median_House_Value ranges from 14999 to 500000, so a typical prediction error of this size is not very satisfying.__<br>
__Let us move to other algorithms.__

## Random Forest Regression

In [None]:
# Random forest regressor. random_state is fixed (matching the value used for
# the train/test split) so the forest, its metrics, and the model saved later
# are the same on every run.
rf_model = RandomForestRegressor(random_state=0)

# Fit on the plain NumPy values of the scaled training features
# (presumably so that list/array inputs can be passed to predict() later
# without feature-name warnings; confirm).
rf_model.fit(x_data_train.values, y_data_train)

# Predict target values for the scaled test features.
predicted_price = rf_model.predict(x_data_test.values)


In [None]:
# Compare the current `predicted_price` with the true test labels.
# The MSE is computed once and reused for the RMSE.
abs_error = metrics.mean_absolute_error(y_data_test, predicted_price)
sq_error = metrics.mean_squared_error(y_data_test, predicted_price)

print('Mean Absolute Error:', abs_error)
print('Mean Squared Error:', sq_error)
print('Root Mean Squared Error:', np.sqrt(sq_error))

# Lower values of MAE, MSE, and RMSE indicate better predictive accuracy

In [None]:
# Predict the house value for one hand-written sample.
# Requires the fitted random forest (rf_model), the fitted scaler (sc) and
# dataset_features from the earlier cells.

# Custom input: one row of RAW (unscaled) feature values, in the same column
# order as dataset_features. Modify as needed.
custom_input = [[8.3252,41,880,129,322,126,37.88,-122.23,9263.040773,556529.158342,735501.806984,67432.517001,21250.213767]]

# rf_model was trained on standardised features, so the raw values must go
# through the same fitted scaler first; feeding them in unscaled gives a
# meaningless prediction. The row is wrapped in a DataFrame with the training
# column names because the scaler was fitted on a DataFrame.
custom_frame = pd.DataFrame(custom_input, columns=dataset_features.columns)
scaled_input = sc.transform(custom_frame)

# Predict the target variable
predicted_value = rf_model.predict(scaled_input)

# Print the predicted value (a one-element array)
print("Predicted Value: $",predicted_value)


__This is much better.__

In [None]:
import pickle
#The pickle module in Python is used for object serialization and deserialization. 
#It allows you to save Python objects to disk in a binary format and later load them back into memory.

In [None]:
# Save the trained random forest to disk as a .pkl file.
# NOTE(review): only rf_model is saved; the fitted StandardScaler (sc) is not,
# so anything loading this file must scale its inputs the same way.

# File name the pickled model is written to.
pkl_filename = "model.pkl"

# Open the target file in write-binary mode and serialise the model into it.
with open(pkl_filename, mode='wb') as model_file:
    pickle.dump(rf_model, model_file)
    

In [None]:
# Reload the saved model and check that it still scores well on the test data.
# Only unpickle files you created yourself: pickle can execute arbitrary code.
with open(pkl_filename, mode='rb') as model_file:
    # Load the model from model.pkl into the pickle_model object.
    pickle_model = pickle.load(model_file)

# score() returns the coefficient of determination (R-squared) of the
# predictions on the scaled test features. It is not a classification
# accuracy. A higher R-squared means the model fits the data better.
score = pickle_model.score(x_data_test.values, y_data_test)
print(score)

# CONCLUSION

In this study the four machine learning regression algorithms __Linear Regression__, __Decision Tree Regression__, __K-Nearest neighbour__ and __Random forest regression__ have been compared when trained and tested with the dataset.

This has been done in order to study how __accurately__ they, as machine learning methods, predict the prices for the house pricing problem.

I have found that the __Random forest regression algorithm performs better__ at predicting house prices than the others with regard to all of the error metrics.