# Property Price Prediction Project

# Problem Objective :

The project aims at building a model of housing prices to predict median house values in California using the provided dataset. This model should learn from the data and be able to predict the median housing price in any district, given all the other metrics.

Districts or block groups are the smallest geographical units for which the US Census Bureau publishes sample data (a block group typically has a population of 600 to 3,000 people). There are 20,640 districts in the project dataset.

In [None]:
import IPython
IPython.display.Image('https://i-media.vyaparify.com/vcards/blogs/95898/Buysell1.jpg')

# Step 1: Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score #root_mean_squared_error
from sklearn.datasets import fetch_california_housing

warnings.filterwarnings('ignore')
print('Modules Loaded Successfully!!')

# Step 2: Load Data and Create DataFrame

In [None]:
# Internet Required For This Code to Run
data_dict = fetch_california_housing()

data_dict.keys()

In [None]:
print(data_dict['feature_names'])

In [None]:
print(data_dict['target_names'])

In [None]:
df = pd.DataFrame(data_dict['data'],
                  columns = data_dict['feature_names'])

df['MedHouseVal'] = data_dict['target']


print('Data Loaded Successfully!!')

# Step 3: Understanding Data using EDA

In [None]:
# Shape
df.shape

In [None]:
# info
df.info()

In [None]:
# Checking Null values
df.isna().sum()

In [None]:
print(data_dict['DESCR'])

In [None]:
# MedHouseVal is expressed in hundreds of thousands of dollars ($100,000), i.e. 1 lakh dollars per unit
# AveOccup is the average number of household members
df.sample()

In [None]:
# All data Must be in Numerical, Dataset contains all values in numerical
# We can proceed this for Analysis

In [None]:
# Checking data Distribution

sns.pairplot(data = df,corner=True)
plt.show()

In [None]:
# Checking data Distribution

plt.title('Features vs target Corr')
sns.heatmap(df.corr().round(2), annot = True,cmap = 'mako')
# cmap = color_map ( -1 to 1)
plt.show()

In [None]:
# Data Describe()
df.describe()

In [None]:
plt.figure(figsize = (15,12))
for i,j in enumerate(df.columns):
  plt.subplot(3,3, i+1)
  plt.hist(df[j],color = 'blue', alpha = 0.3)
  plt.title(j+' Analysis')
plt.show()

In [None]:
# Distribution of target Value MedHousevalue vs MedInc

sns.jointplot(data = df, x = 'MedHouseVal', y = 'MedInc')
plt.show()

In [None]:
# sns.regplot(data = df, x = 'MedHouseVal', y = 'MedInc')
# plt.show()

In [None]:
df['MedHouseVal'].describe()
# 75 % of price less than 2.64 lakh dollars

# Step 4: Feature Engineering and Preprocessing

In [None]:
# Bring every feature onto the same 0-1 scale before modelling.
# Normalization is done with MinMaxScaler.
from sklearn.preprocessing import MinMaxScaler


# Features: every column except the last one (MedHouseVal was appended last).
X = df.iloc[:,:-1]
# Target: median house value.
y = df['MedHouseVal']



scaler = MinMaxScaler()

# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split, so the test rows influence the learned min/max
# (data leakage). Consider fitting on the training split only.
scaler.fit(X)  # learn the per-column min and max
X_scaled = scaler.transform(X)

print('Done')

In [None]:
print(X_scaled.min())
print(X_scaled.max())

# Step 5: Train test Split: Split Data for Training and testing Part

In [None]:
# Train test Split: Divide into train Part Test Part
# train_test_split: Func: divide


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled,y,test_size=.2, random_state= 150 )
print('Done')

In [None]:
b,a,c,d = [23,534,6,65] # Unpacking
print(a)

In [None]:
import random
random.seed(56)
random.randint(1,50)

In [None]:
print('Shape of Xtrain',X_train.shape)
print('Shape of X_test',X_test.shape)
print('Shape of y_train',y_train.shape)
print('Shape of y_test',y_test.shape)


In [None]:
X_scaled.shape

In [None]:
20640*.8

In [None]:
df.shape

# Step 6: Model Building

###### Step 6.1 Linear Model Using Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression() # Object Creation

lr_model.fit(X_train,y_train) # Trainig Model

###### Step 6.1.2: Model prediction

In [None]:
# Test data: X_test

y_pred = lr_model.predict(X_test)
print('Done')

In [None]:
# 20 %: Actual answer: y_test, Predcited: y_pred

lr_compare_df = pd.DataFrame({'Actual House price':y_test,
             'Predicted House price':y_pred})

print('Done')

In [None]:
lr_compare_df

In [None]:
lr_mae = mean_absolute_error(y_test,y_pred)
print('Lr MAE',lr_mae)

In [None]:
lr_mse = mean_squared_error(y_test,y_pred)
print('Lr MSE',lr_mse)

In [None]:
lr_rmse = lr_mse**.5

print('Lr RMSE',lr_rmse)

In [None]:
# Trainig Score
lr_train_score = lr_model.score(X_train,y_train)
print('Training Score',lr_train_score)

# testing Score
lr_test_score = lr_model.score(X_test,y_test)
print('testing Score',lr_test_score)


In [None]:
# Model score: approx. 60%. The model is not very good, so it needs more parameter tuning
# or an alternate model.
# Possible cause: high feature-vs-feature correlation among the 8 X columns.

# Variance Inflation Factor (VIF): if VIF > 10 we can drop that column
# Bias vs Variance trade-off: aim for the point where the bias and variance curves intersect

# Training score: High, Testing score: Low  = Overfit
# Training score: Low,  Testing score: Low  = Underfit
# Training score: Good, Testing score: Good = Balanced fit


# Principal Component Analysis (PCA) is a dimensionality reduction technique
# PCA: used to find the important features (columns), e.g. 8 columns -> 5 important ones

![mlconcepts_image5.png](attachment:9d0800a9-9f47-458b-a328-7e12c2b68c12.png)

![testset.webp](attachment:54ad481d-491d-4654-92fd-4149bd3f5e13.webp)

In [None]:
# Lasso,Ridge :  Linear regression: Balanced fit model creation

![Bias_and_variance_contributing_to_total_error.svg.png](attachment:aae01608-98d4-4f30-8ab7-6a0512f687ff.png)

###### Step 6.1.3: checking VIF and Drop column if value vif>10

In [None]:
X_train_df = pd.DataFrame(X_train, columns = data_dict['feature_names'])
X_train_df.sample()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

X_train_df['intercept']  = 1

X_train_df.sample()

In [None]:
# X_train_df.shape[1]

In [None]:
# X_train_df.columns

In [None]:
# X_train_df.values

In [None]:
# One VIF value per column of X_train_df (the features plus the constant column).
vif_value = [
    variance_inflation_factor(X_train_df.values, position)
    for position in range(X_train_df.shape[1])
]

# Assemble the report table in a single constructor call.
vif_df = pd.DataFrame({
    'Features': list(data_dict['feature_names']) + ['Intercept'],
    'VIF factor': vif_value,
})

# Largest VIF first.
vif_df.sort_values(by='VIF factor', ascending=False)

# Latitude and Longitude are highly correlated and their VIF is close to 10,
# so one of them (or both) can be dropped.

In [None]:
# X is an independent features

In [None]:
# print(['Hello'+str(i) for i in range(10)])
# # Run loop inside list:

In [None]:
col = list(X.columns)
col.remove('Latitude')

# print(col)

X_train_vif = X_train_df[col]
X_train_vif.sample()

![variance-inflation-factor.asp-Final-6cd8e4740c254821b0fa2ab057b5df88.jpg](attachment:25d879c8-fab7-46d3-928a-5ed08c635c7f.jpg)

In [None]:
def built_model(ml_model, col, **model_params):
    """Fit ``ml_model`` on the selected feature columns and report its metrics.

    The function reads the current split from the notebook-level variables
    ``X_train``, ``X_test``, ``y_train`` and ``y_test`` and the column names
    from ``data_dict['feature_names']``; re-run the split cell before calling
    it if a different split is wanted.

    Parameters
    ----------
    ml_model : class
        Estimator class (not an instance), e.g. ``LinearRegression``. It is
        instantiated inside this function.
    col : list of str
        Names of the feature columns to train on; each must be present in
        ``data_dict['feature_names']``.
    **model_params
        Optional keyword arguments forwarded to the estimator constructor
        (e.g. ``alpha=0.001`` for ``Lasso``). When omitted the estimator is
        built with its defaults, exactly as before.

    Returns
    -------
    tuple
        ``(model_matrix, machine_model)``: a one-row DataFrame of metrics
        (row label ``1``) and the fitted estimator.
    """
    feature_names = data_dict['feature_names']

    # Re-attach the column names to the split arrays so that the requested
    # subset can be selected by name.
    final_X_train = pd.DataFrame(X_train, columns=feature_names)[col]
    final_X_test = pd.DataFrame(X_test, columns=feature_names)[col]

    machine_model = ml_model(**model_params)
    machine_model.fit(final_X_train, y_train)

    model_y_pred = machine_model.predict(final_X_test)

    model_mse_error = mean_squared_error(y_test, model_y_pred)

    # For scikit-learn regressors ``score`` is R^2, so 'Model Test Score' and
    # 'R2 Score' are expected to coincide; both keys are kept for callers.
    model_metrics = {'Model Training Score': machine_model.score(final_X_train, y_train),
                     'Model Test Score': machine_model.score(final_X_test, y_test),
                     'MAE Error': mean_absolute_error(y_test, model_y_pred),
                     'MSE Error': model_mse_error,
                     'RMSE Error': model_mse_error ** .5,
                     'R2 Score': r2_score(y_test, model_y_pred)}

    model_matrix = pd.DataFrame(model_metrics, index=[1])

    return model_matrix, machine_model



In [None]:
ml_model = LinearRegression

built_model(ml_model,col)[0]

In [None]:
from sklearn.linear_model import Lasso, Ridge

In [None]:
ml_model = Lasso
# Lasso regression not giving much score, we can reject this model
built_model(ml_model,col)[0]

In [None]:
ml_model = Ridge

built_model(ml_model,col)[0]

In [None]:
print(col)

In [None]:
col = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']

In [None]:
all_model = [LinearRegression, Ridge]

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Standardise every column of X; built_model() selects the columns listed in
# `col` from the split arrays afterwards.
ss_X = sc.fit_transform(X)

# Try successive random_state values until a 90/10 split gives a Ridge test
# score of at least `target_score`.
# NOTE(review): picking the split by its test score makes the reported score
# optimistic; treat it as a demo, not as an unbiased estimate.
max_random_state = 1000  # upper bound so the search can never loop forever
target_score = 0.6

for i in range(1, max_random_state + 1):
  X_train, X_test, y_train, y_test = train_test_split(ss_X, y,random_state=i,test_size=0.1)

  temp_df,final_model = built_model(Ridge,col)
  score = temp_df['Model Test Score'].values[0]

  print('Score is:',score)
  if score >= target_score:
    # `i` is the seed that actually produced this split (it used to be
    # incremented before printing, so the reported value was off by one).
    print('Best random State',i)

    display(temp_df)
    break

  display(clear=True)
else:
  # No seed in the searched range reached the target score.
  raise RuntimeError(
      f'No random_state in 1..{max_random_state} reached a test score of {target_score}')

In [None]:
final_model

# Step 7: Model Save and Localhost Deployment

In [None]:
import pickle

# Persist the fitted Ridge model produced by the random-state search.
with open('house_price_pred_ridge_model.pkl','wb') as f:
    pickle.dump(final_model,f)

# Persist the StandardScaler.
# NOTE(review): `sc` was fitted on every column of X, whereas the model was
# trained only on the columns in `col`; a 6-value input would not match this
# scaler's expected feature count. Confirm which scaler the app should use.
with open('sc_scaler.pkl','wb') as f:
    pickle.dump(sc,f)

print('ML model and Scaller Saved Successfully!!')

In [None]:
# ML model client:
# Website: takes input values and returns the prediction
# Python website options: Django, Flask, FastAPI, Streamlit
# Streamlit: Python web-based framework for quickly building a website to test an ML model
# Streamlit = Development + Deployment (for ML engineers)

In [None]:
# pip install streamlit

In [None]:
import streamlit as st

# Title

col = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']

st.title('California Housing Price Prediction')

st.image('https://nycdsa-blog-files.s3.us-east-2.amazonaws.com/2021/03/chaitali-majumder/house-price-497112-KhCJQICS.jpg')



st.header('Model of housing prices to predict median house values in California ',divider=True)

st.subheader('''User Must Enter Given values to predict Price:
['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']''')


st.sidebar.title('Select House Features 🏠')

st.sidebar.image('https://png.pngtree.com/thumb_back/fh260/background/20230804/pngtree-an-upside-graph-showing-prices-and-houses-in-the-market-image_13000262.jpg')





In [None]:
for i in df[col]:
    min_value, max_value = df[i].agg(['min','max'])

    print('min',i,min_value)
    print('max',i,max_value)


In [None]:
# Read the dataset that the Streamlit app uses.
temp_df = pd.read_csv('california.csv')

# Per-feature minimum and maximum, taken from the CSV that was just read
# (previously the values were read from `df` while iterating `temp_df`).
for i in temp_df[col]:
    min_value, max_value = temp_df[i].agg(['min','max'])

In [None]:
import streamlit as st
import pandas as pd
import random
import time
from sklearn.preprocessing import StandardScaler
import pickle

# Feature columns shown as sliders and passed to the model, in this order.
col = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']

# ---- Page header ----
st.title('California Housing Price Prediction')

st.image('https://nycdsa-blog-files.s3.us-east-2.amazonaws.com/2021/03/chaitali-majumder/house-price-497112-KhCJQICS.jpg')

st.header('Model of housing prices to predict median house values in California ',divider=True)

st.subheader('''User Must Enter Given values to predict Price:
['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']''')

# ---- Sidebar ----
st.sidebar.title('Select House Features 🏠')

st.sidebar.image('https://png.pngtree.com/thumb_back/fh260/background/20230804/pngtree-an-upside-graph-showing-prices-and-houses-in-the-market-image_13000262.jpg')

# Dataset used for the slider ranges and for fitting the input scaler.
temp_df = pd.read_csv('california.csv')

# Fixed seed: the random default slider positions are identical on every rerun.
random.seed(52)

all_values = []

# One integer slider per feature, bounded by that feature's min/max in the CSV.
for i in temp_df[col]:
    min_value, max_value = temp_df[i].agg(['min','max'])

    var = st.sidebar.slider(f'Select {i} value', int(min_value), int(max_value),
                            random.randint(int(min_value), int(max_value)))

    all_values.append(var)

# NOTE(review): the scaler is refitted here on the CSV instead of loading the
# one saved at training time; this assumes california.csv holds the same data
# the model was trained on. Confirm.
ss = StandardScaler()
ss.fit(temp_df[col])

# Wrap the user input in a DataFrame with the same column names the scaler was
# fitted with, so scikit-learn does not warn about missing feature names.
user_input = pd.DataFrame([all_values], columns=col)
final_value = pd.DataFrame(ss.transform(user_input), columns=col)

# The pickle is a local artefact written by the training notebook; never load
# pickles from untrusted sources.
with open('house_price_pred_ridge_model.pkl','rb') as f:
    ridge_model = pickle.load(f)

price = ridge_model.predict(final_value)[0]

progress_bar = st.progress(0)
placeholder = st.empty()
placeholder.subheader('Predicting Price')

if price>0:

    # Purely cosmetic progress animation (about 5 seconds).
    for i in range(100):
        time.sleep(0.05)
        progress_bar.progress(i + 1)

    # The California housing target is expressed in units of $100,000, so the
    # prediction is converted to dollars before it is displayed (it used to be
    # labelled "Thousand Dollars", understating the price).
    body = f'Predicted Median House Price: ${price * 100_000:,.2f}'
    placeholder.empty()

    st.success(body)
else:
    body = 'Invalid House features Values'
    # Remove the "Predicting Price" banner and the idle progress bar so only
    # the warning remains on screen.
    placeholder.empty()
    progress_bar.empty()
    st.warning(body)













In [None]:
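pandas==1.5.3
streamlit==1.37.1
# random, time and pickle are part of the Python standard library; they are
# not installable packages and must not be listed as requirements:
# random
# time
# pickle
scikit-learn==1.5.1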