In [None]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# NOTE(review): bare expression in the middle of the cell -- nothing is assigned
# and, under default IPython settings, only a cell's last expression is echoed,
# so this line appears to have no visible effect. Confirm whether it is needed.
pio.templates

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import fetch_california_housing
# Load the dataset bundle.
# NOTE(review): despite its name, `load_boston` holds the result of
# fetch_california_housing(), i.e. the California housing data.
load_boston = fetch_california_housing()
X, y = load_boston.data, load_boston.target

# One frame holding every feature column plus the target as 'SalePrice'.
data = pd.DataFrame(X, columns=load_boston.feature_names).assign(SalePrice=y)
data.head()

In [None]:
# Show the description text that ships with the dataset bundle.
dataset_description = load_boston.DESCR
print(dataset_description)

In [None]:
# Pairwise scatter/histogram grid over every column of `data`.
pair_grid = sns.pairplot(data, height=2.5)
plt.tight_layout()

# Distribution of the target, followed by its shape statistics.
sale_price = data['SalePrice']
sns.displot(sale_price)
target_skewness = sale_price.skew()
target_kurtosis = sale_price.kurt()
print("Skewness:", target_skewness)
print("Kurtosis:", target_kurtosis)

In [None]:
# Scatter of the target against MedInc.
feature_col, target_col = 'MedInc', 'SalePrice'

fig, ax = plt.subplots()
ax.scatter(x=data[feature_col], y=data[target_col])
ax.set_ylabel(target_col, fontsize=13)
ax.set_xlabel(feature_col, fontsize=13)
plt.show()

In [None]:
# Scatter of the target against HouseAge.
feature_col, target_col = 'HouseAge', 'SalePrice'

fx, ax = plt.subplots()
ax.scatter(x=data[feature_col], y=data[target_col])
ax.set_ylabel(target_col, fontsize=13)
ax.set_xlabel(feature_col, fontsize=13)
plt.show()

In [None]:
from scipy import stats
from scipy.stats import norm , skew

# Compare the SalePrice distribution with a fitted normal curve, then check
# normality with a probability (Q-Q) plot.
(mu, sigma) = norm.fit(data['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# sns.distplot is deprecated. A density-normalised histplot with a KDE overlay,
# plus an explicitly drawn norm.pdf curve, gives the equivalent figure.
dist_fig, dist_ax = plt.subplots()
sns.histplot(data['SalePrice'], stat='density', kde=True, ax=dist_ax)
pdf_grid = np.linspace(*dist_ax.get_xlim(), 200)
dist_ax.plot(
    pdf_grid,
    norm.pdf(pdf_grid, mu, sigma),
    color='black',
    # Raw string: '\m' and '\s' are not valid escape sequences in a normal
    # string literal. The label is attached to the fitted curve itself so the
    # legend entry cannot end up on a different artist.
    label=r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma),
)
dist_ax.legend()
dist_ax.set_ylabel('Frequency')
dist_ax.set_title('SalePrice distribution')

# Q-Q plot of the same column on a fresh figure.
fig = plt.figure()
res = stats.probplot(data['SalePrice'], plot=plt)
plt.show()

In [None]:
# Log Transformation and Distribution Analysis of SalePrice
#
# The transform is computed from the untouched dataset target instead of from
# the column itself, so re-running this cell never applies log1p twice.
# This relies on `data` still having one row per dataset sample, in the
# original order (no rows are dropped between loading and this cell).
data['SalePrice'] = np.log1p(load_boston.target)

(mu, sigma) = norm.fit(data['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# sns.distplot is deprecated. A density-normalised histplot with a KDE overlay,
# plus an explicitly drawn norm.pdf curve, gives the equivalent figure.
log_fig, log_ax = plt.subplots()
sns.histplot(data['SalePrice'], stat='density', kde=True, ax=log_ax)
pdf_grid = np.linspace(*log_ax.get_xlim(), 200)
log_ax.plot(
    pdf_grid,
    norm.pdf(pdf_grid, mu, sigma),
    color='black',
    # Raw string: '\m' and '\s' are not valid escape sequences in a normal
    # string literal. The label is attached to the fitted curve itself.
    label=r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma),
)
log_ax.legend()
log_ax.set_ylabel('Frequency')
log_ax.set_title('SalePrice distribution')

# Q-Q plot of the transformed column on a fresh figure.
fig = plt.figure()
res = stats.probplot(data['SalePrice'], plot=plt)
plt.show()

In [None]:
# Data Correlation
# Pairwise correlation matrix of every column, drawn as an annotated heatmap.
cor = data.corr()

heat_fig, heat_ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds, ax=heat_ax)
plt.show()

In [None]:
# Keep the columns whose absolute correlation with SalePrice exceeds 0.5,
# then drop the target itself from that list of names.
cor_target = cor['SalePrice'].abs()
is_relevant = cor_target > 0.5
relevant_features = cor_target[is_relevant]

names = list(relevant_features.index)
names.remove('SalePrice')

print(names)
print(len(names))

In [None]:
# MODEL BUILDING
from sklearn.model_selection import train_test_split
# Separate features from the target and hold out 20% of the rows for
# evaluation; the fixed random_state keeps the split reproducible.
X = data.drop('SalePrice', axis=1)
y = data['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity checks: the two parts cover every row, and features/targets stay aligned.
assert len(X_train) + len(X_test) == len(X)
assert X_train.index.equals(y_train.index)
assert X_test.index.equals(y_test.index)

# Report the shape of each part instead of printing four whole frames/series.
for split_name, split in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print('{}: {}'.format(split_name, split.shape))

# Rich preview of the training features (last expression of the cell).
X_train.head()

In [None]:
from sklearn.linear_model import LinearRegression
# Fit ordinary least squares on the training split and predict the hold-out rows.
# Note: SalePrice was log1p-transformed in an earlier cell, so both the targets
# and the predictions below are on that transformed scale.
lr = LinearRegression()
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)

# Spot-check a single hold-out row (position 4).
sample_pos = 4
actual_values = y_test.to_list()
print("Actual value of the house", actual_values[sample_pos])
print("Predicted value of the house", predictions[sample_pos])

# Actual vs. predicted scatter for the whole hold-out set.
pred_fig, pred_ax = plt.subplots()
pred_ax.scatter(y_test, predictions)
pred_ax.set_xlabel("Actual value")
pred_ax.set_ylabel("Predicted value")
pred_ax.set_title("Actual value vs Predicted value")
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
# Hold-out error metrics. SalePrice was log1p-transformed in an earlier cell,
# so these errors are measured on the transformed scale.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print("MSE:", mse)
# Label fixed: the original printed "RSME" (typo for RMSE, and no colon).
print("RMSE:", rmse)