In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
kaggle_input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in kaggle_input_files:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **House Price Prediction**

In [None]:
# importing all the required libraries
import numpy as np
import pandas as pd
from scipy.stats import skew
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the Boston house-price CSV into the notebook's working DataFrame.
BOSTON_CSV_PATH = '../input/the-boston-houseprice-data/boston.csv'
df = pd.read_csv(BOSTON_CSV_PATH)

In [None]:
# previewing the first rows of the freshly loaded data
df.head()

In [None]:
# peeking at the different column labels in the dataset

df.columns

In [None]:
# checking the size of the dataset; the result is a (rows, columns) tuple

df.shape 

In [None]:
# counting the rows that are exact duplicates of an earlier row

df.duplicated().sum() 

In [None]:
# getting the summary information (dtypes, non-null counts) of the dataframe

df.info() 

In [None]:
# counting the null values in each column

df.isnull().sum()

## **Checking for Correlation**

In [None]:
# Pairwise Pearson correlation coefficients between all columns.

correlation = df.corr(method='pearson')
correlation

In [None]:
# Correlation of every column with the target (MEDV), strongest positive first.

medv_correlation = correlation.loc[:, "MEDV"]
medv_correlation.sort_values(ascending=False)

> ***Here we can see that RM, LSTAT and PTRATIO have a strong correlation with MEDV (PRICE)***

In [None]:
# The same correlation matrix, drawn as an annotated heatmap.

plt.figure(figsize=(10, 8))
sns.heatmap(data=df.corr(), cmap='coolwarm', annot=True)

In [None]:
# Pairwise scatter plots (seaborn pairplot) of the target and a few selected features.

from pandas.plotting import scatter_matrix
attributes = ["MEDV", "LSTAT", "RM", "PTRATIO", "INDUS", "ZN"]
selected_columns = df.loc[:, attributes]
sns.pairplot(data=selected_columns, kind="scatter")

> ***Here we can see that LSTAT and RM have a good relation with MEDV***

In [None]:
# Scatter plot: rooms per dwelling (RM) on x, target (MEDV) on y.

sns.scatterplot(data=df, x="RM", y="MEDV")

In [None]:
# Scatter plot: LSTAT on x, target (MEDV) on y.

sns.scatterplot(data=df, x="LSTAT", y="MEDV")

In [None]:
# looking at the first rows again before the feature-engineering steps
df.head()

In [None]:
# defining a function for plotting the histograms

def hst(dfrm, title="Housing Price"):
    """Draw a histogram of ``dfrm`` with seaborn and show it immediately.

    Parameters
    ----------
    dfrm : data accepted by ``sns.histplot(data=...)``
        In this notebook it is always a single DataFrame column.
    title : str, optional
        Text placed above the plot. Defaults to "Housing Price" so existing
        calls render exactly as before; pass the column name when plotting
        anything other than the price.

    Returns
    -------
    None
        The figure is rendered through ``plt.show()``.

    Notes
    -----
    Every call sets seaborn's palette to "pastel" via ``sns.set_palette``.
    """
    sns.set_palette("pastel")
    sns.histplot(data=dfrm)
    plt.title(title)
    plt.show()

# Feature engineering

## Checking the skewness

In [None]:
# plotting the distribution of the target column (MEDV)

hst(df["MEDV"]) 

In [None]:
# testing which transformation gives the most symmetric distribution
# (skewness closest to 0): the raw target, log(1 + x), or the square root

MEDVS = pd.DataFrame({
    'medvs': df['MEDV'],
    'log(medvs+1)': np.log1p(df['MEDV']),
    'sqrt(medvs)': np.sqrt(df['MEDV']),
})
print(MEDVS)

print('medvs skew        :',skew(MEDVS['medvs']))
print('log(medvs+1) skew :',skew(MEDVS['log(medvs+1)']))
print('sqrt(medvs) skew  :',skew(MEDVS['sqrt(medvs)']))

# One hst call per statement, in column order. Joining the calls with commas
# builds a tuple of their None results, which the cell then echoes as output.
for candidate in MEDVS.columns:
    hst(MEDVS[candidate])

>So, when is the skewness too much?  The rule of thumb seems to be:
> - If the skewness is between -0.5 and 0.5, the data are fairly symmetrical
> - If the skewness is between -1 and -0.5 or between 0.5 and 1, the data are moderately skewed
> - If the skewness is less than -1 or greater than 1, the data are highly skewed

>So from this we can conclude that taking the square root gives us a nearly symmetric distribution,
and since all the values in the MEDV column are positive, we can say that a square root transformation would be appropriate.

>Check [here](https://stats.stackexchange.com/questions/107610/what-is-the-reason-the-log-transformation-is-used-with-right-skewed-distribution) or [here](https://www.spcforexcel.com/knowledge/basic-statistics/are-skewness-and-kurtosis-useful-statistics) to get some details about the log transformation, the square root transformation and the skewness of a plot.

## Now let's find the features that have more than 0.75 skew

In [None]:
# Skewness of every feature column (target excluded), largest value first.

df1 = df.drop(columns='MEDV')
df1_skew = df1.apply(skew)
print(df1_skew.sort_values(ascending=False))

In [None]:
# Keep only the features whose skewness exceeds 0.75 (positively skewed).
# Features with skewness below -0.75 are NOT selected by this filter.

df1_skew = df1_skew.loc[df1_skew.gt(0.75)]

print('-----Skewness greater than 0.75-----')
print(df1_skew)



In [None]:
# transforming the +ve skewed features with logarithm
#
# Two guards keep this cell safe to re-run:
#   * df1_skew is swapped for its index (the column labels) only while it is
#     still the Series of skew values; an Index has no `.index` to take again.
#   * the values are always read from `df`, whose feature columns are never
#     modified, so log1p is applied exactly once instead of compounding.

if isinstance(df1_skew, pd.Series):
    df1_skew = df1_skew.index

df1[df1_skew] = np.log1p(df[df1_skew])
df1[df1_skew].head()

In [None]:
# plotting the variables CRIM, ZN, CHAS, DIS, RAD and LSTAT
# to check whether the skewness improved
# hst is called as a plain statement (no trailing comma) so the cell does not
# end by echoing a tuple of None values.

for feature in ["CRIM", "ZN", "CHAS", "DIS", "RAD", "LSTAT"]:
    hst(df1[feature])

In [None]:
# Baseline inputs: the target is MEDV, the features are every other column of df.
y = df["MEDV"]
X = df.drop(columns="MEDV")

In [None]:
# sanity check: X and y must have the same number of rows
X.shape, y.shape

# **Model** 

## Using Linear Regression

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

In [None]:
# creating the linear regression estimator (it is fitted in the next cell)

model = LinearRegression()

In [None]:
# fitting the linear regression on the training split

model.fit(X_train,y_train)

In [None]:
# scoring on the held-out split
# NOTE: for scikit-learn regressors .score() is the R^2 coefficient of
# determination, so `acr` is R^2 * 100 rather than a classification accuracy

acr = model.score(X_test,y_test)*100
acr

In [None]:
# Root Mean Squared Error of the linear model on the held-out split.
# Arguments follow the documented (y_true, y_pred) order; MSE is symmetric,
# so the value is the same either way.

y_predict = model.predict(X_test)
linear_mse = mean_squared_error(y_test, y_predict)
rmse = np.sqrt(linear_mse)
rmse

## Using Random Forest 

In [None]:
from sklearn.ensemble import RandomForestRegressor

# creating the object
# A random forest is fitted with randomness, so the seed is pinned (same
# value as the train/test split) to make the reported scores reproducible.

model1 = RandomForestRegressor(random_state=42)

In [None]:
# fitting the random forest on the training split

model1.fit(X_train,y_train)

In [None]:
# scoring the random forest on the held-out split
# NOTE: for scikit-learn regressors .score() is R^2, so `acr1` is R^2 * 100

acr1 = model1.score(X_test,y_test)*100
acr1

In [None]:
# Root Mean Squared Error of the random forest on the held-out split.
# Arguments follow the documented (y_true, y_pred) order; MSE is symmetric.

y_predict1 = model1.predict(X_test)
forest_mse = mean_squared_error(y_test, y_predict1)
rmse1 = np.sqrt(forest_mse)
rmse1

## Using Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# creating the object
# The seed is pinned (same value as the train/test split) so that repeated
# runs grow the same tree and report the same scores.

model2 = DecisionTreeRegressor(random_state=42)

In [None]:
# fitting the decision tree on the training split

model2.fit(X_train,y_train)

In [None]:
# scoring the decision tree on the held-out split
# NOTE: for scikit-learn regressors .score() is R^2, so `acr2` is R^2 * 100

acr2 = model2.score(X_test,y_test)*100
# display the value, as the linear-regression and random-forest cells do
acr2

In [None]:
# Root Mean Squared Error of the decision tree on the held-out split.
# Arguments follow the documented (y_true, y_pred) order; MSE is symmetric.

y_predict2 = model2.predict(X_test)
tree_mse = mean_squared_error(y_test, y_predict2)
rmse2 = np.sqrt(tree_mse)
rmse2

# Result/Outcome

In [None]:
# Side-by-side summary of the three baseline models, ordered by ascending score.
rslt = pd.DataFrame(
    {
        "Model used": ['Linear Regression', 'Random Forest', 'Decision Tree'],
        "RMSE": [rmse, rmse1, rmse2],
        "Accuracy(%)": [acr, acr1, acr2],
    }
)

rslt.sort_values(by="Accuracy(%)")

# Since the Random Forest algorithm gave the best results, we will now use it on our refined data

In [None]:
# the refined feature table: df without MEDV, with the skewed columns log-transformed
df1

In [None]:
# replacing the target with its square root, then re-checking skewness and histogram
# NOTE(review): this overwrites df['MEDV'] in place, so running the cell a
# second time takes the square root again; re-load `df` before re-running.
df['MEDV']=np.sqrt(df['MEDV'])
print("Skew :",skew(df['MEDV']))
hst(df['MEDV'])

In [None]:
# X1 is the refined feature table (the same object as df1, not a copy);
# y1 is df['MEDV'], which holds square-root values once the previous cell has run
X1= df1
y1 = df['MEDV']
X1.head() , y1.head()

## Performing train_test_split

In [None]:
# Same 80/20 split and seed as the baseline, applied to the refined data.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    train_size=0.8,
    random_state=42,
)

# Model

In [None]:
# creating the object for Random Forest
# The seed is pinned (same value as the train/test split) so the comparison
# against the baseline models does not change from run to run.

rf_model = RandomForestRegressor(random_state=42)

In [None]:
# fitting the random forest on the refined training split
rf_model.fit(X1_train,y1_train)

In [None]:
# R^2 (x100) of the refined random forest, measured on the original price scale.
# rf_model is trained on sqrt(MEDV), so the held-out targets and the
# predictions are squared back first; otherwise this score would not be
# comparable with acr / acr1 / acr2, which are computed on raw MEDV.
from sklearn.metrics import r2_score

acr3 = r2_score(np.square(y1_test), np.square(rf_model.predict(X1_test))) * 100
acr3

In [None]:
# RMSE of the refined random forest, reported in the original MEDV units.
# y1_test and y1_pred live on the sqrt(MEDV) scale, so both are squared back
# before the error is computed; an RMSE taken on the sqrt scale cannot be
# compared with rmse / rmse1 / rmse2 in the results table.
y1_pred = rf_model.predict(X1_test)
rmse3 = np.sqrt(mean_squared_error(np.square(y1_test), np.square(y1_pred)))
rmse3

# Outcome

In [None]:
# Final comparison of all four models, best (lowest) RMSE first.
result = pd.DataFrame(
    {
        "Model used": ['Linear Regression', 'Random Forest', 'Decision Tree', 'Random Forest Optimized'],
        "RMSE": [rmse, rmse1, rmse2, rmse3],
        "Accuracy(%)": [acr, acr1, acr2, acr3],
    }
)
result.sort_values(by="RMSE")