In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency
import seaborn as sns

In [None]:
# Switch the working directory to the project folder so the relative CSV path used later resolves there.
# NOTE(review): this is a machine-specific absolute path; the cell fails on any other
# machine. Consider a configurable directory relative to the notebook instead.
os.chdir("C:/Users/Dell/BusinessAnalyst/1.Boston_Housing_Price")

In [None]:
# Display the current working directory to confirm where relative paths will resolve.
os.getcwd()

In [None]:
# Load the dataset from Boston.csv (path is relative to the current working directory).
data = pd.read_csv("Boston.csv")

In [None]:
# Preview the first 10 rows of the loaded frame.
data.head(10)

# Missing Value Analysis

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV; confirm).
# errors='ignore' keeps this cell re-runnable: once the column is gone, running the
# cell again is a no-op instead of raising KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Inspect the dtype of every column.
data.dtypes

In [None]:
# Count the null entries in each column and keep the counts as a one-column frame
missing_value = data.isna().sum().to_frame()
missing_value

#### The dataset has no missing values.

# Outlier Analysis

In [None]:
# Outlier analysis works on a separate copy, so `data` itself is not modified by it.
df = data.copy()

In [None]:
# Mark 'Chas' as categorical on `data`. The `df` copy was taken before this
# conversion, so `df` keeps the column's original dtype.
data['Chas'] = data['Chas'].astype('category')

In [None]:
#plotting boxplot
%matplotlib inline
for i in df:
    print(i)
    plt.boxplot(df[i])
    plt.show()

In [None]:
# Keep the column labels of `data`; later cells loop over them.
cnames = data.columns
cnames

In [None]:
#detect and delete outliers
def removeOutliers(data, cnames):
    """Remove rows that are IQR outliers in any of the given columns.

    Columns are processed in the order given. For each one, the 25th and 75th
    percentiles are computed on the rows that are still present (i.e. after
    the filtering done for the previous columns), and every row whose value
    lies below q25 - 1.5*IQR or above q75 + 1.5*IQR is removed.

    Rows are filtered with a boolean mask rather than by index label, so only
    the offending rows are removed even when index labels repeat.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    cnames : iterable of column labels
        Numeric columns to check.

    Returns
    -------
    pandas.DataFrame
        The rows of `data` that survived every column's filter.
    """
    for column in cnames:
        print(column)
        values = data.loc[:, column]
        q75, q25 = np.percentile(values, [75, 25])
        iqr = q75 - q25

        # Named bounds instead of `min`/`max`, which would shadow the builtins.
        lower_bound = q25 - (iqr * 1.5)
        upper_bound = q75 + (iqr * 1.5)

        is_outlier = (values < lower_bound) | (values > upper_bound)
        data = data.loc[~is_outlier]
    return data

In [None]:
# Filter outliers across every column in cnames. Because df is rebound to the
# result, re-running this cell filters the already-filtered frame again.
df = removeOutliers(df, cnames)

In [None]:
# Show the (rows, columns) shape of the filtered frame.
df.shape

#### Outliers are removed. 

# Feature Selection

In [None]:
# Correlation analysis input: every column except 'MedianValue', selected with a
# boolean mask over cnames.
# NOTE(review): this slices `data`, not the outlier-filtered `df`; confirm that is intended.
df_corr = data.iloc[:, cnames != 'MedianValue']

In [None]:
#set the width and height of plot
f, ax = plt.subplots(figsize=(20,10))

#generate correlation
corr = df_corr.corr()

#plot using seaborn
# The mask is all False, so every cell is drawn. The builtin `bool` is used as the
# dtype because the `np.bool` alias no longer exists in current NumPy releases.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True), square=True, annot=True, ax=ax)
ax.set_title('Correlation matrix of the predictor columns');

In [None]:
# Print every pair of distinct columns whose correlation is above 0.8.
# Each unordered pair is visited once, so a pair is not reported twice
# as (a, b) and again as (b, a).
corr_columns = list(corr.columns)
for position, row in enumerate(corr_columns):
    for col in corr_columns[position + 1:]:
        if corr.loc[row, col] > 0.8:
            print(row, '-', col, '=', corr.loc[row, col])
            

#### Column pairs with a positive correlation greater than 0.8
     Tax - Radial
#### Since Radial is highly correlated with Tax, we will drop Radial

In [None]:
# Drop 'Radial' from the outlier-filtered frame.
# errors='ignore' keeps this cell re-runnable: a second run is a no-op
# instead of raising KeyError because the column is already gone.
df = df.drop(columns='Radial', errors='ignore')

In [None]:
# Names of the categorical variables
cat_names = ["Chas"]

In [None]:
# Chi-square test of independence for each categorical variable; prints the p-value.
# NOTE(review): cat_names contains only 'Chas', so the crosstab below compares
# 'Chas' with itself and the result carries no information about any other
# column. Confirm which variable was meant on the left-hand side.
for i in cat_names:
    print(i)
    chi2, p, dof, ex = chi2_contingency(pd.crosstab(data['Chas'],data[i]))
    print(p)

# Feature Scaling

In [None]:
# NOTE(review): this rebinds df to a fresh copy of `data`, discarding the
# outlier-filtered frame and the 'Radial' drop made on df in earlier cells.
# Confirm that is intended.
df=data.copy()

In [None]:
# Checking for normality: one full-size histogram per column in cnames.
# Each column gets its own figure and a title, instead of being drawn into the
# second slot of an otherwise empty 1x2 grid.
for column_name in cnames:
    hist_fig, hist_ax = plt.subplots()
    hist_ax.hist(df[column_name])
    hist_ax.set(title=column_name, xlabel=column_name, ylabel='Frequency')
    plt.show()
    # Release the figure so a long column list does not accumulate open figures.
    plt.close(hist_fig)

#### Since the values are continuous we use Normalisation

In [None]:
# Labels of the columns at positions 0-12 of `data`; the same positional slice is
# used for the feature matrix when the data is split.
cnames = data.iloc[:,0:13].columns

In [None]:
# Leave the categorical 'Chas' column out of the list of columns to be scaled.
# Re-running this cell alone raises KeyError once 'Chas' is already removed.
cnames = cnames.drop('Chas')

In [None]:
# Min-max normalisation: rescale each column in cnames to (x - min) / (max - min),
# writing the result back into `data`
column_minimum = data[cnames].min()
column_span = data[cnames].max() - column_minimum
data[cnames] = (data[cnames] - column_minimum) / column_span

In [None]:
# Preview the first 5 rows after scaling.
data.head(5)

##### Here all the values are in the range between 0 to 1

In [None]:
#dividing data to train and test
# Features are the columns at positions 0-12; the target is the column at position 13.
X = data.iloc[:, 0:13]
Y = data.iloc[:, 13]

# A fixed random_state makes the 80/20 split, and every score computed from it,
# identical on each run of the notebook.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression model on the training split
regressor = LinearRegression().fit(x_train, y_train)
# Predict the target for the held-out test split
y_pred = regressor.predict(x_test)

# NOTE(review): scikit-learn documents .score() on regressors as the coefficient of
# determination (R^2), not a classification accuracy, so the "Accuracy" wording
# in the messages below is loose. Confirm the intended label.
train_accuracy = regressor.score(x_train, y_train)
test_accuracy = regressor.score(x_test, y_test)

# Scores are multiplied by 100 to print them as percentages
print('Training Accuracy is ', train_accuracy*100)
print('Testing Accuracy is ', test_accuracy*100)

In [None]:
# Put the actual and predicted test values side by side.
# Note: this rebinds `df`, replacing the frame the name referred to before.
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred.flatten()})

In [None]:
# Bar chart of actual vs predicted values for the first 10 rows
df1 = df.head(10)
bar_ax = df1.plot(kind='bar', figsize=(16,8))
# Solid lines for the major grid, dotted lines for the minor grid
bar_ax.grid(which='major', linestyle='-', linewidth='0.5')
bar_ax.grid(which='minor', linestyle=':', linewidth='0.5')
plt.show()

In [None]:
def MAPE(y_test, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Both inputs are converted to flat float arrays before the arithmetic, so:
    - values are compared by position (two pandas Series with different
      indexes are not aligned by label, which would otherwise produce NaN);
    - a column-vector prediction of shape (n, 1) is compared element by
      element instead of broadcasting to an (n, n) matrix;
    - plain Python lists are accepted.

    Parameters
    ----------
    y_test : array-like
        Actual values. A zero here makes the corresponding ratio infinite or
        NaN, and that propagates into the result.
    y_pred : array-like
        Predicted values, in the same order as `y_test`.

    Returns
    -------
    float
        mean(|y_test - y_pred| / |y_test|) * 100
    """
    actual = np.asarray(y_test, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    return mape

In [None]:
# MAPE (in percent) of the current y_pred against the test targets.
MAPE(y_test, y_pred)

# Decision Tree Regressor Method

In [None]:
from sklearn import tree
# Fit a decision tree regressor on the training split. A fixed random_state makes
# the fitted tree, and therefore the scores below, the same on every run.
decision_regressor = tree.DecisionTreeRegressor(random_state=42).fit(x_train, y_train)
# Predict on the held-out test split (this rebinds y_pred, replacing the earlier predictions)
y_pred = decision_regressor.predict(x_test)

# .score() is evaluated on both splits so the train and test results can be compared
train_accuracy = decision_regressor.score(x_train, y_train)
test_accuracy = decision_regressor.score(x_test, y_test)

print('Training Accuracy is ', train_accuracy*100)
print('Testing Accuracy is ', test_accuracy*100)

#### Done