# Regression Analysis - Predicting House Prices

# Import the Libraries

In [None]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Global plot defaults: 8x6 figure size for matplotlib and seaborn's 'darkgrid' theme
plt.rcParams.update({'figure.figsize': [8, 6]})
sns.set_style(style='darkgrid')

# Import Data

In [None]:
# Load the house-price CSV into a DataFrame and preview the first rows
csv_path = '/Users/priyankac/Downloads/Machine learning/Maison.csv'
house = pd.read_csv(csv_path)
house.head()

# Data Transformations and Analysis

In [None]:
# Translate the original French column headers to English names.
# index=str additionally converts the row labels to strings.
english_names = {
    'PRIX': 'price',
    'SUPERFICIE': 'area',
    'CHAMBRES': 'rooms',
    'SDB': 'bathroom',
    'ETAGES': 'floors',
    'ALLEE': 'driveway',
    'SALLEJEU': 'game_room',
    'CAVE': 'cellar',
    'GAZ': 'gas',
    'AIR': 'air',
    'GARAGES': 'garage',
    'SITUATION': 'situation',
}
house = house.rename(index=str, columns=english_names)
house.head()

In [None]:
# Print a concise summary of the DataFrame: column names, non-null counts,
# dtypes and memory usage
house.info()

In [None]:
##### We can see that the data set has 546 rows and 12 columns. All the columns are of numerical type.
##### No missing values are shown.
##### We need to cross-verify this information further.

In [None]:
# Number of missing (NaN) entries in each column
house.isna().sum()


In [None]:
# Visual check for missing data: heatmap of the boolean missing-value mask
missing_mask = house.isnull()
sns.heatmap(missing_mask, cmap='Reds')

In [None]:
#### The cross-check confirms that there are no missing values

In [None]:
# Flag every row that repeats an earlier row, then show the flagged rows
is_duplicate = house.duplicated()
duplicate_rows_house = house[is_duplicate]
duplicate_rows_house


In [None]:
# One row is duplicated and needs to be dropped.
# Count the non-null values per column before dropping the duplicate row,
# so the effect of the drop can be verified afterwards
house.count()

In [None]:
# Remove repeated rows, keeping the first occurrence of each, and preview the result
house = house.drop_duplicates(keep='first')
house.head()

In [None]:
# Count the non-null values per column after dropping the duplicates
# (compare with the counts taken before the drop to confirm the row was removed)
house.count()

In [None]:
# Correlation check: pairwise Pearson correlation between all columns
corr_mat = house.corr(method='pearson')
corr_mat

In [None]:
# Draw the correlation matrix as an annotated heatmap with square cells
heatmap_kwargs = {'annot': True, 'square': True, 'cmap': 'Reds', 'fmt': '1.2f'}
sns.heatmap(corr_mat, **heatmap_kwargs)

In [None]:
#### From the correlation matrix and heatmap we can see that none of the columns are correlated

In [None]:
# Zero Variance Check: count the distinct values in each column;
# a column with only one distinct value would have zero variance
zero_var = house.nunique()
zero_var

In [None]:
#### None of the columns has just one unique value. Hence there is no zero-variance column.

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each numeric column
house.describe()

In [None]:
# Analysis on 'price' column: density histogram with a KDE curve overlaid.
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat='density' is its replacement and keeps the y-axis on the density scale.
sns.histplot(house['price'], kde=True, stat='density')
plt.show()


In [None]:
# Boxplot of 'price'. The Series is passed explicitly as x= so the box is
# drawn horizontally in every seaborn version (newer releases interpret the
# first positional argument as `data`, not `x`).
sns.boxplot(x=house['price'])
plt.show()

In [None]:
# Distribution of 'area': density histogram with a KDE curve overlaid.
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat='density' is its replacement and keeps the y-axis on the density scale.
sns.histplot(house['area'], kde=True, stat='density')

In [None]:
# Boxplot of 'area'. The Series is passed explicitly as x= so the box is
# drawn horizontally in every seaborn version (newer releases interpret the
# first positional argument as `data`, not `x`).
sns.boxplot(x=house['area'])

In [None]:
# Scatter plot of house price against area
area_values = house['area']
price_values = house['price']
plt.scatter(area_values, price_values)
plt.xlabel('area')
plt.ylabel('price')
plt.show()

In [None]:
# Pairwise relationships between 'price' and the remaining (non-area) columns:
# 'rooms','bathroom','floors','game_room','cellar','gas','air','garage','situation'
sns.set()
cols = ['price', 'rooms', 'bathroom', 'floors', 'game_room', 'cellar', 'gas', 'air', 'garage', 'situation']
# `size` was renamed to `height` in seaborn; it sets the height (in inches) of each facet.
sns.pairplot(house[cols], height=2.5)
plt.show()


In [None]:
# The describe function, boxplot and scatter plot show the presence of outliers.
# Remove them with the 1.5 * IQR rule: a value is an outlier when it lies
# below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
#
# The previous filter kept every row that had at least one value between Q1
# and Q3 in any column, which removes practically nothing and never used the
# interquartile range it computed.

# Only the columns inspected for outliers above are filtered.
# NOTE(review): the remaining columns are presumably 0/1 flags or small counts;
# an IQR of zero on such a column would mark every minority value as an
# outlier, so they are deliberately left out - confirm against the data.
outlier_cols = ['price', 'area']

# Calculate the Interquartile Range
q1 = house.quantile(0.25)
q3 = house.quantile(0.75)
inter_quartile_range = q3 - q1

lower_fence = q1 - 1.5 * inter_quartile_range
upper_fence = q3 + 1.5 * inter_quartile_range

# Keep a row only if all of the checked columns fall inside their fences
checked = house[outlier_cols]
within_fences = (checked >= lower_fence[outlier_cols]) & (checked <= upper_fence[outlier_cols])
house_new = house[within_fences.all(axis=1)]
house_new.shape


# Initiate Linear Regression Object

In [None]:
# Create the scikit-learn linear regression estimator.
# NOTE(review): `lm` is not fitted or used anywhere in the visible part of this
# file - confirm whether it is still needed.
lm = LinearRegression()

# Test Train split

In [None]:
# Separate the predictors (X) from the target (y) ahead of the train/test split
feature_names = [
    'area', 'rooms', 'bathroom', 'floors', 'driveway', 'game_room',
    'cellar', 'gas', 'air', 'garage', 'situation',
]

# Bare attribute access; it is not the last expression of the cell, so nothing is displayed
house_new.columns

X = house_new[feature_names]
y = house_new['price']

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Preview the training features; train_test_split shuffles the rows before
# splitting, so they appear in random order
X_train.head()

# Pre processing and Standardization


In [None]:
# Standardize the predictors (zero mean, unit variance). The scaler is fitted
# on the training split only, so no information from the test split leaks in.
scaler = StandardScaler()
scaler.fit(X_train)

# scaler.transform returns a bare NumPy array. Wrap the result back into a
# DataFrame so the column names and the row index survive; the later
# pd.concat with y_test needs pandas objects that share the same index.
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
import statsmodels.api as sm

In [None]:
# statsmodels' OLS does not add an intercept automatically (unlike sklearn's
# LinearRegression), so a constant column has to be added to the predictors manually.
# X holds the predictor variables and y the predicted (target) variable.
# NOTE(review): this design matrix is built from X_test, i.e. the held-out
# rows - confirm that the training split was not intended here.
# NOTE(review): in statsmodels terminology "endog" is the dependent variable
# and "exog" the predictors, so the name X_endog is misleading.

X_endog = sm.add_constant(X_test)

In [None]:
# Specify and fit the OLS model on the TRAINING split. The model used to be
# fitted on (y_test, X_test) and then evaluated on those same rows, which
# defeats the purpose of holding out a test set.
# The design matrix is built here so this cell does not depend on a
# test-based matrix created elsewhere.
X_train_const = sm.add_constant(X_train)
res = sm.OLS(y_train, X_train_const)
res.fit()

In [None]:
# Display the regression summary table for the OLS model
# (fit() is called again on the same model specification)
res.fit().summary()

In [None]:
# Fit the model, add the intercept column to the test predictors and
# predict the prices of the test rows
model = res.fit()
X_endog_test = sm.add_constant(X_test)
predictions = model.predict(X_endog_test)

In [None]:
# Display the predicted prices for the test rows
predictions

In [None]:
# Join the test predictors with the actual test prices into one dataset.
# X_test may be a bare NumPy array at this point (StandardScaler.transform
# returns one), which pd.concat rejects. Rebuilding it as a DataFrame with the
# feature names from X and the index of y_test works for both an array and
# an already-labelled DataFrame.
test_features = pd.DataFrame(X_test, columns=X.columns, index=y_test.index)
data_actual = pd.concat([test_features, y_test], axis=1)
data_actual

In [None]:

# Append the model's predictions as a named column next to the actual values.
# model.predict may return a bare NumPy array, which pd.concat cannot
# combine with a DataFrame, so the values are wrapped in a Series that shares
# data_actual's index (the predictions are in the same row order).
predicted_price = pd.Series(np.asarray(predictions), index=data_actual.index, name='predicted_price')
data_predicted = pd.concat([data_actual, predicted_price], axis=1)
data_predicted

In [None]:
# Residuals: actual price minus predicted price, stored in 'New_col'
residuals = data_predicted['price'] - predictions
data_predicted['New_col'] = residuals
data_predicted

In [None]:
# Write the actual values, predictions and residuals to a CSV file
output_path = '/Users/priyankac/Downloads/Machine learning/data_predicted.csv'
data_predicted.to_csv(output_path)