# 2. Multivariate Regression

In [1]:
import pandas as pd
import numpy as np

from GradientDescent import BatchGD

### 1. Read the data file (`prob2data.csv`) using pandas and perform data cleaning. Remove the first column `id`, which is not needed here. Perform mean normalization of the features.

In [2]:
# Load the housing data set from prob2data.csv into a DataFrame and preview it
house_price = pd.read_csv(filepath_or_buffer="prob2data.csv")
house_price.head()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,floors,yr_built
0,7129300520,221900.0,3,1.0,1180,1.0,1955
1,6414100192,538000.0,3,2.25,2570,2.0,1951
2,5631500400,180000.0,2,1.0,770,1.0,1933
3,2487200875,604000.0,4,3.0,1960,1.0,1965
4,1954400510,510000.0,3,2.0,1680,1.0,1987


In [3]:
# Remove the "id" column; it is not used as a feature in this notebook.
# Rebinding the name (instead of inplace=True) together with errors="ignore"
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises KeyError.
house_price = house_price.drop(columns="id", errors="ignore")
house_price.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,floors,yr_built
0,221900.0,3,1.0,1180,1.0,1955
1,538000.0,3,2.25,2570,2.0,1951
2,180000.0,2,1.0,770,1.0,1933
3,604000.0,4,3.0,1960,1.0,1965
4,510000.0,3,2.0,1680,1.0,1987


**Mean Normalization**

Mean normalization is a technique to standardize the range of independent variables or features of data. In data processing, it is a form of **feature scaling**. In this technique, we subtract the column mean from each value of the column and then divide the result by the range of the column (its maximum minus its minimum).

**Formula**

$$x_{i} := \frac{x_{i} - \mu}{\max(x) - \min(x)}$$

where,

$x_{i}$ = ith value of the column

$\mu$ = mean of the column


In [4]:
# Separate the target (price) from the features before normalizing, so that
# only the feature columns are scaled.
# The guard keeps the cell safe to re-run: once "price" has been split off,
# a second execution leaves both objects untouched instead of raising KeyError.
if "price" in house_price.columns:
    price = house_price["price"]
    house_price = house_price.drop(columns="price")

In [5]:
# Per-column average of the feature frame (used later to centre each column)
means = house_price.mean(axis=0)
means

bedrooms          3.370842
bathrooms         2.114757
sqft_living    2079.899736
floors            1.494309
yr_built       1971.005136
dtype: float64

In [6]:
# Per-column extremes; their difference is the denominator of the normalization
maxs, mins = house_price.max(), house_price.min()

In [7]:
# Mean normalization in one chained expression:
# centre every column on its mean, then scale by the column range (max - min)
house_price = house_price.sub(means).div(maxs - mins)

In [8]:
# Preview the first rows of the mean-normalized feature frame
house_price.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,floors,yr_built
0,-0.011238,-0.139345,-0.067917,-0.197724,-0.139175
1,-0.011238,0.016905,0.036989,0.202276,-0.173958
2,-0.041541,-0.139345,-0.09886,-0.197724,-0.330479
3,0.019065,0.110655,-0.009049,-0.197724,-0.052219
4,-0.011238,-0.014345,-0.030181,-0.197724,0.139086


In [9]:
# Re-attach the (un-normalized) target as the last column of the feature frame
house_price = house_price.assign(price=price)

In [10]:
# Preview the frame again now that the price column has been re-attached
house_price.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,floors,yr_built,price
0,-0.011238,-0.139345,-0.067917,-0.197724,-0.139175,221900.0
1,-0.011238,0.016905,0.036989,0.202276,-0.173958,538000.0
2,-0.041541,-0.139345,-0.09886,-0.197724,-0.330479,180000.0
3,0.019065,0.110655,-0.009049,-0.197724,-0.052219,604000.0
4,-0.011238,-0.014345,-0.030181,-0.197724,0.139086,510000.0


In [11]:
# Split the frame into the target vector and the feature matrix
y = house_price["price"]  # target: house price
X = house_price.drop("price", axis=1)  # features: every column except price

In [19]:
# Configure the project-local batch gradient descent model.
# NOTE(review): alpha is presumably the learning rate and bias=True presumably
# adds an intercept term -- confirm against GradientDescent.BatchGD.
batch_gd = BatchGD(
    alpha=0.001,
    max_iter=50_000,
    bias=True,
)

In [20]:
# Train the model on the normalized features X and the raw prices y.
# NOTE(review): BatchGD is project-local; presumably this runs batch gradient
# descent for up to max_iter iterations -- confirm in GradientDescent.py.
batch_gd.fit(X,y)

In [21]:
# Inspect the learned parameters. The array holds six values for five features,
# so one entry is presumably the bias/intercept (likely the first) -- TODO confirm.
batch_gd.weights

array([ 540182.15879326,  104551.13375498,  597960.23159231,
        675867.4721859 ,  305728.55526567, -165437.09199299])

### 3. Predict the house price using the model, for 4 bedrooms, 2.5 bathrooms, 2570 sq. feet area, 2 floors, 2005 yr. built, and state the difference between the model prediction and actual value (Rs. 719000). Show in % error

- Store the query features in `x_pred` and mean-normalize them using the means and ranges (max − min) computed from the training data.

In [22]:
# Raw feature values of the house to be priced, keyed by column name so the
# arithmetic below aligns by label instead of relying on list position.
x_raw = pd.Series(
    {
        "bedrooms": 4,
        "bathrooms": 2.5,
        "sqft_living": 2570,
        "floors": 2,
        "yr_built": 2005,
    }
)
# Mean-normalize with the statistics of the training data; reindex guarantees
# the features stay in the same column order the model was trained on.
x_pred = ((x_raw - means) / (maxs - mins)).reindex(means.index)
x_pred

bedrooms       0.019065
bathrooms      0.048155
sqft_living    0.036989
floors         0.202276
yr_built       0.295608
dtype: float64

In [23]:
# Actual price of the query house, as given in the problem statement
actual_price = 719_000
# Model estimate for the normalized query features
pred_price = batch_gd.predict(x_pred)

In [24]:
# Display the price returned by batch_gd.predict for the query house
pred_price

array([608907.12852954])

In [25]:
# Signed relative error of the prediction, as a percentage of the actual price
# (positive when the model under-predicts)
(actual_price - pred_price) / actual_price * 100

array([15.31194318])

- Hence, the model under-predicts the actual price by about 15.31%.