In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

df = pd.read_csv('Housing.csv')
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
# Data Preprocessing

# Changing all yes/no data to 0/1, normalization of price and area, and outlier removal 

for column in df:  
    df[column].replace(('yes','no'), (0,1), inplace = True)
    if(column == 'price' or column == 'area'):
        df[column] = (df[column] - df[column].mean()) / df[column].std()
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        df.drop(df[(df[column] < Q1-1.5*IQR) | (df[column] > Q3+1.5*IQR)].index, inplace = True) 
        
# Adding One Hot Encoding to furnishingstatus 

df = pd.get_dummies(df, columns = ['furnishingstatus']) 

df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
15,2.316712,0.452182,4,1,2,0,1,0,1,1,2,1,0,1,0
16,2.316712,0.741276,4,2,2,0,0,0,1,0,1,0,0,0,1
17,2.241864,1.65674,3,2,4,0,1,1,1,0,2,1,1,0,0
18,2.204439,-0.222371,3,2,2,0,0,1,1,0,2,1,1,0,0
19,2.185727,0.654548,3,2,2,0,1,1,1,0,1,0,0,1,0


In [5]:
print(df.corr().price)
print(df.info())

price                              1.000000
area                               0.533186
bedrooms                           0.335328
bathrooms                          0.466483
stories                            0.446839
mainroad                          -0.307185
guestroom                         -0.303994
basement                          -0.190117
hotwaterheating                   -0.075979
airconditioning                   -0.471706
parking                            0.321058
prefarea                          -0.280808
furnishingstatus_furnished         0.207903
furnishingstatus_semi-furnished    0.085766
furnishingstatus_unfurnished      -0.278463
Name: price, dtype: float64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 517 entries, 15 to 544
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   price                            517 non-null    float64
 1   area           

In [140]:
# Splitting Data Set for Training and Testing
    
X = df.iloc[0:, 1:]
Y = df.iloc[0:, 0:1]
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size = 0.7, random_state = 42) 
x_train = np.array(x_train)
print(x_train.dtype)

float64


In [240]:
class LinearRegression:
    def __init__(self, x, y):
        self.w = np.zeros((x.shape[1], 1))
        self.b = float(0)
        self.X = x
        self.Y = y

    def split_data(self, train_size = 0.75, rand_state = 42):
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.X, self.Y, train_size=train_size, random_state= rand_state) 
        self.x_train = (np.array(x_train))
        self.x_test = (np.array(x_test))
        self.y_test = (np.array(y_test))
        self.y_train = (np.array(y_train))
        self.m = self.x_train.shape[0]

    def get_weights(self):
        return self.w, self.b
    
    def gradient_propogator(self):
        self.A = self.x_train.dot(self.w) + self.b
        
        self.cost = np.sum((self.A - self.y_train) ** 2) / (2 * self.m)
        self.db = np.sum(self.A - self.y_train)/self.m
        self.dw = self.x_train.T.dot(self.A - self.y_train)/self.m
        
    def gradient_optimizer(self, learning_rate = 0.001, epochs = 1000):
        for self.i in range(epochs):
            self.gradient_propogator()
            self.w = self.w - (learning_rate * self.dw)
            self.b = self.b - (learning_rate * self.db)
            if(self.i % 100 == 0):
                print("Cost after %i epochs: %f" %(self.i, self.cost))
    def predict(self):
        pass


In [241]:
model = LinearRegression(X, Y)
model.split_data()
model.gradient_optimizer(0.001, 20000)
w, b = model.get_weights()
print(w, b)

Cost after 0 epochs: 0.344970
Cost after 100 epochs: 0.312023
Cost after 200 epochs: 0.286170
Cost after 300 epochs: 0.264343
Cost after 400 epochs: 0.245843
Cost after 500 epochs: 0.230132
Cost after 600 epochs: 0.216762
Cost after 700 epochs: 0.205358
Cost after 800 epochs: 0.195610
Cost after 900 epochs: 0.187256
Cost after 1000 epochs: 0.180079
Cost after 1100 epochs: 0.173897
Cost after 1200 epochs: 0.168557
Cost after 1300 epochs: 0.163931
Cost after 1400 epochs: 0.159913
Cost after 1500 epochs: 0.156412
Cost after 1600 epochs: 0.153352
Cost after 1700 epochs: 0.150669
Cost after 1800 epochs: 0.148309
Cost after 1900 epochs: 0.146227
Cost after 2000 epochs: 0.144383
Cost after 2100 epochs: 0.142747
Cost after 2200 epochs: 0.141288
Cost after 2300 epochs: 0.139985
Cost after 2400 epochs: 0.138816
Cost after 2500 epochs: 0.137765
Cost after 2600 epochs: 0.136817
Cost after 2700 epochs: 0.135958
Cost after 2800 epochs: 0.135179
Cost after 2900 epochs: 0.134470
Cost after 3000 epochs

In [239]:
# Using Scikit Learn 
regr = LinearRegression()
regr.fit(x_train, y_train)
print(regr.score(x_test, y_test))

TypeError: __init__() missing 2 required positional arguments: 'x' and 'y'

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error
y_pred = regr.predict(x_test)
mae = mean_absolute_error(y_true=y_test,y_pred=y_pred)
mse = mean_squared_error(y_true=y_test,y_pred=y_pred)
rmse = mean_squared_error(y_true=y_test,y_pred=y_pred,squared=False)
  
print("MAE:",mae)
print("MSE:",mse)
print("RMSE:",rmse)

MAE: 0.36836350579620564
MSE: 0.26553976678977165
RMSE: 0.5153055082082586
