## Reading and Understanding the Data

In [1]:
#importing  the important libraries
import pandas as pd
import numpy as np

# Supress Warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
#reading the input data 
df=pd.read_csv('Housing.csv')
#check the data
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [3]:
df.shape

(545, 13)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [5]:
df.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [6]:
#describe the data
df.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


In [7]:
Q1 = df.price.quantile(0.25)
Q3 = df.price.quantile(0.75)
IQR = Q3 - Q1
df = df[(df.price >= Q1 - 1.5*IQR) & (df.price <= Q3 + 1.5*IQR)]

In [8]:
Q1 = df.area.quantile(0.25)
Q3 = df.area.quantile(0.75)
IQR = Q3 - Q1
df = df[(df.area >= Q1 - 1.5*IQR) & (df.area <= Q3 + 1.5*IQR)]

In [9]:
# List of variables to map

varlist =  ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']

# Defining the map function
def binary_map(x):
    return x.map({'yes': 1, "no": 0})

# Applying the function to the housing list
df[varlist] = df[varlist].apply(binary_map)

In [10]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
15,9100000,6000,4,1,2,1,0,1,0,0,2,0,semi-furnished
16,9100000,6600,4,2,2,1,1,1,0,1,1,1,unfurnished
17,8960000,8500,3,2,4,1,0,0,0,1,2,0,furnished
18,8890000,4600,3,2,2,1,1,0,0,1,2,0,furnished
19,8855000,6420,3,2,2,1,0,0,0,1,1,1,semi-furnished


In [11]:
# Get the dummy variables for the feature 'furnishingstatus' and store it in a new variable - 'status'
status = pd.get_dummies(df['furnishingstatus'], drop_first = True)

In [12]:
status.head()

Unnamed: 0,semi-furnished,unfurnished
15,1,0
16,0,1
17,0,0
18,0,0
19,1,0


In [13]:
# Add the results to the original housing dataframe

df = pd.concat([df, status], axis = 1)

In [14]:
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,semi-furnished,unfurnished
15,9100000,6000,4,1,2,1,0,1,0,0,2,0,semi-furnished,1,0
16,9100000,6600,4,2,2,1,1,1,0,1,1,1,unfurnished,0,1
17,8960000,8500,3,2,4,1,0,0,0,1,2,0,furnished,0,0
18,8890000,4600,3,2,2,1,1,0,0,1,2,0,furnished,0,0
19,8855000,6420,3,2,2,1,0,0,0,1,1,1,semi-furnished,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,1,0,1,0,0,2,0,unfurnished,0,1
541,1767150,2400,3,1,1,0,0,0,0,0,0,0,semi-furnished,1,0
542,1750000,3620,2,1,1,1,0,0,0,0,0,0,unfurnished,0,1
543,1750000,2910,3,1,1,0,0,0,0,0,0,0,furnished,0,0


In [15]:
# Drop 'furnishingstatus' as we have created the dummies for it

df.drop(['furnishingstatus'], axis = 1, inplace = True)

In [16]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
15,9100000,6000,4,1,2,1,0,1,0,0,2,0,1,0
16,9100000,6600,4,2,2,1,1,1,0,1,1,1,0,1
17,8960000,8500,3,2,4,1,0,0,0,1,2,0,0,0
18,8890000,4600,3,2,2,1,1,0,0,1,2,0,0,0
19,8855000,6420,3,2,2,1,0,0,0,1,1,1,1,0


In [17]:
df.corr()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
price,1.0,0.533186,0.335328,0.466483,0.446839,0.307185,0.303994,0.190117,0.075979,0.471706,0.321058,0.280808,0.085766,-0.278463
area,0.533186,1.0,0.127745,0.171528,0.109877,0.302116,0.213826,0.042227,-0.020489,0.26468,0.357276,0.208014,0.010588,-0.121219
bedrooms,0.335328,0.127745,1.0,0.34623,0.406684,-0.025775,0.078443,0.102261,0.042953,0.150378,0.105561,0.061014,0.056241,-0.122155
bathrooms,0.466483,0.171528,0.34623,1.0,0.311314,0.024614,0.152035,0.124691,0.046486,0.175634,0.127102,0.027817,0.024132,-0.129211
stories,0.446839,0.109877,0.406684,0.311314,1.0,0.120759,0.034301,-0.166031,0.027834,0.293854,0.022043,0.021529,-0.004171,-0.085689
mainroad,0.307185,0.302116,-0.025775,0.024614,0.120759,1.0,0.095201,0.041789,-0.019471,0.102198,0.195635,0.193613,0.011257,-0.12556
guestroom,0.303994,0.213826,0.078443,0.152035,0.034301,0.095201,1.0,0.373599,-0.022922,0.139242,0.041133,0.184999,0.007601,-0.108383
basement,0.190117,0.042227,0.102261,0.124691,-0.166031,0.041789,0.373599,1.0,-0.012428,0.034796,0.03848,0.22918,0.054119,-0.108168
hotwaterheating,0.075979,-0.020489,0.042953,0.046486,0.027834,-0.019471,-0.022922,-0.012428,1.0,-0.119731,0.05251,-0.041079,0.074875,-0.04955
airconditioning,0.471706,0.26468,0.150378,0.175634,0.293854,0.102198,0.139242,0.034796,-0.119731,1.0,0.12959,0.087044,-0.043557,-0.086983


In [18]:
#define the input [x] and the output / prediction data
#define the y first 
y=df['price']
# now the input data 
X=df.drop(['price'], axis=1)

In [19]:
train_pct_index = int(0.8 * len(X))
x_train, x_test = X[:train_pct_index].values, X[train_pct_index:].values
y_train, y_test = y[:train_pct_index].values, y[train_pct_index:].values

In [20]:
#lets check the shape of the data
x_train.shape , x_test.shape , y_train.shape , y_test.shape

((413, 13), (104, 13), (413,), (104,))

In [21]:
mean_X_train = np.mean(x_train)

In [22]:
std_X_train = np.std(x_train)

In [23]:
mean_X_test = np.mean(x_test)

In [24]:
std_X_test = np.std(x_test)

In [25]:
#Scaling the data without sklearn
x_train = (x_train - mean_X_train) / std_X_train
x_test = (x_test - mean_X_test) / std_X_test

x_train[:5] , x_test[:5]

(array([[ 3.83423196, -0.26977518, -0.27182855, -0.27114409, -0.27182855,
         -0.27251301, -0.27182855, -0.27251301, -0.27251301, -0.27114409,
         -0.27251301, -0.27182855, -0.27251301],
        [ 4.24490645, -0.26977518, -0.27114409, -0.27114409, -0.27182855,
         -0.27182855, -0.27182855, -0.27251301, -0.27182855, -0.27182855,
         -0.27182855, -0.27251301, -0.27182855],
        [ 5.54537569, -0.27045964, -0.27114409, -0.26977518, -0.27182855,
         -0.27251301, -0.27251301, -0.27251301, -0.27182855, -0.27114409,
         -0.27251301, -0.27251301, -0.27251301],
        [ 2.87599146, -0.27045964, -0.27114409, -0.27114409, -0.27182855,
         -0.27182855, -0.27251301, -0.27251301, -0.27182855, -0.27114409,
         -0.27251301, -0.27251301, -0.27251301],
        [ 4.1217041 , -0.27045964, -0.27114409, -0.27114409, -0.27182855,
         -0.27251301, -0.27251301, -0.27251301, -0.27182855, -0.27182855,
         -0.27182855, -0.27182855, -0.27251301]]),
 array([[ 3.7

In [26]:
# creating a class for Lasso Regression

class Lasso_Regression():

  #initiating the hyperparameters
  def __init__(self, learning_rate, no_of_iterations, lambda_parameter):

    self.learning_rate = learning_rate
    self.no_of_iterations = no_of_iterations
    self.lambda_parameter = lambda_parameter


  # fitting the dataset to the Lasso Regression model
  def fit(self, X, Y):

    # m --> number of Data points --> number of rows
    # n --> number of input features --> number of columns
    self.m, self.n = X.shape

    self.w = np.zeros(self.n)

    self.b = 0

    self.X = X

    self.Y = Y

    # implementing Gradient Descent algorithm for Optimization

    for i in range(self.no_of_iterations):     # missed "self"
        self.upadte_weights()


  # function for updating the weight & bias value
  def upadte_weights(self):

    # linear equation of the model
    Y_prediction = self.predict(self.X)

    # gradients (dw, db)

    # gradient for weight
    dw = np.zeros(self.n)

    for i in range(self.n):
        
        if self.w[i]>0:

            dw[i] = (-(2*(self.X[:,i]).dot(self.Y - Y_prediction)) + self.lambda_parameter) / self.m 

        else :
 
            dw[i] = (-(2*(self.X[:,i]).dot(self.Y - Y_prediction)) - self.lambda_parameter) / self.m


    # gradient for bias
    db = - 2 * np.sum(self.Y - Y_prediction) / self.m


    # updating the weights & bias

    self.w = self.w - self.learning_rate*dw
    self.b = self.b - self.learning_rate*db

    


  # Predicting the Target variable
  def predict(self,X):

    return X.dot(self.w) + self.b

In [27]:
model = Lasso_Regression(learning_rate = 0.01, no_of_iterations=1000,
                         lambda_parameter=0.1)

In [28]:
model.fit(x_train, y_train)

In [29]:
y_pred = model.predict(x_test)

In [30]:
print(y_test)

[3220000 3220000 3220000 3220000 3150000 3150000 3150000 3150000 3150000
 3150000 3150000 3150000 3150000 3143000 3129000 3118850 3115000 3115000
 3115000 3087000 3080000 3080000 3080000 3080000 3045000 3010000 3010000
 3010000 3010000 3010000 3010000 3010000 3003000 2975000 2961000 2940000
 2940000 2940000 2940000 2940000 2940000 2940000 2940000 2870000 2870000
 2870000 2870000 2852500 2835000 2835000 2835000 2800000 2800000 2730000
 2730000 2695000 2660000 2660000 2660000 2660000 2660000 2660000 2660000
 2653000 2653000 2604000 2590000 2590000 2590000 2520000 2520000 2520000
 2485000 2485000 2450000 2450000 2450000 2450000 2450000 2450000 2408000
 2380000 2380000 2380000 2345000 2310000 2275000 2275000 2275000 2240000
 2233000 2135000 2100000 2100000 2100000 1960000 1890000 1890000 1855000
 1820000 1767150 1750000 1750000 1750000]


In [31]:
print(y_pred)

[5330049.31637262 4426766.34815648 5303676.63197806 4661208.01975871
 4838832.09777814 5123891.33720364 4864779.66550951 5182707.51260555
 3872811.77265531 4836738.98310482 6605964.19310936 7809724.89894918
 4633449.91245022 5399276.50056852 5932759.76534515 4273545.35904693
 4596489.23617348 5051459.82275201 4864779.25071967 7328803.26322962
 5646130.80332223 4146044.38327486 4644278.53117685 5399270.08266056
 5025503.95857772 4644712.81406547 4723810.50807474 4508506.75556189
 5454526.52170081 5709124.40376039 4997884.92013266 4931914.16411029
 7302437.31990001 5319989.01368087 4596490.71394826 6122114.53072823
 5646972.82896215 4917522.70218321 4949672.19553504 4853233.42630087
 4437025.01589283 4676857.1483762  6532806.48743786 4618756.34263031
 4934426.62412483 6204152.42346425 5881453.85715206 5773889.42097782
 4756798.60158604 5318911.80923046 4404458.56634116 4407717.64690909
 5111230.18873714 6632751.03715816 5131400.66870096 5132653.76381641
 5097294.32808764 4060741.68775478

In [32]:
SSres = np.sum((y_test - y_pred)**2)

In [33]:
y_pred_mean = np.mean(y_test)

In [34]:
SStot = np.sum((y_test - y_pred_mean)**2)

In [35]:
r2 = 1 - SSres/SStot

In [36]:
r2for = "{:.2f}".format(r2)
print("r2_score is " , r2for)

r2_score is  -35.26


In [37]:
mae = np.mean(np.abs(y_test-y_pred))
maefor = "{:.2f}".format(mae)
print("Absolute mean square " , maefor)

Absolute mean square  2325544.60


In [38]:
rmse = np.sqrt((np.sum((y_test-y_pred)**2))/len(y_test))
rmsefor = "{:.2f}".format(rmse)
print("Root mean squared error " , rmsefor)

Root mean squared error  2434535.65


In [39]:
mse = ((np.sum((y_test-y_pred)**2))/len(y_test))
msefor = "{:.2f}".format(mse)
print("Mean squared error " , msefor)

Mean squared error  5926963837357.82


In [42]:
def Gridsearch():

# Model training    
    max_accuracy = 0
      
    # learning_rate choices    
    learning_rates = [ 0.1, 0.2, 0.3, 0.4, 0.5, 
                      0.01, 0.02, 0.03, 0.04, 0.05 ]
      
    # iterations choices    
    no_of_iterations = [ 100, 200, 300, 400, 500 ]
    
    # lambda parameters
    lambda_parameter = [0, 0.1, 0.5, 1, 10]
      
    # available combination of learning_rate and iterations
      
    parameters = []    
    for i in learning_rates :        
        for j in no_of_iterations : 
            for a in lambda_parameter:
                parameters.append( ( i, j, a ) )
              
    print("Available combinations : ",  parameters )
              
    # Applying linear searching in list of available combination
    # to achieved maximum accuracy on CV set
      
    for k in range( len( parameters ) ) :        
        model = Lasso_Regression( learning_rate = parameters[k][0], no_of_iterations = parameters[k][1], lambda_parameter = parameters[k][2])
      
        model.fit( x_train, y_train )
        
        # Prediction on validation set
        Y_pred = model.predict( x_test )
       
        # measure performance on validation set
      
        correctly_price = 0
      
        # counter    
        count = 0
      
        for count in range( np.size( y_pred ) ) :            
            if y_test[count] == y_pred[count] :                
                correctly_price = correctly_price + 1   
                  
        curr_accuracy = ( correctly_price / count ) * 100
                  
        if max_accuracy < curr_accuracy :            
            max_accuracy = curr_accuracy
              
    print( "Maximum accuracy achieved by our model through grid searching : ", max_accuracy )
    


In [43]:
if __name__ == "__main__" :     
    Gridsearch()

Available combinations :  [(0.1, 100, 0), (0.1, 100, 0.1), (0.1, 100, 0.5), (0.1, 100, 1), (0.1, 100, 10), (0.1, 200, 0), (0.1, 200, 0.1), (0.1, 200, 0.5), (0.1, 200, 1), (0.1, 200, 10), (0.1, 300, 0), (0.1, 300, 0.1), (0.1, 300, 0.5), (0.1, 300, 1), (0.1, 300, 10), (0.1, 400, 0), (0.1, 400, 0.1), (0.1, 400, 0.5), (0.1, 400, 1), (0.1, 400, 10), (0.1, 500, 0), (0.1, 500, 0.1), (0.1, 500, 0.5), (0.1, 500, 1), (0.1, 500, 10), (0.2, 100, 0), (0.2, 100, 0.1), (0.2, 100, 0.5), (0.2, 100, 1), (0.2, 100, 10), (0.2, 200, 0), (0.2, 200, 0.1), (0.2, 200, 0.5), (0.2, 200, 1), (0.2, 200, 10), (0.2, 300, 0), (0.2, 300, 0.1), (0.2, 300, 0.5), (0.2, 300, 1), (0.2, 300, 10), (0.2, 400, 0), (0.2, 400, 0.1), (0.2, 400, 0.5), (0.2, 400, 1), (0.2, 400, 10), (0.2, 500, 0), (0.2, 500, 0.1), (0.2, 500, 0.5), (0.2, 500, 1), (0.2, 500, 10), (0.3, 100, 0), (0.3, 100, 0.1), (0.3, 100, 0.5), (0.3, 100, 1), (0.3, 100, 10), (0.3, 200, 0), (0.3, 200, 0.1), (0.3, 200, 0.5), (0.3, 200, 1), (0.3, 200, 10), (0.3, 300, 0)