<a href="https://colab.research.google.com/github/ShriramJana/AI-Workshop/blob/main/Linear_Regression_(From_Scratch).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
# Import Statements
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import minmax_scale

In [32]:
# Read in our train and test data 
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
# Concatenate both train and test data into data 
data = pd.concat((train_data, test_data), sort=False)

In [33]:
# Lists data for the first 5 rows of the dataset
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000.0


In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 0 to 1458
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             2919 non-null   int64  
 1   MSSubClass     2919 non-null   int64  
 2   MSZoning       2915 non-null   object 
 3   LotFrontage    2433 non-null   float64
 4   LotArea        2919 non-null   int64  
 5   Street         2919 non-null   object 
 6   Alley          198 non-null    object 
 7   LotShape       2919 non-null   object 
 8   LandContour    2919 non-null   object 
 9   Utilities      2917 non-null   object 
 10  LotConfig      2919 non-null   object 
 11  LandSlope      2919 non-null   object 
 12  Neighborhood   2919 non-null   object 
 13  Condition1     2919 non-null   object 
 14  Condition2     2919 non-null   object 
 15  BldgType       2919 non-null   object 
 16  HouseStyle     2919 non-null   object 
 17  OverallQual    2919 non-null   int64  
 18  OverallC

In [35]:
# get the number of missing data points per column
missing_values_count = data.isnull().sum()

# look at the # of missing points in the first ten columns
missing_values_count

Id                  0
MSSubClass          0
MSZoning            4
LotFrontage       486
LotArea             0
                 ... 
MoSold              0
YrSold              0
SaleType            1
SaleCondition       0
SalePrice        1459
Length: 81, dtype: int64

In [36]:
# how many total missing values do we have?
total_cells = np.product(data.shape)
total_missing = missing_values_count.sum()

# percent of data that is missing
percent_missing = (total_missing/total_cells) * 100
print(percent_missing)

6.523458481891735


In [37]:
# Preprocessing (transform non-numerical labels (as long as they are hashable and comparable) to numerical labels)
for col in range(data.shape[1]):
  encoder = LabelEncoder()
  encoder.fit(data.iloc[:, col])
  data.iloc[:, col] = encoder.transform(data.iloc[:, col])

In [38]:
#Filling NA with mean of the dataset
data = data.fillna(data.mean())
data.isnull().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 81, dtype: int64

In [39]:
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0,5,3,41,619,1,2,3,3,0,...,0,3,4,4,0,1,2,8,4,412
1,1,0,3,56,895,1,2,3,3,0,...,0,3,4,4,0,4,1,8,4,339
2,2,5,3,44,1266,1,2,0,3,0,...,0,3,4,4,0,8,2,8,4,442
3,3,6,3,36,883,1,2,0,3,0,...,0,3,4,4,0,1,0,8,0,194
4,4,5,3,60,1670,1,2,0,3,0,...,0,3,4,4,0,11,2,8,4,494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2914,13,4,0,23,1,2,3,3,0,...,0,3,4,4,0,5,0,8,4,663
1455,2915,13,4,0,20,1,2,3,3,0,...,0,3,4,4,0,3,0,8,0,663
1456,2916,0,3,121,1863,1,2,3,3,0,...,0,3,4,4,0,8,0,8,0,663
1457,2917,9,3,38,1090,1,2,3,3,0,...,0,3,2,2,17,6,0,8,4,663


In [40]:
# Scaling for the dataset
for col in data.columns:
  data[col] = pd.Series(minmax_scale(data[col]), index=data.index)

In [41]:
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0.000000,0.333333,0.6,0.320312,0.317436,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,1.0,1.0,0.000000,0.090909,0.50,0.888889,0.8,0.621418
1,0.000343,0.000000,0.6,0.437500,0.458974,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,1.0,1.0,0.000000,0.363636,0.25,0.888889,0.8,0.511312
2,0.000685,0.333333,0.6,0.343750,0.649231,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,1.0,1.0,0.000000,0.727273,0.50,0.888889,0.8,0.666667
3,0.001028,0.400000,0.6,0.281250,0.452821,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,1.0,1.0,0.000000,0.090909,0.00,0.888889,0.0,0.292609
4,0.001371,0.333333,0.6,0.468750,0.856410,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,1.0,1.0,0.000000,1.000000,0.50,0.888889,0.8,0.745098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.998629,0.866667,0.8,0.000000,0.011795,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,1.0,1.0,0.000000,0.454545,0.00,0.888889,0.8,1.000000
1455,0.998972,0.866667,0.8,0.000000,0.010256,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,1.0,1.0,0.000000,0.272727,0.00,0.888889,0.0,1.000000
1456,0.999315,0.000000,0.6,0.945312,0.955385,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,1.0,1.0,0.000000,0.727273,0.00,0.888889,0.0,1.000000
1457,0.999657,0.600000,0.6,0.296875,0.558974,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.5,0.5,0.459459,0.545455,0.00,0.888889,0.8,1.000000


In [43]:
# Splitting the train and test datasets
X_train = data[:train_data.shape[0]]
Y_train= X_train['SalePrice']
X_test = data[train_data.shape[0]:]

X_train = X_train.to_numpy()
Y_train = Y_train.to_numpy()

In [47]:
# Linear Regression Function
def linear_reg(w, x, b):
  return np.dot(w, x) + b

# Setting initial values for w and b
w = np.zeros((81,), dtype=int)
b = 0

In [58]:
def cost(x, y, w, b): 
    m = x.shape[0]
    cost = 0.0
    for i in range(m):                                
        prod = np.dot(x[i], w) + b           
        cost = cost + (prod - y[i])**2
    cost = cost / (2 * m)                          
    return cost
def gradient(X, y, w, b): 
    m = X.shape[0]
    n = X.shape[1] # weight for dot product      
    grad_w = np.zeros((n,)) #gradient for w
    grad_b = 0 # gradient for b

    for i in range(m):                             
        loss = (np.dot(X[i], w) + b) - y[i]   
        for j in range(n):                         
            grad_w[j] = grad_w[j] + loss * X[i, j]    
        grad_b = grad_b + loss                        
    grad_w = grad_w / m                                
    grad_b = grad_b / m                                
        
    return grad_b, grad_w
def gradient_descent(X, y, w, b, alpha,num_iters, cost, gradient):  
    for i in range(num_iters):
        dj_db,dj_dw = gradient(X, y, w, b)   

        w = w - alpha * dj_dw               
        b = b - alpha * dj_db               
          
        if i% 100 == 0:
            print('Iteration: ' + str(i) + ', Loss: ' + str(cost(X,y,w,b)))
        
    return w, b

In [59]:
gradient_descent(X_train, Y_train, w, b, 0.01, 1000, cost, gradient)

Iteration: 0, Loss: 0.08563956865391657
Iteration: 100, Loss: 0.010560093822238756
Iteration: 200, Loss: 0.0056203497201659436
Iteration: 300, Loss: 0.004169907501344975
Iteration: 400, Loss: 0.0036347950021395057
Iteration: 500, Loss: 0.0033623881732875627
Iteration: 600, Loss: 0.0031808512191443383
Iteration: 700, Loss: 0.0030405814035117133
Iteration: 800, Loss: 0.0029243685538938623
Iteration: 900, Loss: 0.002824477215383811


(array([-0.00297964, -0.00409009, -0.01169503,  0.01361804,  0.05325521,
         0.00995154,  0.0089894 , -0.0178893 , -0.00266642, -0.00045915,
         0.00055948,  0.01165849,  0.01634163,  0.0043776 ,  0.00088757,
        -0.01839277,  0.01542431,  0.06689642,  0.01847536,  0.04221574,
         0.06265635,  0.00995226,  0.00893824,  0.00224752,  0.0101621 ,
         0.00780928,  0.02896984, -0.04371589,  0.008818  ,  0.01810154,
        -0.04076149,  0.00228802, -0.02103097, -0.01104326,  0.04722599,
         0.00167501,  0.0056926 ,  0.02006399,  0.06934538,  0.00089879,
        -0.04099886,  0.02170323,  0.01380174,  0.06844   ,  0.06653387,
         0.00036514,  0.10375942,  0.01488281, -0.00169794,  0.03985701,
         0.03375143,  0.01325504, -0.00691687, -0.04468849,  0.03181488,
         0.0218202 ,  0.0397663 , -0.04760631, -0.03922729,  0.01990455,
        -0.04716812,  0.0342838 ,  0.06211648,  0.00193457,  0.00515524,
         0.01827518,  0.02769139,  0.03019967,  0.0