In [99]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

 <h1>Loading Dataset</h1>

In [100]:



# Read the training and test splits from the working directory.
train_path, test_path = "train.csv", "test.csv"
df = pd.read_csv(train_path)
df2 = pd.read_csv(test_path)

# Preview the first rows of the training data.
preview = df.head()
print(preview)


   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

<h1> PreProcessing </h1>

The first cell keeps only the numeric features.
The second cell replaces NaN values with the mean of their column, separates the features from the target, and checks that no NaN values remain in the dataset.

In [101]:
# Keep only the numeric columns and convert them to a NumPy array.
numeric_columns = df.select_dtypes(include=['number'])
data = numeric_columns.to_numpy()
print(data.shape)

(1460, 38)


In [102]:





# Fill every missing entry with the mean of its column.
imputer = SimpleImputer(strategy="mean")
z = imputer.fit_transform(data)

# Split the imputed matrix: the first column (Id) is skipped, the last
# column (SalePrice) is the target, everything in between is a feature.
x, y = z[:, 1:-1], z[:, -1]

# Sanity check that imputation left no gaps.
x_has_nan = np.isnan(x).any()
y_has_nan = np.isnan(y).any()
print("Any NaN left in x?", x_has_nan)
print("Any NaN left in y?", y_has_nan)

print(np.shape(x))

Any NaN left in x? False
Any NaN left in y? False
(1460, 36)


<h1> Feature Scaling </h1>

Standardize the features (zero mean, unit variance) and log-transform the target with `log1p`.

In [103]:

# Standardize the features (zero mean, unit variance per column).
# The inputs are taken from the imputed matrix `z` rather than from `x`/`y`
# so that re-running this cell gives the same result: reading and
# overwriting `x`/`y` in place would refit the scaler on already-scaled
# data and apply log1p to the target a second time.
scaler = StandardScaler()
x = scaler.fit_transform(z[:, 1:-1])

# Keep the untransformed target for the inverse transformation, and train
# on log1p(SalePrice).
y_original = z[:, -1].copy()
y = np.log1p(y_original)

In [104]:
# Start training from all-zero weights (one per feature) and a zero bias.
n = np.shape(x)[1]
init_w, init_b = np.zeros(n), 0
print(np.shape(init_w))

(36,)


<h1> Define Linear Regression Model </h1>

In [105]:


def predict(X, w, b):
    """Return the linear model output ``dot(X, w) + b``.

    Parameters
    ----------
    X : array-like
        Input features; passed as the first argument to ``np.dot``.
    w : array-like
        Model weights; passed as the second argument to ``np.dot``.
    b : scalar
        Bias added to the dot product.
    """
    weighted_sum = np.dot(X, w)
    return weighted_sum + b
            


<h1> Define Cost Function </h1>

In [106]:
def calc_cost(actual, prediction):
    """Return the halved mean squared error between two value sequences.

    Computes ``(1 / (2 * m)) * sum((actual - prediction) ** 2)`` where ``m``
    is the number of samples in ``actual``.

    The sample count is taken from the arguments themselves, not from a
    global array, so the function gives the correct cost for inputs of any
    length (e.g. a validation subset).

    Parameters
    ----------
    actual : array-like
        Observed target values.
    prediction : array-like
        Predicted values, same length as ``actual``.

    Returns
    -------
    float
        The cost value.

    Raises
    ------
    ValueError
        If ``actual`` contains no samples.
    """
    actual = np.asarray(actual, dtype=float)
    prediction = np.asarray(prediction, dtype=float)

    m = actual.shape[0]
    if m == 0:
        raise ValueError("calc_cost requires at least one sample")

    residuals = actual - prediction
    return np.sum(residuals ** 2) / (2 * m)

In [107]:


# Cost of the untrained model (initial weights and bias) as a baseline.
# The former `y_pred = np.zeros(len(x))` was a dead store: it was
# overwritten by the prediction on the very next line.
y_pred = predict(x, init_w, init_b)

J = calc_cost(y, y_pred)

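Formula for the gradient of the cost $J(w, b)$ over $m$ training examples:

$$\frac{\partial J}{\partial w_j} = \frac{1}{m} \sum_{i=1}^{m} \left( (w \cdot x_i + b) - y_i \right) x_{i,j}$$

$$\frac{\partial J}{\partial b} = \frac{1}{m} \sum_{i=1}^{m} \left( (w \cdot x_i + b) - y_i \right)$$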

<h1> Compute Gradient </h1>

In [108]:
def compute_gradient(X, y, w, b):
    """Return the gradient of the squared-error cost for a linear model.

    With ``err_i = (dot(X[i], w) + b) - y[i]`` the result is

        dj_dw[j] = (1 / m) * sum_i(err_i * X[i, j])
        dj_db    = (1 / m) * sum_i(err_i)

    The sums are computed with matrix operations instead of nested Python
    loops over every sample and feature; the values are the same up to
    floating-point rounding.

    Parameters
    ----------
    X : ndarray of shape (m, n)
        Feature matrix: m examples, n features.
    y : array-like of length m
        Target values.
    w : array-like of length n
        Current weights.
    b : scalar
        Current bias.

    Returns
    -------
    dj_dw : ndarray of shape (n,)
        Gradient of the cost with respect to each weight.
    dj_db : float
        Gradient of the cost with respect to the bias.

    Raises
    ------
    ValueError
        If ``X`` contains no examples.
    """
    m, n = X.shape           # (number of examples, number of features)
    if m == 0:
        raise ValueError("compute_gradient requires at least one example")

    # Prediction error for every example at once, shape (m,).
    err = (np.dot(X, w) + b) - np.asarray(y, dtype=float)

    # X.T @ err sums err_i * X[i, j] over the examples for each feature j.
    dj_dw = np.dot(X.T, err) / m
    dj_db = np.sum(err) / m

    return dj_dw, dj_db

<h1> Run Optimizer (Gradient Descent): updates the weights and bias. </h1>

In [109]:
cost_history = []     # cost sampled every `report_every` iterations
alpha = 0.01          # learning rate
n_iterations = 1001
report_every = 100

for i in range(n_iterations):
    dj_dw, dj_db = compute_gradient(x, y, init_w, init_b)

    # Step against the gradient; `init_w` is updated in place.
    init_w -= alpha * dj_dw
    init_b -= alpha * dj_db

    # Only evaluate and log the cost on reporting iterations.
    if i % report_every != 0:
        continue

    cost = calc_cost(y, predict(x, init_w, init_b))
    cost_history.append(cost)
    print(f"Iteration {i}: Cost = {cost:.2f}")

Iteration 0: Cost = 70.92
Iteration 100: Cost = 9.50
Iteration 200: Cost = 1.28
Iteration 300: Cost = 0.18
Iteration 400: Cost = 0.03
Iteration 500: Cost = 0.01
Iteration 600: Cost = 0.01
Iteration 700: Cost = 0.01
Iteration 800: Cost = 0.01
Iteration 900: Cost = 0.01
Iteration 1000: Cost = 0.01


<h1> Check and analyze the results of the model. </h1>

In [110]:
# Report the cost reached after training, followed by the learned bias
# and weights.
final_predictions = predict(x, init_w, init_b)
final_cost = calc_cost(y, final_predictions)
print("Final cost:", final_cost)

for parameter in (init_b, init_w):
    print(parameter)

Final cost: 0.01052724505213237
12.0235434922973
[-0.02556735 -0.00206921  0.01818109  0.11574303  0.05087815  0.07683027
  0.02780015  0.00101641  0.01665529  0.00409808  0.00431824  0.02317189
  0.03316543  0.02679971  0.00243149  0.0468874   0.03227572  0.00452797
  0.02736935  0.01497799 -0.00080196 -0.0131093   0.02760851  0.03092074
 -0.00151241  0.04514439  0.01275419  0.01555378 -0.00301829  0.00856269
  0.0064876   0.01978067 -0.0148639  -0.00163028  0.00125134 -0.0095511 ]


In [111]:
# Map the model output back to the original price scale with expm1
# (the inverse of log1p).
y_pred_scaled = predict(x, init_w, init_b)
y_pred_original = np.expm1(y_pred_scaled)

# Root-mean-squared error against the untransformed targets.
squared_errors = np.square(y_original - y_pred_original)
rmse = np.sqrt(squared_errors.mean())
print(f"RMSE in original dollars: ${rmse:,.2f}")


# Show the first five predictions next to the actual prices.
print("\nSample predictions vs actual:")
for i in range(5):
    print(f"Predicted: ${y_pred_original[i]:,.2f}, Actual: ${y_original[i]:,.2f}")

RMSE in original dollars: $38,348.71

Sample predictions vs actual:
Predicted: $210,054.26, Actual: $208,500.00
Predicted: $196,451.22, Actual: $181,500.00
Predicted: $216,697.72, Actual: $223,500.00
Predicted: $169,033.60, Actual: $140,000.00
Predicted: $297,555.35, Actual: $250,000.00


<h1> Testing the model on the test_dataset. </h1>

In [112]:
# Inspect the test set: column names, non-null counts and dtypes.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [113]:
# Keep the Ids aside; they are needed for the submission file.
first_col = df2['Id']

numeric_test = df2.select_dtypes(include=['number'])
data2 = numeric_test.to_numpy()

# Fill gaps in the test set with the column means of the TRAINING set.
# Refitting the imputer on the test data (the previous behaviour) fills
# test gaps with test statistics, so the test features are preprocessed
# differently from the data the model was trained on.
# `fillna` with a Series matches values to columns by name; entries for
# columns that are absent from the test set are ignored.
train_means = df.select_dtypes(include=['number']).mean()
imp = numeric_test.fillna(train_means).to_numpy(dtype=float)

# Drop the first column (Id); the remaining columns are the features.
test_feat = imp[:, 1:]
print(test_feat.shape)
print("Any NaN left in x?", np.isnan(test_feat).any())

(1459, 36)
Any NaN left in x? False


In [114]:
#Scale Input
# Apply the mean/std learned from the training features. Calling
# `fit_transform` here would refit the scaler on the test set, so the
# learned weights would be applied to features standardized with
# different statistics than the ones used during training.
# The input is taken from `imp` (not `test_feat`) so that re-running this
# cell does not scale the features twice.
test_feat = scaler.transform(imp[:, 1:])


In [115]:

#Make Prediction on Test Dataset
# The model was trained on log1p(SalePrice), so its raw output is in log
# space; expm1 maps it back to prices.
pred_scaled = predict(test_feat, init_w, init_b)
pred_original = np.expm1(pred_scaled)



In [116]:
# Put the test Ids and the predicted prices side by side as two columns.
submission_array = np.column_stack((first_col, pred_original))


In [117]:
# Build the submission table and store the Id column as integers.
submission_df = pd.DataFrame(
    submission_array, columns=['Id', 'SalePrice']
).astype({'Id': int})


In [118]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv('submission3.csv', index=False)


In [119]:
# Show the dtype of each submission column.
print(submission_df.dtypes)


Id             int64
SalePrice    float64
dtype: object
