In [1]:
import numpy as np
import pandas as pd
from linear_regression import LinearRegression
from preprocessing import OneHotEncoder, StandardScaler
from metrics import RegressionMetrics
from imputer import SimpleImputer

In [2]:
df = pd.read_csv(r'C:\Users\User\OneDrive\Desktop\Linear Regression\Housing.csv')
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
print(f'The shape of this dataframe is {df.shape} and its information is below:')
df.info()

The shape of this dataframe is (545, 13) and its information is below:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [5]:
X = df.drop('price', axis=1)
y = df['price']

In [6]:
X.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [7]:
X_num = X.select_dtypes(include='number')
X_num.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,parking
0,7420,4,2,3,2
1,8960,4,4,4,3
2,9960,3,2,2,2
3,7500,4,2,2,3
4,7420,4,1,2,2


In [8]:
X_cat = df.select_dtypes(exclude='number')
X_cat.head()

Unnamed: 0,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea,furnishingstatus
0,yes,no,no,no,yes,yes,furnished
1,yes,no,no,no,yes,no,furnished
2,yes,no,yes,no,no,yes,semi-furnished
3,yes,no,yes,no,yes,yes,furnished
4,yes,yes,yes,no,yes,no,furnished


In [9]:
scaler = StandardScaler()
X_num = scaler.fit_transform(X_num)
y = scaler.fit_transform(y)

In [10]:
# Assuming you're using pandas
categorical = X.select_dtypes(include='object')
categorical.head()


Unnamed: 0,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea,furnishingstatus
0,yes,no,no,no,yes,yes,furnished
1,yes,no,no,no,yes,no,furnished
2,yes,no,yes,no,no,yes,semi-furnished
3,yes,no,yes,no,yes,yes,furnished
4,yes,yes,yes,no,yes,no,furnished


In [12]:
encoder = OneHotEncoder()
X_enc = encoder.fit_transform(categorical)
X_enc

array([[0., 1., 1., ..., 1., 0., 0.],
       [0., 1., 1., ..., 1., 0., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [0., 1., 1., ..., 0., 0., 1.],
       [1., 0., 1., ..., 1., 0., 0.],
       [0., 1., 1., ..., 0., 0., 1.]], shape=(545, 15))

In [14]:
X = np.hstack([X_num, X_enc])
X

array([[ 1.04672629,  1.40341936,  1.42181174, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.75700953,  1.40341936,  5.40580863, ...,  1.        ,
         0.        ,  0.        ],
       [ 2.21823241,  0.04727831,  1.42181174, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.70592066, -1.30886273, -0.57018671, ...,  0.        ,
         0.        ,  1.        ],
       [-1.03338891,  0.04727831, -0.57018671, ...,  1.        ,
         0.        ,  0.        ],
       [-0.5998394 ,  0.04727831, -0.57018671, ...,  0.        ,
         0.        ,  1.        ]], shape=(545, 20))

In [15]:
model = LinearRegression()
model.fit(X, y)

In [17]:
y_pred = model.predict(X)

In [18]:
y_pred

array([ 1.79532187e+00,  3.13895137e+00,  1.55809806e+00,  1.90705655e+00,
        1.03583597e+00,  1.95613748e+00,  2.70663438e+00,  2.01026631e+00,
        1.45460270e+00,  1.54148596e+00,  1.90486541e+00,  1.67943811e+00,
        1.25786503e+00,  4.86989031e-01,  7.72478429e-01,  2.35816863e-01,
        1.41712201e+00,  1.79151368e+00,  9.43319676e-01,  1.17782835e+00,
        1.67859551e-01,  1.00533908e+00,  6.55447049e-01,  9.77883381e-01,
        1.33814770e+00,  1.78447161e+00,  1.81281550e+00, -4.30216820e-02,
        1.10759963e+00,  1.31348775e+00,  1.43535867e+00,  1.03645856e+00,
        1.06093515e+00,  1.04277141e+00,  8.68589609e-01,  1.59422528e+00,
        1.36962691e+00,  1.92021484e+00,  8.63889060e-01,  1.34902030e+00,
        7.22420430e-01,  1.53157848e+00,  1.30449069e+00,  1.06996478e+00,
        1.37173816e+00,  8.85770244e-01,  1.30879981e+00,  1.48979538e+00,
        1.31810564e-01,  1.26297686e+00,  1.17322934e+00,  5.95155511e-01,
        1.52931767e+00,  

In [19]:
metrics = RegressionMetrics(y, y_pred)
print('Baseline Model:', y.mean())
print("Test MSE:", metrics.mse())
print("Test R²:", metrics.r2_score())

Baseline Model: 2.0859970224149731e-16
Test MSE: 0.3207220367198318
Test R²: 0.679277963280168


In [20]:
print("Learned weights:", model.weights)
print("Learned bias:", model.bias) 

Learned weights: [ 0.28516712  0.04640989  0.26859903  0.21356421  0.13247623 -0.08431091
  0.14456373 -0.04866334  0.10891616 -0.06673589  0.12698871 -0.07784562
  0.13809843 -0.18635128  0.2466041  -0.13453825  0.19479107  0.10143732
  0.07871945 -0.11990395]
Learned bias: 0.06025281698077473
