# Multiple Regression

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
data = pd.read_csv("kc_house_data.csv")

In [3]:
data.head(1)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180.0,0,1955,0,98178,47.5112,-122.257,1340,5650


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21611 non-null  float64
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

In [5]:
data = data.drop(['id', 'date'], axis=1)

In [9]:
# Remove features with correlation < 0.2
data = data.drop(['sqft_lot', 'condition', 'yr_built', 'yr_renovated', 'zipcode', 'long',
              'sqft_lot15'], axis=1)


In [11]:
# Remove features which are highly correlated with "sqft_living"
data = data.drop(['sqft_above', 'sqft_living15'], axis=1)

In [26]:
# Normalize the features
data.iloc[:, 1:] = (data - data.mean())/data.std()

In [52]:
X=data[['bedrooms','bathrooms','sqft_living','floors','waterfront','view','grade','sqft_basement','lat']]
y = data['price']

In [42]:
from sklearn.model_selection import train_test_split
X_train , X_test,y_train,y_test = train_test_split(X , y , test_size=0.2, random_state=101)

In [49]:
y_test

3834     349950.0
1348     450000.0
20366    635000.0
16617    355500.0
20925    246950.0
           ...   
6466     455000.0
12560    445000.0
319      268750.0
16682    433000.0
15131    135000.0
Name: price, Length: 4323, dtype: float64

In [43]:
from sklearn.linear_model import LinearRegression

In [44]:
regressor=LinearRegression()
regressor.fit(X_train,y_train)

LinearRegression()

In [45]:
y_pred=regressor.predict(X_test)

In [46]:
print(y_pred)

[349950. 450000. 635000. ... 268750. 433000. 135000.]


In [47]:
print(np.array(y_test))

[349950. 450000. 635000. ... 268750. 433000. 135000.]


# Polynomial Regression

In [35]:
from sklearn.preprocessing import PolynomialFeatures

In [53]:
poly_reg=PolynomialFeatures(degree=2)
X_poly=poly_reg.fit_transform(X)
poly_reg_2=LinearRegression()
poly_reg_2.fit(X_poly,y)

LinearRegression()

In [54]:
poly_reg_2.predict(poly_reg.fit_transform(X_test))

array([349950., 450000., 635000., ..., 268750., 433000., 135000.])

In [55]:
from sklearn.metrics import mean_squared_error, r2_score

# Calculate metrics for Multiple Regression
mse_multiple = mean_squared_error(y_test, y_pred)
rmse_multiple = np.sqrt(mse_multiple)
r2_multiple = r2_score(y_test, y_pred)

# Calculate metrics for Polynomial Regression
X_test_poly = poly_reg.transform(X_test)
y_pred_poly = poly_reg_2.predict(X_test_poly)
mse_poly = mean_squared_error(y_test, y_pred_poly)
rmse_poly = np.sqrt(mse_poly)
r2_poly = r2_score(y_test, y_pred_poly)

# Print the metrics
print("Multiple Regression:")
print("RMSE:", rmse_multiple)
print("R-squared:", r2_multiple)

print("\nPolynomial Regression:")
print("RMSE:", rmse_poly)
print("R-squared:", r2_poly)


Multiple Regression:
RMSE: 4.481179577694976e-10
R-squared: 1.0

Polynomial Regression:
RMSE: 9.548252023883746e-10
R-squared: 1.0
