In [94]:
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [95]:
# Load the dataset, lower-case the column labels and discard the 'id' column.
# NOTE(review): the path below is an absolute, machine-specific location; it is
# kept verbatim here — TODO point it at a project-relative data directory.
df = (
    pd.read_csv(r"C:\Users\ruben\OneDrive\Desktop\Linear Regression\real_estate_dataset.csv")
    .rename(columns=str.lower)
    .drop(columns='id')
)
df.head()

Unnamed: 0,square_feet,num_bedrooms,num_bathrooms,num_floors,year_built,has_garden,has_pool,garage_size,location_score,distance_to_center,price
0,143.63503,1,3,3,1967,1,1,48,8.297631,5.935734,602134.816747
1,287.678577,1,2,1,1949,0,1,37,6.061466,10.827392,591425.135386
2,232.998485,1,3,2,1923,1,0,14,2.911442,6.904599,464478.69688
3,199.664621,5,2,2,1918,0,0,17,2.070949,8.284019,583105.655996
4,89.00466,4,3,3,1999,1,0,34,1.523278,14.648277,619879.142523


In [96]:
# Fill every missing value with the mean of its own column, in one vectorised
# call, then show the remaining NaN count per column.
column_means = df.mean()
df = df.fillna(column_means)

df.isna().sum()

square_feet           0
num_bedrooms          0
num_bathrooms         0
num_floors            0
year_built            0
has_garden            0
has_pool              0
garage_size           0
location_score        0
distance_to_center    0
price                 0
dtype: int64

In [97]:
# Screen pairwise interaction terms: for every unordered pair of features,
# multiply the two columns and measure how strongly the product correlates
# with price. Requires `combinations` from itertools (imported in the
# notebook's import cell).

# Minimum absolute correlation with price for an interaction to be reported.
INTERACTION_CORR_THRESHOLD = 0.3

X = df.drop(columns='price')
y = df['price']

# Key: "col1 * col2"; value: |Pearson correlation| between the product and price.
interactions = {
    f"{col1} * {col2}": abs((X[col1] * X[col2]).corr(y))
    for col1, col2 in combinations(X.columns, 2)
}

filtered_interactions = {
    name: corr
    for name, corr in interactions.items()
    if corr > INTERACTION_CORR_THRESHOLD
}
filtered_interactions

{'square_feet * num_bedrooms': 0.7969942958145663,
 'square_feet * num_bathrooms': 0.4932194605767344,
 'square_feet * num_floors': 0.49529977107884227,
 'square_feet * year_built': 0.5781951656775672,
 'square_feet * has_garden': 0.32801960925111684,
 'square_feet * has_pool': 0.3292431289313659,
 'square_feet * garage_size': 0.42881914067954435,
 'square_feet * location_score': 0.3976033600574494,
 'square_feet * distance_to_center': 0.30233503350097,
 'num_bedrooms * num_bathrooms': 0.5352688618267694,
 'num_bedrooms * num_floors': 0.521298307474861,
 'num_bedrooms * year_built': 0.5799559184728605,
 'num_bedrooms * has_garden': 0.34271830691412947,
 'num_bedrooms * has_pool': 0.3732296027428429,
 'num_bedrooms * garage_size': 0.4607898162609456,
 'num_bedrooms * location_score': 0.4251987215473039,
 'num_bedrooms * distance_to_center': 0.3682212531274956}

In [98]:
# Hold out a test set, standardise the features, fit an ordinary least-squares
# model and report its error on the held-out rows. Requires `StandardScaler`
# from sklearn.preprocessing (imported in the notebook's import cell).

# Split settings, named so they can be tuned in one place.
TEST_SIZE = 0.2
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# The scaler is fitted on the training rows only; the test rows are transformed
# with the training statistics so no test information reaches the fit.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

regressor = LinearRegression()
regressor.fit(X_train_scaled, y_train)

y_pred = regressor.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

Mean Squared Error: 437730359.70827323
Root Mean Squared Error: 20922.006588954922
R-squared: 0.970899139169718


In [99]:
# Pair each feature name with its fitted coefficient. The regressor was fitted
# on the standardised training matrix, so each value is expressed per unit of
# the scaled feature rather than per raw unit.
coefficients_reg = regressor.coef_

coefficients = pd.DataFrame(
    list(zip(X.columns, coefficients_reg)),
    columns=["Feature", "Coefficient"],
)
coefficients

Unnamed: 0,Feature,Coefficient
0,square_feet,75637.141344
1,num_bedrooms,72935.352705
2,num_bathrooms,24698.819056
3,num_floors,15968.728418
4,year_built,53876.740006
5,has_garden,15150.230434
6,has_pool,23119.252885
7,garage_size,13068.955176
8,location_score,13882.933585
9,distance_to_center,-10560.278434


In [100]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Variance inflation factor for every column of the design matrix, including
# the constant column that add_constant introduces.
X_with_const = add_constant(X)
design_matrix = X_with_const.values

vif_data = pd.DataFrame({
    "Feature": X_with_const.columns,
    "VIF": [
        variance_inflation_factor(design_matrix, col_idx)
        for col_idx, _ in enumerate(X_with_const.columns)
    ],
})
vif_data

Unnamed: 0,Feature,VIF
0,const,3131.095699
1,square_feet,1.021886
2,num_bedrooms,1.012609
3,num_bathrooms,1.014926
4,num_floors,1.017003
5,year_built,1.010661
6,has_garden,1.013416
7,has_pool,1.037009
8,garage_size,1.02041
9,location_score,1.014996


In [122]:
# Spot-check one held-out row: wrap the scaled test matrix in a frame that
# carries X_test's row labels, so the matching true price can be looked up.
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)

record = X_test_scaled_df.iloc[20]
true_index = record.name

# predict() is given a 2-D array: a single row holding every feature.
# NOTE(review): this rebinds y_pred to a one-element array, replacing the
# full test-set predictions computed earlier.
y_pred = regressor.predict(record.to_numpy().reshape(1, -1))
true_value = y_test.loc[true_index]

print(f"Predicted value: {y_pred[0]}")
print(f"True value: {true_value}")

Predicted value: 714850.7389696536
True value: 728995.9774295713
