In [82]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [47]:
data = pd.read_csv("Datasets/HousePriceIndia/ProcessedPrice.csv")

In [48]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13859 entries, 0 to 13858
Data columns (total 21 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   number of bedrooms                     13859 non-null  int64  
 1   number of bathrooms                    13859 non-null  float64
 2   living area                            13859 non-null  int64  
 3   lot area                               13859 non-null  int64  
 4   number of floors                       13859 non-null  float64
 5   waterfront present                     13859 non-null  int64  
 6   condition of the house                 13859 non-null  int64  
 7   grade of the house                     13859 non-null  int64  
 8   Area of the house(excluding basement)  13859 non-null  int64  
 9   Area of the basement                   13859 non-null  int64  
 10  Built Year                             13859 non-null  int64  
 11  Re

In [49]:
correlation = data.corr()

In [50]:
price_corr = correlation['Price'].sort_values(ascending = False)
print(price_corr)

Price                                    1.000000
grade of the house                       0.647629
living area                              0.638922
living_area_renov                        0.577698
Area of the house(excluding basement)    0.546075
number of bathrooms                      0.466374
Lattitude                                0.422816
number of bedrooms                       0.298872
number of floors                         0.279132
Area of the basement                     0.237322
lot area                                 0.096008
Renovation Year                          0.091372
lot_area_renov                           0.080134
Longitude                                0.080069
Built Year                               0.066145
Year since renovation                    0.053971
waterfront present                       0.053646
condition of the house                   0.044879
Distance from the airport                0.007484
Number of schools nearby                 0.004843


In [51]:
X = data.drop(columns=["Price"])
correlation_matrix = X.corr()


high_corr_features = [(col1, col2) for col1 in correlation_matrix.columns 
                      for col2 in correlation_matrix.columns 
                      if col1 != col2 and abs(correlation_matrix.loc[col1, col2]) > 0.99]

print("Highly Correlated Features:", high_corr_features)


for col1, col2 in high_corr_features:
    X = X.drop(columns=[col2])  

Highly Correlated Features: [('Built Year', 'Age'), ('Age', 'Built Year')]


In [52]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13859 entries, 0 to 13858
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   number of bedrooms                     13859 non-null  int64  
 1   number of bathrooms                    13859 non-null  float64
 2   living area                            13859 non-null  int64  
 3   lot area                               13859 non-null  int64  
 4   number of floors                       13859 non-null  float64
 5   waterfront present                     13859 non-null  int64  
 6   condition of the house                 13859 non-null  int64  
 7   grade of the house                     13859 non-null  int64  
 8   Area of the house(excluding basement)  13859 non-null  int64  
 9   Area of the basement                   13859 non-null  int64  
 10  Renovation Year                        13859 non-null  int64  
 11  La

In [54]:
X.nunique()>1

number of bedrooms                       True
number of bathrooms                      True
living area                              True
lot area                                 True
number of floors                         True
waterfront present                       True
condition of the house                   True
grade of the house                       True
Area of the house(excluding basement)    True
Area of the basement                     True
Renovation Year                          True
Lattitude                                True
Longitude                                True
living_area_renov                        True
lot_area_renov                           True
Number of schools nearby                 True
Distance from the airport                True
Year since renovation                    True
dtype: bool

In [55]:
X = X.loc[:,X.nunique()>1]

In [56]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13859 entries, 0 to 13858
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   number of bedrooms                     13859 non-null  int64  
 1   number of bathrooms                    13859 non-null  float64
 2   living area                            13859 non-null  int64  
 3   lot area                               13859 non-null  int64  
 4   number of floors                       13859 non-null  float64
 5   waterfront present                     13859 non-null  int64  
 6   condition of the house                 13859 non-null  int64  
 7   grade of the house                     13859 non-null  int64  
 8   Area of the house(excluding basement)  13859 non-null  int64  
 9   Area of the basement                   13859 non-null  int64  
 10  Renovation Year                        13859 non-null  int64  
 11  La

In [61]:
scaler = StandardScaler()

In [62]:
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [63]:
X_normalized

Unnamed: 0,number of bedrooms,number of bathrooms,living area,lot area,number of floors,waterfront present,condition of the house,grade of the house,Area of the house(excluding basement),Area of the basement,Renovation Year,Lattitude,Longitude,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Year since renovation
0,0.718714,0.610046,1.687492,0.746264,0.959757,-0.057075,-0.646877,1.383370,2.164078,-0.658252,-0.205387,1.165654,0.575773,2.269381,1.169110,1.210768,1.236614,-0.177835
1,-0.366916,-0.094125,0.918413,-0.271116,0.029728,-0.057075,0.867493,0.423804,0.215045,1.355756,-0.205387,0.819290,-0.571474,0.202462,-0.308117,-1.238336,-1.563202,-0.177835
2,-0.366916,0.610046,0.777415,-0.264509,-0.900300,-0.057075,0.867493,1.383370,-0.030288,1.525612,-0.205387,0.880708,-1.305992,0.715186,-0.298486,-1.238336,0.228680,-0.177835
3,1.804345,1.666302,2.136121,-0.073047,0.959757,-0.057075,-0.646877,2.342936,2.641113,-0.658252,-0.205387,-0.187665,2.471528,2.221313,-0.048436,1.210768,0.788643,-0.177835
4,-0.366916,-0.446210,0.315967,-0.110494,0.959757,-0.057075,2.381862,0.423804,-0.234732,1.016044,-0.205387,0.967120,-0.550488,-0.582647,-0.073976,1.210768,0.676651,-0.177835
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13854,-1.452547,-0.798295,-0.560783,0.138500,-0.900300,-0.057075,0.867493,-0.535762,-0.226554,-0.658252,-0.205387,-1.220330,-0.480534,0.506892,0.184433,1.210768,1.236614,-0.177835
13855,-0.366916,-0.094125,-0.401840,-0.205049,0.029728,-0.057075,0.867493,-0.535762,-0.057547,-0.658252,-0.205387,-2.017325,0.072103,-0.630715,-0.193319,1.210768,-0.667261,-0.177835
13856,-1.452547,-1.502466,-1.183737,-0.228304,-0.900300,-0.057075,-0.646877,-1.495328,-0.888953,-0.658252,-0.205387,-0.436190,-0.725373,-1.287642,-0.245710,-0.013784,-0.107298,-0.177835
13857,0.718714,-1.502466,-1.235009,-0.215065,-0.900300,-0.057075,0.867493,-1.495328,-0.943471,-0.658252,-0.205387,-0.530458,-0.053814,-0.822986,-0.226025,1.210768,-1.227224,-0.177835


In [64]:
condition_number = np.linalg.cond(X_normalized)
print("Condition Number (Normalized):", condition_number)

Condition Number (Normalized): 8261881475834470.0


In [65]:
correlation_matrix = X_normalized.corr()

In [74]:
high_corr_pairs = [(col1, col2) for col1 in correlation_matrix.columns
                   for col2 in correlation_matrix.columns
                   if col1 != col2 and abs(correlation_matrix.loc[col1, col2]) > 0.9]

In [75]:
print("Highly Correlated Feature Pairs:", high_corr_pairs)

Highly Correlated Feature Pairs: []


In [78]:
correlation_matrix

Unnamed: 0,number of bedrooms,number of bathrooms,living area,lot area,number of floors,waterfront present,condition of the house,grade of the house,Area of the house(excluding basement),Area of the basement,Renovation Year,Lattitude,Longitude,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Year since renovation
number of bedrooms,1.0,0.486262,0.576949,0.024942,0.154136,-0.029203,0.023747,0.318161,0.45622,0.279976,-0.001178,-0.030734,0.143677,0.368776,0.018421,-0.000453,-0.007217,-0.016606
number of bathrooms,0.486262,1.0,0.714024,0.058897,0.504215,-0.007158,-0.139755,0.612715,0.634677,0.221754,0.022499,0.002876,0.247863,0.53099,0.055978,0.003525,0.00844,-0.00832
living area,0.576949,0.714024,1.0,0.164281,0.341727,0.00328,-0.080572,0.708669,0.853529,0.373494,0.022534,0.018633,0.288068,0.74328,0.169972,8.3e-05,0.004495,0.004976
lot area,0.024942,0.058897,0.164281,1.0,-0.011327,0.03271,-0.004667,0.097238,0.169346,0.009503,0.009203,-0.097221,0.213476,0.144832,0.70114,-0.010289,0.005455,0.015589
number of floors,0.154136,0.504215,0.341727,-0.011327,1.0,-0.011279,-0.286251,0.462364,0.529804,-0.296317,-0.002719,0.039042,0.137975,0.27308,-0.017875,-0.008876,0.014523,-0.004096
waterfront present,-0.029203,-0.007158,0.00328,0.03271,-0.011279,1.0,0.022622,-0.018408,-0.005452,0.015916,0.058818,-0.05093,-0.065871,0.00365,0.03332,-0.008552,-0.001152,0.066159
condition of the house,0.023747,-0.139755,-0.080572,-0.004667,-0.286251,0.022622,1.0,-0.177237,-0.191516,0.188434,-0.060172,-0.00659,-0.114913,-0.124161,-0.00044,-0.009987,-0.001931,-0.021724
grade of the house,0.318161,0.612715,0.708669,0.097238,0.462364,-0.018408,-0.177237,1.0,0.715733,0.06731,-0.022548,0.091302,0.235861,0.679963,0.103086,-0.000772,0.006625,-0.041984
Area of the house(excluding basement),0.45622,0.634677,0.853529,0.169346,0.529804,-0.005452,-0.191516,0.715733,1.0,-0.164551,-0.000606,-0.03869,0.392887,0.725427,0.180829,-0.005731,0.004056,-0.005336
Area of the basement,0.279976,0.221754,0.373494,0.009503,-0.296317,0.015916,0.188434,0.06731,-0.164551,1.0,0.043737,0.104154,-0.154139,0.115573,-0.000168,0.010361,0.001288,0.01892


In [83]:
for col in X_normalized.columns:
    # Separate target (current column) and predictors (all other columns)
    target = X_normalized[col]
    predictors = X_normalized.drop(columns=[col])

    # Fit a linear regression model
    model = LinearRegression().fit(predictors, target)
    r_squared = model.score(predictors, target)

    if r_squared > 0.99:  # Threshold for perfect multicollinearity
        print(f"Feature {col} is highly correlated with other features (R^2 = {r_squared:.2f})")

        # Drop the feature
        X_normalized = X_normalized.drop(columns=[col])

Feature living area is highly correlated with other features (R^2 = 1.00)


In [84]:
X_normalized = X_normalized.drop(columns=["living area"])

KeyError: "['living area'] not found in axis"

In [85]:
X_normalized 

Unnamed: 0,number of bedrooms,number of bathrooms,lot area,number of floors,waterfront present,condition of the house,grade of the house,Area of the house(excluding basement),Area of the basement,Renovation Year,Lattitude,Longitude,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Year since renovation
0,0.718714,0.610046,0.746264,0.959757,-0.057075,-0.646877,1.383370,2.164078,-0.658252,-0.205387,1.165654,0.575773,2.269381,1.169110,1.210768,1.236614,-0.177835
1,-0.366916,-0.094125,-0.271116,0.029728,-0.057075,0.867493,0.423804,0.215045,1.355756,-0.205387,0.819290,-0.571474,0.202462,-0.308117,-1.238336,-1.563202,-0.177835
2,-0.366916,0.610046,-0.264509,-0.900300,-0.057075,0.867493,1.383370,-0.030288,1.525612,-0.205387,0.880708,-1.305992,0.715186,-0.298486,-1.238336,0.228680,-0.177835
3,1.804345,1.666302,-0.073047,0.959757,-0.057075,-0.646877,2.342936,2.641113,-0.658252,-0.205387,-0.187665,2.471528,2.221313,-0.048436,1.210768,0.788643,-0.177835
4,-0.366916,-0.446210,-0.110494,0.959757,-0.057075,2.381862,0.423804,-0.234732,1.016044,-0.205387,0.967120,-0.550488,-0.582647,-0.073976,1.210768,0.676651,-0.177835
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13854,-1.452547,-0.798295,0.138500,-0.900300,-0.057075,0.867493,-0.535762,-0.226554,-0.658252,-0.205387,-1.220330,-0.480534,0.506892,0.184433,1.210768,1.236614,-0.177835
13855,-0.366916,-0.094125,-0.205049,0.029728,-0.057075,0.867493,-0.535762,-0.057547,-0.658252,-0.205387,-2.017325,0.072103,-0.630715,-0.193319,1.210768,-0.667261,-0.177835
13856,-1.452547,-1.502466,-0.228304,-0.900300,-0.057075,-0.646877,-1.495328,-0.888953,-0.658252,-0.205387,-0.436190,-0.725373,-1.287642,-0.245710,-0.013784,-0.107298,-0.177835
13857,0.718714,-1.502466,-0.215065,-0.900300,-0.057075,0.867493,-1.495328,-0.943471,-0.658252,-0.205387,-0.530458,-0.053814,-0.822986,-0.226025,1.210768,-1.227224,-0.177835


In [86]:
vif = pd.DataFrame()
vif["Features"] = X_normalized.columns
vif["VIF"] = [variance_inflation_factor(X_normalized.values, i) for i in range(X_normalized.shape[1])]


In [87]:
print(vif)

                                 Features       VIF
0                      number of bedrooms  1.622602
1                     number of bathrooms  2.659250
2                                lot area  1.990077
3                        number of floors  1.992967
4                      waterfront present  1.017944
5                  condition of the house  1.132131
6                      grade of the house  2.729923
7   Area of the house(excluding basement)  4.637794
8                    Area of the basement  1.931817
9                         Renovation Year  3.959726
10                              Lattitude  1.067765
11                              Longitude  1.356644
12                      living_area_renov  2.820849
13                         lot_area_renov  2.038319
14               Number of schools nearby  1.001503
15              Distance from the airport  1.001315
16                  Year since renovation  3.932205


In [88]:
corr_matrix = data.corr()
print(corr_matrix["Area of the house(excluding basement)"].sort_values(ascending=False))


Area of the house(excluding basement)    1.000000
living area                              0.853529
living_area_renov                        0.725427
grade of the house                       0.715733
number of bathrooms                      0.634677
Price                                    0.546075
number of floors                         0.529804
Built Year                               0.460671
number of bedrooms                       0.456220
Longitude                                0.392887
lot_area_renov                           0.180829
lot area                                 0.169346
Distance from the airport                0.004056
Renovation Year                         -0.000606
Year since renovation                   -0.005336
waterfront present                      -0.005452
Number of schools nearby                -0.005731
Lattitude                               -0.038690
Area of the basement                    -0.164551
condition of the house                  -0.191516


In [89]:
X_normalized = X_normalized.drop(columns=["Area of the house(excluding basement)"])


In [90]:
vif = pd.DataFrame()
vif["Features"] = X_normalized.columns
vif["VIF"] = [variance_inflation_factor(X_normalized.values, i) for i in range(X_normalized.shape[1])]
print(vif)

                     Features       VIF
0          number of bedrooms  1.418447
1         number of bathrooms  2.499480
2                    lot area  1.980320
3            number of floors  1.941982
4          waterfront present  1.017634
5      condition of the house  1.131806
6          grade of the house  2.441570
7        Area of the basement  1.555350
8             Renovation Year  3.958185
9                   Lattitude  1.065246
10                  Longitude  1.346282
11          living_area_renov  2.231015
12             lot_area_renov  2.033609
13   Number of schools nearby  1.001475
14  Distance from the airport  1.001294
15      Year since renovation  3.931211


In [91]:
X_normalized = X_normalized.drop(columns=["Renovation Year"])

In [92]:
vif = pd.DataFrame()
vif["Features"] = X_normalized.columns
vif["VIF"] = [variance_inflation_factor(X_normalized.values, i) for i in range(X_normalized.shape[1])]
print(vif)

                     Features       VIF
0          number of bedrooms  1.418402
1         number of bathrooms  2.491651
2                    lot area  1.980216
3            number of floors  1.939319
4          waterfront present  1.017609
5      condition of the house  1.121816
6          grade of the house  2.440119
7        Area of the basement  1.553871
8                   Lattitude  1.065200
9                   Longitude  1.345292
10          living_area_renov  2.223780
11             lot_area_renov  2.033372
12   Number of schools nearby  1.001468
13  Distance from the airport  1.001085
14      Year since renovation  1.012873


In [None]:
X_normalized.to_csv("C