In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Load and preprocess the data
df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv').drop(columns=['store room','floor_category','balcony'])

# Feature engineering and encoding
df['agePossession'] = df['agePossession'].replace({
    'Relatively New':'new',
    'Moderately Old':'old',
    'New Property' : 'new',
    'Old Property' : 'old',
    'Under Construction' : 'under construction'
})
df['property_type'] = df['property_type'].replace({'flat':0,'house':1})
df['luxury_category'] = df['luxury_category'].replace({'Low':0,'Medium':1,'High':2})

# One-Hot Encoding for categorical variables
df = pd.get_dummies(df, columns=['sector', 'agePossession'], drop_first=True)

# Define features and target
X = df.drop(columns=['price'])
y = np.log1p(df['price'])  # Log-transform the target

# Scaling the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Polynomial features to capture nonlinear relationships
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(X_scaled)

# Model training with Ridge and Lasso Regression
models = {
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.01)
}

for name, model in models.items():
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_poly, y, cv=kfold, scoring='r2')
    print(f"{name} Regression - Mean R^2: {scores.mean():.4f}, Std: {scores.std():.4f}")

# Fit the best model (e.g., Ridge) and analyze coefficients
best_model = Lasso(alpha=0.01)
best_model.fit(X_poly, y)

# Extract and display coefficients for bedrooms
coefficients = best_model.coef_
bedroom_index = list(poly.get_feature_names_out(X.columns)).index('bedRoom')
print(f"Coefficient for bedRoom: {coefficients[bedroom_index]}")


Ridge Regression - Mean R^2: 0.7708, Std: 0.0757
Lasso Regression - Mean R^2: 0.8812, Std: 0.0130
Coefficient for bedRoom: 0.04794908696576447


In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Feature interaction term between bedrooms and other significant features
df['bedroom_area_interaction'] = df['bedRoom'] * df['built_up_area']
df['bedroom_luxury_interaction'] = df['bedRoom'] * df['luxury_category']

# Recreate features and target
X = df.drop(columns=['price'])
y = np.log1p(df['price'])

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Polynomial features to allow non-linear interactions
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(X_scaled)

# Train a Ridge model again
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_poly, y)

# Check the new coefficient for bedRoom
bedroom_index = list(poly.get_feature_names_out(X.columns)).index('bedRoom')
print(f"Ridge Regression - Coefficient for bedRoom: {ridge_model.coef_[bedroom_index]}")

# RandomForest to check feature importance
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X, y)

# Calculate feature importances
importances = rf_model.feature_importances_
features = X.columns

# Display feature importances
importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print(importance_df)

# Predict and evaluate
y_pred = rf_model.predict(X)
r2 = r2_score(y, y_pred)
print(f"Random Forest R^2: {r2:.4f}")



Ridge Regression - Coefficient for bedRoom: 0.010624327300259839
                      Feature  Importance
3               built_up_area    0.498969
112  bedroom_area_interaction    0.259379
0               property_type    0.055280
2                    bathroom    0.022053
4                servant room    0.011454
..                        ...         ...
30           sector_sector 21    0.000050
90            sector_sector 8    0.000041
91           sector_sector 80    0.000041
99           sector_sector 88    0.000011
36           sector_sector 27    0.000002

[114 rows x 2 columns]
Random Forest R^2: 0.9829


In [5]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split

# Load and preprocess the data
df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv').drop(columns=['store room','floor_category','balcony'])

# Feature engineering and encoding
df['agePossession'] = df['agePossession'].replace({
    'Relatively New':'new',
    'Moderately Old':'old',
    'New Property' : 'new',
    'Old Property' : 'old',
    'Under Construction' : 'under construction'
})
df['property_type'] = df['property_type'].replace({'flat':0,'house':1})
df['luxury_category'] = df['luxury_category'].replace({'Low':0,'Medium':1,'High':2})

# One-Hot Encoding for categorical variables
df = pd.get_dummies(df, columns=['sector', 'agePossession'], drop_first=True)

# Define features and target
X = df.drop(columns=['price'])
y = np.log1p(df['price'])  # Log-transform the target

# Scaling the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Polynomial features to allow non-linear interactions
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(X_scaled)

# Train a Ridge model
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_poly, y)

# Get the feature names from PolynomialFeatures
feature_names = poly.get_feature_names_out(X.columns)

# Extract coefficients from the Ridge model
coefficients = ridge_model.coef_

# Create a DataFrame to pair features with their coefficients
coef_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# Sort by coefficient value (optional)
coef_df = coef_df.sort_values(by='Coefficient', ascending=False)

# Display the DataFrame
print(coef_df.reset_index(drop=True))


                               Feature  Coefficient
0                        built_up_area     0.284886
1       built_up_area sector_sector 31     0.063659
2                              bedRoom     0.062856
3                             bathroom     0.053081
4       built_up_area sector_sector 36     0.050758
...                                ...          ...
6435          bedRoom sector_sector 54    -0.036697
6436   property_type sector_sector 111    -0.037452
6437    built_up_area sector_sector 93    -0.051150
6438  luxury_category sector_sector 55    -0.068057
6439    built_up_area sector_sector 55    -0.104218

[6440 rows x 2 columns]


In [6]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# Load and preprocess the data
df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv').drop(columns=['store room', 'floor_category', 'balcony'])

# Feature engineering and encoding
df['agePossession'] = df['agePossession'].replace({
    'Relatively New': 'new',
    'Moderately Old': 'old',
    'New Property': 'new',
    'Old Property': 'old',
    'Under Construction': 'under construction'
})
df['property_type'] = df['property_type'].replace({'flat': 0, 'house': 1})
df['luxury_category'] = df['luxury_category'].replace({'Low': 0, 'Medium': 1, 'High': 2})

# One-Hot Encoding for categorical variables
df = pd.get_dummies(df, columns=['sector', 'agePossession'], drop_first=True)

# Define features and target
X = df.drop(columns=['price'])
y = np.log1p(df['price'])  # Log-transform the target

# Scaling the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Polynomial features to capture non-linear interactions
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(X_scaled)

# Train the Ridge Regression model
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_poly, y)

# Extract feature names from PolynomialFeatures
feature_names = poly.get_feature_names_out(X.columns)

# Extract coefficients from the trained model
coefficients = ridge_model.coef_

# Create a DataFrame to pair features with their coefficients
coef_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# Sort the coefficients by absolute value (optional)
coef_df['abs_Coefficient'] = coef_df['Coefficient'].abs()
coef_df = coef_df.sort_values(by='abs_Coefficient', ascending=False)

# Display the top 10 most impactful features (positive and negative)
top_positive = coef_df[coef_df['Coefficient'] > 0].head(10)
top_negative = coef_df[coef_df['Coefficient'] < 0].head(10)

# Print insights
print("Top 10 Positive Impact Features:")
print(top_positive[['Feature', 'Coefficient']])

print("\nTop 10 Negative Impact Features:")
print(top_negative[['Feature', 'Coefficient']])


Top 10 Positive Impact Features:
                             Feature  Coefficient
3                      built_up_area     0.284886
482   built_up_area sector_sector 31     0.063659
1                            bedRoom     0.062856
2                           bathroom     0.053081
484   built_up_area sector_sector 36     0.050758
394        bathroom sector_sector 54     0.044417
405       bathroom sector_sector 63a     0.043926
185   property_type sector_sector 65     0.039906
326         bedRoom sector_sector 90     0.039281
458  built_up_area sector_sector 107     0.038742

Top 10 Negative Impact Features:
                              Feature  Coefficient
504    built_up_area sector_sector 55    -0.104218
825  luxury_category sector_sector 55    -0.068057
548    built_up_area sector_sector 93    -0.051150
133   property_type sector_sector 111    -0.037452
284          bedRoom sector_sector 54    -0.036697
445                   built_up_area^2    -0.033774
269          bedRoom secto

In [7]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

# Load and preprocess the data
df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv').drop(columns=['store room', 'floor_category', 'balcony'])

# Feature engineering and encoding
df['agePossession'] = df['agePossession'].replace({
    'Relatively New': 'new',
    'Moderately Old': 'old',
    'New Property': 'new',
    'Old Property': 'old',
    'Under Construction': 'under construction'
})
df['property_type'] = df['property_type'].replace({'flat': 0, 'house': 1})
df['luxury_category'] = df['luxury_category'].replace({'Low': 0, 'Medium': 1, 'High': 2})

# One-Hot Encoding for categorical variables
df = pd.get_dummies(df, columns=['sector', 'agePossession'], drop_first=True)

# Define features and target
X = df.drop(columns=['price'])
y = np.log1p(df['price'])  # Log-transform the target

# Scaling the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train the Ridge Regression model without polynomial expansion
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_scaled, y)

# Extract feature names
feature_names = X.columns

# Extract coefficients from the trained model
coefficients = ridge_model.coef_

# Create a DataFrame to pair features with their coefficients
coef_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# Sort the coefficients by absolute value (optional)
coef_df['abs_Coefficient'] = coef_df['Coefficient'].abs()
coef_df = coef_df.sort_values(by='abs_Coefficient', ascending=False)

# Display the top 10 most impactful features (positive and negative)
top_positive = coef_df[coef_df['Coefficient'] > 0].head(10)
top_negative = coef_df[coef_df['Coefficient'] < 0].head(10)

# Print insights
print("Top 10 Positive Impact Features:")
print(top_positive[['Feature', 'Coefficient']])

print("\nTop 10 Negative Impact Features:")
print(top_negative[['Feature', 'Coefficient']])


Top 10 Positive Impact Features:
             Feature  Coefficient
3      built_up_area     0.210568
0      property_type     0.120132
35  sector_sector 26     0.072540
57  sector_sector 50     0.070160
73  sector_sector 65     0.069083
2           bathroom     0.065102
50  sector_sector 43     0.058758
34  sector_sector 25     0.056264
1            bedRoom     0.054062
54  sector_sector 48     0.052589

Top 10 Negative Impact Features:
               Feature  Coefficient
109  sector_sohna road    -0.032136
105   sector_sector 92    -0.030629
107   sector_sector 95    -0.026769
8       sector_manesar    -0.023883
47     sector_sector 4    -0.022176
24    sector_sector 12    -0.020642
14   sector_sector 105    -0.020330
56     sector_sector 5    -0.018188
67     sector_sector 6    -0.016447
19    sector_sector 11    -0.015807
