# In [None]:

# 📦 Import Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE, mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from scipy import stats

# Session-wide presentation settings: silence all Python warnings, render
# pandas floats with three decimals, and use seaborn's white-grid theme.
warnings.filterwarnings("ignore")
pd.options.display.float_format = lambda value: '%.3f' % value
sns.set(style="whitegrid")

# 📥 Load Dataset & Cleaning
# One pipeline: read the CSV, remove exact duplicate rows, keep only rows whose
# bedroom and bathroom counts are both positive, then discard 'id' and 'date'.
df = (
    pd.read_csv("/content/kc_house_data.csv")
    .drop_duplicates()
    .loc[lambda frame: (frame['bedrooms'] > 0) & (frame['bathrooms'] > 0)]
    .drop(columns=['id', 'date'])
)

# 🎯 Target and Features
# 'price' is the regression target; every other column is a candidate predictor.
y = df['price']
X = df.drop(columns=['price'])

# 🔢 Numerical Features (columns with a numeric dtype)
num_features = list(X.select_dtypes(include=np.number).columns)
print("Numerical features:", num_features)

# 🧠 Categorical Features (columns with object dtype)
cat_features = list(X.select_dtypes(include='object').columns)
print("Categorical features:", cat_features)

# 📈 Check Normality
# For each numerical feature: show its histogram with a KDE overlay, then run
# scipy.stats.normaltest and classify the column at the 5% significance level.
print("\n--- Checking Normality of Numerical Features ---")
for col in num_features:
    plt.figure(figsize=(6, 3))
    sns.histplot(X[col], kde=True)
    plt.title(f"Distribution of {col}")
    plt.show()

    # nan_policy='omit' ignores missing values; the default ('propagate')
    # yields a NaN p-value as soon as the column contains a single NaN.
    p = float(stats.normaltest(X[col], nan_policy='omit').pvalue)
    if np.isnan(p):
        # NaN fails the `p < 0.05` comparison, so without this branch an
        # untestable column would be mislabelled as 'Normal'.
        verdict = 'Undetermined'
    elif p < 0.05:
        verdict = 'Non-normal'
    else:
        verdict = 'Normal'
    print(f"{col}: p = {p:.3f} => {verdict}")

# NOTE: the line below is a fixed, hand-written statement; it is not computed
# from the p-values printed above.
print("\n✅ Numeric Features Conclusion: Most are not normally distributed (right-skewed)")

# 📊 Correlation with Price
# Correlate numeric columns only. The script allows for categorical (object)
# columns elsewhere, and since pandas 2.0 DataFrame.corr() raises on
# non-numeric columns instead of silently dropping them.
corr_matrix = df.select_dtypes(include=np.number).corr()

# Single-column heatmap: every numeric column's correlation with 'price',
# strongest positive correlation at the top.
plt.figure(figsize=(12, 8))
sns.heatmap(
    corr_matrix[['price']].sort_values(by='price', ascending=False),
    annot=True,
    cmap='coolwarm',
)
plt.title("Correlation with Price")
plt.show()

# 🌲 Feature Importance with Random Forest
# Fit on the numerical columns only, matching the decision-tree section:
# fitting on the full X would fail if the dataset contained object columns.
# When X has no categorical columns this is the same set of columns as X.
model_rf = RandomForestRegressor(random_state=42)
model_rf.fit(X[num_features], y)

# Importances labelled by feature name, largest first.
rf_importances = pd.Series(
    model_rf.feature_importances_, index=num_features
).sort_values(ascending=False)

plt.figure(figsize=(10, 6))
rf_importances.plot(kind='bar')
plt.title("Feature Importances from Random Forest")
plt.show()

# 🌳 Decision Tree Feature Importance
# Train one regression tree on the numerical columns, then rank those columns
# by the importance the fitted tree assigns to them (largest first).
dtree = DecisionTreeRegressor(random_state=0).fit(X[num_features], y)
dt_importances = pd.Series(dtree.feature_importances_, index=num_features)
dt_importances = dt_importances.sort_values(ascending=False)

plt.figure(figsize=(10, 5))
dt_importances.plot.bar(title="Decision Tree Feature Importances")
plt.show()

# 🔍 Recursive Feature Elimination (RFE)
# Standardise the numerical columns before ranking them with a linear model.
# Only numerical columns are used: StandardScaler cannot process object
# columns, so scaling the full X would fail if any were present.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[num_features])

# Eliminate features recursively with a linear regression until 10 remain.
estimator = LinearRegression()
selector = RFE(estimator, n_features_to_select=10)
selector = selector.fit(X_scaled, y)

# support_ is a boolean mask aligned with the columns passed to fit();
# index the Series by itself to keep only the selected feature names.
rfe_features = pd.Series(selector.support_, index=num_features)
selected_rfe = rfe_features[rfe_features].index.tolist()

print("\n✅ Selected features by RFE:")
print(selected_rfe)

# 🧠 Mutual Information - Categorical
# Only runs when the dataset has object-dtype columns; otherwise reports that
# there is nothing to analyse.
if cat_features:
    # One-hot encode, dropping the first level of each categorical column.
    X_encoded = pd.get_dummies(X[cat_features], drop_first=True)

    # The dummy columns are 0/1 indicators, so they are declared discrete:
    # with the default discrete_features='auto', dense input is treated as
    # continuous. random_state makes the estimate repeatable between runs.
    mi_cat = mutual_info_regression(
        X_encoded, y, discrete_features=True, random_state=42
    )
    mi_cat_series = pd.Series(mi_cat, index=X_encoded.columns).sort_values(ascending=False)

    plt.figure(figsize=(10, 5))
    mi_cat_series.plot(kind='bar')
    plt.title("Mutual Information - Categorical Features")
    plt.show()

    print("\n✅ Categorical Features Conclusion:")
    print("Top categorical features based on mutual information:")
    print(mi_cat_series.head())
else:
    print("\nNo categorical features found in the dataset.")

# 🔢 Mutual Information - Numerical
# Estimate the mutual information between each numerical feature and the
# target. random_state is fixed so the ranking is repeatable between runs,
# in line with the seeded tree models used elsewhere in this script.
mi_num = mutual_info_regression(X[num_features], y, random_state=42)
mi_num_series = pd.Series(mi_num, index=num_features).sort_values(ascending=False)

plt.figure(figsize=(10, 5))
mi_num_series.plot(kind='bar')
plt.title("Mutual Information - Numerical Features")
plt.show()

print("\n✅ Numerical Features Conclusion:")
print("Top numerical features based on mutual information:")
print(mi_num_series.head())

# 📌 Final Summary
# Recap: the five highest-ranked features from each tree-based model, followed
# by the list of features kept by RFE. Each heading and its value are printed
# on separate lines.
print("\n📌 Final Feature Selection Summary")
print("✔️ Top features from Random Forest:", rf_importances.head(), sep="\n")
print("\n✔️ Top features from Decision Tree:", dt_importances.head(), sep="\n")
print("\n✔️ Top features selected by RFE:", selected_rfe, sep="\n")
