In [12]:
import pandas as pd

# Load the engineered training set.
train = pd.read_csv("Engineered/train_with_engineered_features_Refined.csv")

# Version 1: rank numeric features by the strength of their Pearson
# correlation with SalePrice.
corr_matrix = train.corr(numeric_only=True)

# Rank by |r| rather than signed r: a strongly *negative* correlation is just
# as informative for a linear model as a strongly positive one, and the
# multicollinearity check below already compares absolute values.
# Columns whose correlation is NaN (e.g. constant columns) are dropped so they
# can never be selected as filler. The Series keeps its signed values.
saleprice_corr = (
    corr_matrix['SalePrice']
    .drop('SalePrice')
    .dropna()
    .sort_values(key=lambda r: r.abs(), ascending=False)
)

# Greedy selection with multicollinearity check:
# walk the candidates from strongest to weakest and keep one only if it is not
# too correlated (|r| >= threshold) with any feature already kept.
n_top_features = 10
correlation_threshold = 0.8

top10_corr_features = []
for feature in saleprice_corr.index:
    if len(top10_corr_features) >= n_top_features:
        break
    too_similar = any(
        abs(corr_matrix.loc[feature, selected]) >= correlation_threshold
        for selected in top10_corr_features
    )
    if not too_similar:
        top10_corr_features.append(feature)

# Show final selected features
print("Selected top10_corr_features (filtered for multicollinearity):")
for f in top10_corr_features:
    print(f)


Selected top10_corr_features (filtered for multicollinearity):
OverallQual
GrLivArea
GarageCars
TotalBsmtSF
FullBath
YearBuilt
YearRemodAdd
MasVnrArea
Fireplaces
OpenPorchSF_log


In [5]:
# Build the Pearson-based ("linear ready") training set: the features chosen by
# the correlation/multicollinearity selection plus the SalePrice target.
# NOTE(review): this cell used to read `selected_features`, a name that is not
# defined anywhere in the visible notebook, so it failed on a fresh kernel.
# It now uses `top10_corr_features`, mirroring how the XGBoost dataset is built
# from `top10_xgb_features` — confirm this is the intended feature list.
selected_features_with_target = top10_corr_features + ['SalePrice']

# Create the final dataset (column subset of the engineered training frame)
train_linear_ready = train[selected_features_with_target]

# Save to CSV without the row index so the file holds only features + target
train_linear_ready.to_csv("Engineered/train_linear_ready.csv", index=False)
print("Saved: Engineered/train_linear_ready.csv")


Saved: Engineered/train_linear_ready.csv


In [13]:
# Version 2: pick the ten features an XGBoost regressor considers most important.
# The model uses library defaults apart from n_estimators=100 (kept small for
# speed) and a fixed random_state.
from xgboost import XGBRegressor

# Re-read the engineered dataset so this cell does not depend on earlier state
train = pd.read_csv("Engineered/train_with_engineered_features_Refined.csv")

# Target vector and feature matrix; only int64/float64 columns are kept so
# XGBoost does not raise a dtype error on non-numeric columns
y = train["SalePrice"]
X = train.drop(columns="SalePrice")
X = X.select_dtypes(include=["int64", "float64"])

# Fit the regressor on the full training frame
model = XGBRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

# Pair each column with its importance score, most important first
importance_df = pd.DataFrame(
    {"feature": X.columns, "importance": model.feature_importances_}
)
importance_df = importance_df.sort_values(by="importance", ascending=False)

# Names of the ten highest-ranked features, and the matching feature matrix
top10_xgb_features = importance_df["feature"].head(10).tolist()
X_v2 = X[top10_xgb_features]

# Report the selection, one feature per line
print("top10_xgb_features:")
for f in top10_xgb_features:
    print(f)



top10_xgb_features:
OverallQual
GarageCars
KitchenAbvGr
GrLivArea
FullBath
2ndFlrSF
TotalBsmtSF
1stFlrSF
Fireplaces
OpenPorchSF


In [17]:
# Columns for the XGBoost-based dataset: the ten selected features followed by
# the SalePrice target
xgb_features_with_target = [*top10_xgb_features, 'SalePrice']

# Subset the engineered training frame to those columns
train_xgb_ready = train.loc[:, xgb_features_with_target]

# Persist the subset (row index omitted) and confirm where it was written
train_xgb_ready.to_csv("Engineered/train_xgb_ready.csv", index=False)
print("Saved: Engineered/train_xgb_ready.csv")


Saved: Engineered/train_xgb_ready.csv


In [16]:
# Put the Pearson-based and XGBoost-based selections next to each other,
# one column per method, aligned by rank position
pearson_column = pd.Series(top10_corr_features)
xgboost_column = pd.Series(top10_xgb_features)

comparison_df = pd.DataFrame({
    "Top Pearson Features": pearson_column,
    "Top XGBoost Features": xgboost_column,
})

comparison_df


Unnamed: 0,Top Pearson Features,Top XGBoost Features
0,OverallQual,OverallQual
1,GrLivArea,GarageCars
2,GarageCars,KitchenAbvGr
3,TotalBsmtSF,GrLivArea
4,FullBath,FullBath
5,YearBuilt,2ndFlrSF
6,YearRemodAdd,TotalBsmtSF
7,MasVnrArea,1stFlrSF
8,Fireplaces,Fireplaces
9,OpenPorchSF_log,OpenPorchSF
