In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# 1. Load your dataset
# Replace 'your_data.csv' with the actual file name
df = pd.read_csv('data/train_input.csv') 

# Assuming your price column is named 'SalePrice'
TARGET_COLUMN = 'Sale Price' 
N_FEATURES_TO_SELECT = 10 

# Separate features (X) and target (y)
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

# Identify Numerical and Categorical Columns
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include='object').columns.tolist()

# 2. Create Preprocessing Pipelines for Robustness
# We must handle missing values and encode categorical data BEFORE feature selection.

# Pipeline for Numerical Features: Impute (fill NaN) then Scale
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Pipeline for Categorical Features: Impute (fill NaN) then One-Hot Encode
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')) # 'handle_unknown' prevents errors on new categories
])

# Create a Column Transformer to apply pipelines to the correct columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ],
    remainder='passthrough',
    # Keep any other columns untouched (if any)
)

# 3. Apply the Preprocessing
X_processed = preprocessor.fit_transform(X)

X_processed = X_processed.toarray()
# Get the feature names after one-hot encoding
feature_names = (
    numerical_features + 
    list(preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features))
)

# Convert back to DataFrame for easier handling
X_processed_df = pd.DataFrame(X_processed, columns=feature_names)

In [None]:
# 4. Train a Random Forest Model
# A robust ensemble model is ideal for generating feature importance.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_processed_df, y)

# 5. Extract Feature Importances
importances = model.feature_importances_

# Create a Series for clear ranking
feature_importance_df = pd.Series(importances, index=X_processed_df.columns)

# 6. Select the Top N Features
top_features = feature_importance_df.nlargest(N_FEATURES_TO_SELECT)

print(f"--- Top {N_FEATURES_TO_SELECT} Most Important Features for Price Prediction ---")
print(top_features)

# Final list of features (un-encoded names may need manual review if they are 'one-hot' parts)
final_feature_list = top_features.index.tolist()

print("\nFinal List of Top Features:")
print(final_feature_list)

--- Top 10 Most Important Features for Price Prediction ---
Overall Qual        0.597022
Gr Liv Area         0.075489
Garage Cars         0.058234
Garage Area         0.030621
1st Flr SF          0.029067
Total Bsmt SF       0.028809
Lot Area            0.014023
BsmtFin SF 1        0.012783
Full Bath           0.009657
year_since_remod    0.007524
dtype: float64

Final List of Top Features:
['Overall Qual', 'Gr Liv Area', 'Garage Cars', 'Garage Area', '1st Flr SF', 'Total Bsmt SF', 'Lot Area', 'BsmtFin SF 1', 'Full Bath', 'year_since_remod']


: 

In [None]:
df2 = df[final_feature_list]

: 

In [None]:
df2

Unnamed: 0,Overall Qual,Gr Liv Area,Garage Cars,Garage Area,1st Flr SF,Total Bsmt SF,Lot Area,BsmtFin SF 1,Full Bath,year_since_remod
0,5,7.098376,2.0,504.0,7.098376,1209.0,9900,6.966024,1,44
1,5,7.102499,1.0,318.0,7.102499,1214.0,10355,6.545350,2,40
2,3,6.593045,0.0,0.0,6.593045,270.0,4130,0.000000,1,5
3,5,7.085064,2.0,501.0,7.085064,1153.0,13110,6.870053,2,34
4,6,7.064759,2.0,402.0,7.064759,1160.0,8076,6.559615,2,12
...,...,...,...,...,...,...,...,...,...,...
2339,6,7.201171,1.0,440.0,6.705639,780.0,6430,6.660575,1,56
2340,8,7.935587,3.0,810.0,7.307202,1462.0,15138,6.536692,2,13
2341,5,6.740519,1.0,264.0,6.740519,845.0,6600,6.459904,1,3
2342,5,7.117206,2.0,490.0,7.117206,1232.0,9600,6.549651,1,44


: 

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Assuming feature_names is the list of column names after OHE
# Assuming importances is the model.feature_importances_ array

grouped_importance = {}

for feature_name, score in feature_importance_df.items():
    # Example: Group all columns that start with 'Category_'
    if feature_name.startswith('Category_'):
        group_key = 'Original_Category'
        grouped_importance[group_key] = grouped_importance.get(group_key, 0) + score
    else:
        # Keep non-OHE (numeric) features as they are
        grouped_importance[feature_name] = score

# The value for 'Original_Category' is its total importance.

: 

In [10]:
x1_processed_df = X_processed_df

In [11]:
x1_processed_df['Sale Price'] = df['Sale Price']

In [12]:
all_columns = ['Order', 'PID', 'MS SubClass', 'Lot Frontage', 'Lot Area', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt', 'Garage Cars', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Mo Sold', 'Yr Sold', 'Age', 'year_since_remod', 'MS Zoning_A (agr)', 'MS Zoning_C (all)', 'MS Zoning_FV', 'MS Zoning_I (all)', 'MS Zoning_RH', 'MS Zoning_RL', 'MS Zoning_RM', 'Street_Grvl', 'Street_Pave', 'Lot Shape_IR1', 'Lot Shape_IR2', 'Lot Shape_IR3', 'Lot Shape_Reg', 'Land Contour_Bnk', 'Land Contour_HLS', 'Land Contour_Low', 'Land Contour_Lvl', 'Utilities_AllPub', 'Utilities_NoSeWa', 'Utilities_NoSewr', 'Lot Config_Corner', 'Lot Config_CulDSac', 'Lot Config_FR2', 'Lot Config_FR3', 'Lot Config_Inside', 'Land Slope_Gtl', 'Land Slope_Mod', 'Land Slope_Sev', 'Neighborhood_Blmngtn', 'Neighborhood_Blueste', 'Neighborhood_BrDale', 'Neighborhood_BrkSide', 'Neighborhood_ClearCr', 'Neighborhood_CollgCr', 'Neighborhood_Crawfor', 'Neighborhood_Edwards', 'Neighborhood_Gilbert', 'Neighborhood_Greens', 'Neighborhood_GrnHill', 'Neighborhood_IDOTRR', 'Neighborhood_MeadowV', 'Neighborhood_Mitchel', 'Neighborhood_NAmes', 'Neighborhood_NPkVill', 'Neighborhood_NWAmes', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_OldTown', 'Neighborhood_SWISU', 'Neighborhood_Sawyer', 'Neighborhood_SawyerW', 'Neighborhood_Somerst', 'Neighborhood_StoneBr', 'Neighborhood_Timber', 'Neighborhood_Veenker', 'Condition 1_Artery', 'Condition 1_Feedr', 'Condition 1_Norm', 'Condition 1_PosA', 'Condition 1_PosN', 'Condition 1_RRAe', 'Condition 1_RRAn', 'Condition 1_RRNe', 'Condition 1_RRNn', 'Condition 2_Artery', 'Condition 2_Feedr', 'Condition 2_Norm', 'Condition 2_PosA', 'Condition 2_PosN', 'Condition 2_RRNn', 'Bldg Type_1Fam', 'Bldg Type_2fmCon', 'Bldg Type_Duplex', 'Bldg Type_Twnhs', 'Bldg Type_TwnhsE', 'House Style_1.5Fin', 'House Style_1.5Unf', 'House Style_1Story', 'House Style_2.5Fin', 'House Style_2.5Unf', 'House Style_2Story', 'House Style_SFoyer', 'House Style_SLvl', 'Roof Style_Flat', 'Roof Style_Gable', 'Roof Style_Gambrel', 'Roof Style_Hip', 'Roof Style_Mansard', 'Roof Style_Shed', 'Roof Matl_ClyTile', 'Roof Matl_CompShg', 'Roof Matl_Membran', 'Roof Matl_Metal', 'Roof Matl_Roll', 'Roof Matl_Tar&Grv', 'Roof Matl_WdShake', 'Roof Matl_WdShngl', 'Exterior 1st_AsbShng', 'Exterior 1st_AsphShn', 'Exterior 1st_BrkComm', 'Exterior 1st_BrkFace', 'Exterior 1st_CBlock', 'Exterior 1st_CemntBd', 'Exterior 1st_HdBoard', 'Exterior 1st_MetalSd', 'Exterior 1st_Plywood', 'Exterior 1st_PreCast', 'Exterior 1st_Stone', 'Exterior 1st_Stucco', 'Exterior 1st_VinylSd', 'Exterior 1st_Wd Sdng', 'Exterior 1st_WdShing', 'Exterior 2nd_AsbShng', 'Exterior 2nd_AsphShn', 'Exterior 2nd_Brk Cmn', 'Exterior 2nd_BrkFace', 'Exterior 2nd_CBlock', 'Exterior 2nd_CmentBd', 'Exterior 2nd_HdBoard', 'Exterior 2nd_ImStucc', 'Exterior 2nd_MetalSd', 'Exterior 2nd_Other', 'Exterior 2nd_Plywood', 'Exterior 2nd_PreCast', 'Exterior 2nd_Stone', 'Exterior 2nd_Stucco', 'Exterior 2nd_VinylSd', 'Exterior 2nd_Wd Sdng', 'Exterior 2nd_Wd Shng', 'Mas Vnr Type_BrkCmn', 'Mas Vnr Type_BrkFace', 'Mas Vnr Type_CBlock', 'Mas Vnr Type_Stone', 'Exter Qual_Ex', 'Exter Qual_Fa', 'Exter Qual_Gd', 'Exter Qual_TA', 'Exter Cond_Ex', 'Exter Cond_Fa', 'Exter Cond_Gd', 'Exter Cond_Po', 'Exter Cond_TA', 'Foundation_BrkTil', 'Foundation_CBlock', 'Foundation_PConc', 'Foundation_Slab', 'Foundation_Stone', 'Foundation_Wood', 'Bsmt Qual_Ex', 'Bsmt Qual_Fa', 'Bsmt Qual_Gd', 'Bsmt Qual_Po', 'Bsmt Qual_TA', 'Bsmt Cond_Ex', 'Bsmt Cond_Fa', 'Bsmt Cond_Gd', 'Bsmt Cond_Po', 'Bsmt Cond_TA', 'Bsmt Exposure_Av', 'Bsmt Exposure_Gd', 'Bsmt Exposure_Mn', 'Bsmt Exposure_No', 'BsmtFin Type 1_ALQ', 'BsmtFin Type 1_BLQ', 'BsmtFin Type 1_GLQ', 'BsmtFin Type 1_LwQ', 'BsmtFin Type 1_Rec', 'BsmtFin Type 1_Unf', 'BsmtFin Type 2_ALQ', 'BsmtFin Type 2_BLQ', 'BsmtFin Type 2_GLQ', 'BsmtFin Type 2_LwQ', 'BsmtFin Type 2_Rec', 'BsmtFin Type 2_Unf', 'Heating_Floor', 'Heating_GasA', 'Heating_GasW', 'Heating_Grav', 'Heating_OthW', 'Heating_Wall', 'Heating QC_Ex', 'Heating QC_Fa', 'Heating QC_Gd', 'Heating QC_Po', 'Heating QC_TA', 'Central Air_N', 'Central Air_Y', 'Electrical_FuseA', 'Electrical_FuseF', 'Electrical_FuseP', 'Electrical_Mix', 'Electrical_SBrkr', 'Kitchen Qual_Ex', 'Kitchen Qual_Fa', 'Kitchen Qual_Gd', 'Kitchen Qual_TA', 'Functional_Maj1', 'Functional_Maj2', 'Functional_Min1', 'Functional_Min2', 'Functional_Mod', 'Functional_Sal', 'Functional_Sev', 'Functional_Typ', 'Fireplace Qu_Ex', 'Fireplace Qu_Fa', 'Fireplace Qu_Gd', 'Fireplace Qu_Po', 'Fireplace Qu_TA', 'Garage Type_2Types', 'Garage Type_Attchd', 'Garage Type_Basment', 'Garage Type_BuiltIn', 'Garage Type_CarPort', 'Garage Type_Detchd', 'Garage Finish_Fin', 'Garage Finish_RFn', 'Garage Finish_Unf', 'Garage Qual_Ex', 'Garage Qual_Fa', 'Garage Qual_Gd', 'Garage Qual_Po', 'Garage Qual_TA', 'Garage Cond_Ex', 'Garage Cond_Fa', 'Garage Cond_Gd', 'Garage Cond_Po', 'Garage Cond_TA', 'Paved Drive_N', 'Paved Drive_P', 'Paved Drive_Y', 'Sale Type_COD', 'Sale Type_CWD', 'Sale Type_Con', 'Sale Type_ConLD', 'Sale Type_ConLI', 'Sale Type_ConLw', 'Sale Type_New', 'Sale Type_Oth', 'Sale Type_VWD', 'Sale Type_WD ', 'Sale Condition_Abnorml', 'Sale Condition_AdjLand', 'Sale Condition_Alloca', 'Sale Condition_Family', 'Sale Condition_Normal', 'Sale Condition_Partial']


feature_groups = {}
numeric_features = []

for col in all_columns:
    if '_' in col:
        # It's an OHE column. Group it by the original feature name (before the first underscore).
        original_feature = col.split('_')[0]
        
        # NOTE: Some features have underscores in their values (e.g., 'Roof Matl_Tar&Grv' or 'Sale Type_WD ').
        # We need to ensure we capture the full original name, not just the part before the first underscore.
        # A more robust approach for this specific dataset: 
        # Find the first capital letter that follows a space, or simply rely on the first segment.
        
        # Simple split on the first underscore (usually works for Ames Housing dataset)
        original_feature = col.split('_', 1)[0]
        
        # Handle cases where the original name has an underscore (like 'BsmtFin Type 1')
        # We'll rely on the simple split for this code to stay clean, assuming standard OHE practices.

        if original_feature not in feature_groups:
            feature_groups[original_feature] = []
        feature_groups[original_feature].append(col)
    else:
        # It's a numeric/ordinal feature. Treat it as its own group.
        # Add it to the dictionary where the key is the feature name and the value is a list containing itself.
        feature_groups[col] = [col]


# Optional: Verify the grouping by printing the dictionary keys (the feature names)
print("--- Grouped Feature Names (Keys) ---")
print(feature_groups.keys())

print("\n--- Example Group (MS Zoning) ---")
print(feature_groups.get('MS Zoning'))

# Now, use this 'feature_groups' dictionary in the PermutationImportance step!
# perm_imp = PermutationImportance(model, ..., feature_names=feature_groups).fit(X_test, y_test)


--- Grouped Feature Names (Keys) ---
dict_keys(['Order', 'PID', 'MS SubClass', 'Lot Frontage', 'Lot Area', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt', 'Garage Cars', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Mo Sold', 'Yr Sold', 'Age', 'year', 'MS Zoning', 'Street', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC', 'Central Air', 'Electrical', 'Kitchen Qual', 'Functiona

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from eli5.sklearn import PermutationImportance
import eli5
from IPython.display import display # For displaying the results in a notebook

# ----------------------------------------------------------------------
# ASSUMPTION: Replace this with your actual DataFrame loading and cleaning
# ----------------------------------------------------------------------
# Since you didn't provide the DataFrame, we'll create a synthetic one 
# using a small subset of your features for the code to be runnable.

# Sample features from your list
sample_features = ['Lot Area', 'Overall Qual', 'MS Zoning_RL', 'MS Zoning_RM', 
                   'Neighborhood_CollgCr', 'Neighborhood_OldTown']
target_col = 'Sale Price'
num_rows = 500

# Create a synthetic DataFrame (REPLACE THIS WITH YOUR ACTUAL DF)

df = x1_processed_df

# Ensure the columns in X match your full list of columns exactly for real data
X = df.drop(target_col, axis=1) 
y = df[target_col]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# ----------------------------------------------------------------------
# 1. Feature Grouping (Using the same logic as the previous response)
#    This step must be run on the actual columns of X_train/X_test
# ----------------------------------------------------------------------

feature_groups = {}
all_columns = X.columns.tolist()

for col in all_columns:
    if '_' in col:
        # Group by the original feature name (before the first underscore)
        # Using maxsplit=1 to handle names with underscores in the category value
        original_feature = col.split('_', 1)[0]
        
        # NOTE: If your original feature name had spaces (e.g., 'MS Zoning'), 
        # this will correctly group all 'MS Zoning_' columns.

        if original_feature not in feature_groups:
            feature_groups[original_feature] = []
        feature_groups[original_feature].append(col)
    else:
        # Numeric/Ordinal features are groups of one
        feature_groups[col] = [col]

# ----------------------------------------------------------------------
# 2. Train Model and Calculate Grouped Permutation Importance
# ----------------------------------------------------------------------

# Initialize and train a robust model (Random Forest is common for importance)
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

print(f"Model R^2 on test set: {model.score(X_test, y_test):.4f}\n")

# Initialize Permutation Importance object with the feature_groups
perm_imp = PermutationImportance(
    model, 
    random_state=42, 
    # n_iter=10 is the default. You might increase this for more stable results (e.g., 50)
).fit(X_test, y_test)

# ----------------------------------------------------------------------
# 3. Display Results
# ----------------------------------------------------------------------

print("--- Grouped Permutation Feature Importance ---")

# Use eli5.show_weights to display the grouped results clearly
# The 'weight' shows the total drop in model performance (R2 score in this case) 
# when the feature group is shuffled.

# Display the report using the feature_groups dictionary
html_report = eli5.show_weights(
    perm_imp, 
    feature_names=feature_groups,
    target_names=[target_col] # Optional: specify the target name
)

display(html_report) # Displays the interactive HTML table in a notebook/VS Code

Model R^2 on test set: 0.8750

--- Grouped Permutation Feature Importance ---


ValueError: feature_names has a wrong length: expected=282, got=74