In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
import plotly.express as px
import plotly.graph_objects as go
import shap

def train_model(df):
    """Fit a preprocessing + random-forest pipeline that predicts ``median_house_value``.

    Numeric columns are standardised and ``ocean_proximity`` is one-hot encoded
    (first category dropped) before a 100-tree ``RandomForestRegressor`` is
    trained on an 80/20 train/test split (``random_state=42`` for both the
    split and the forest).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the eight numeric feature columns listed below,
        ``ocean_proximity`` and the target ``median_house_value``.

    Returns
    -------
    tuple
        ``(model, X_train, X_test)`` - the fitted pipeline and the
        untransformed feature frames of the two splits.
    """
    numeric_features = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
                        'total_bedrooms', 'population', 'households', 'median_income']
    categorical_features = ['ocean_proximity']

    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed
    # in 1.4; prefer the new keyword and fall back only on older releases.
    try:
        onehot = OneHotEncoder(drop='first', sparse_output=False)
    except TypeError:
        onehot = OneHotEncoder(drop='first', sparse=False)

    categorical_transformer = Pipeline(steps=[
        ('onehot', onehot)
    ])

    # The transformer names 'num' and 'cat' are looked up by the plotting helpers.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # The step names 'preprocessor' and 'regressor' are looked up by the plotting helpers.
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
    ])

    X = df[numeric_features + categorical_features]
    y = df['median_house_value']

    # The held-out targets are not part of the return value, so they are discarded.
    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    model.fit(X_train, y_train)

    return model, X_train, X_test

def plot_feature_importance(model, feature_names):
    """Build a bar chart of the regressor's feature importances.

    For a pipeline (any object exposing ``named_steps``) the bars are labelled
    with the names reported by the fitted 'num' and 'cat' transformers and
    ``feature_names`` is ignored; for a bare estimator ``feature_names`` is
    used as given.

    Returns the plotly figure.
    """
    if hasattr(model, 'named_steps'):
        estimator = model.named_steps['regressor']
        fitted = model.named_steps['preprocessor'].named_transformers_
        num_labels = fitted['num'].get_feature_names_out()
        cat_labels = fitted['cat'].get_feature_names_out()
        if not len(cat_labels) > 0:
            cat_labels = []
        labels = list(num_labels) + list(cat_labels)
    else:
        estimator = model
        labels = feature_names

    bars = go.Bar(
        x=labels,
        y=estimator.feature_importances_,
        name='Feature Importance',
    )

    layout = {
        'title': 'Feature Importance',
        'xaxis_title': 'Features',
        'yaxis_title': 'Importance Score',
        'xaxis_tickangle': -45,  # slanted tick labels stay readable
    }

    fig = go.Figure()
    fig.add_trace(bars)
    fig.update_layout(**layout)
    return fig

def plot_shap_values(model, X_sample, feature_names):
    """Build a horizontal waterfall chart of SHAP values for one instance.

    Only the first row of ``X_sample`` is plotted.

    Parameters
    ----------
    model
        Either a pipeline exposing ``named_steps`` with 'preprocessor' and
        'regressor' entries, or a bare tree-based estimator.
    X_sample
        Rows to explain. For a pipeline they are passed through the fitted
        preprocessor first; for a bare estimator they are used as given.
    feature_names
        Labels used only when ``model`` is a bare estimator. For a pipeline
        the labels come from the fitted 'num' and 'cat' transformers so that
        they line up one-to-one with the transformed columns.

    Returns
    -------
    The plotly figure.
    """
    if hasattr(model, 'named_steps'):
        rf_model = model.named_steps['regressor']
        preprocessor = model.named_steps['preprocessor']
        X_transformed = preprocessor.transform(X_sample)
        fitted = preprocessor.named_transformers_
        transformed_feature_names = (
            list(fitted['num'].get_feature_names_out())
            + list(fitted['cat'].get_feature_names_out())
        )
    else:
        rf_model = model
        X_transformed = X_sample
        transformed_feature_names = feature_names

    explainer = shap.TreeExplainer(rf_model)
    shap_values = explainer.shap_values(X_transformed)

    fig = go.Figure()
    fig.add_trace(go.Waterfall(
        name='SHAP values',
        orientation='h',
        # SHAP values are computed on the transformed matrix, so the labels
        # must be the transformed names; the raw input column names differ
        # in number once the categorical column is one-hot encoded.
        y=transformed_feature_names,
        x=shap_values[0],
        connector={'mode': 'spanning'}
    ))

    fig.update_layout(
        title='SHAP Values for Selected Instance',
        xaxis_title='Impact on prediction',
        showlegend=False
    )
    return fig

def _average_prediction_per_value(model, X, feature_name, values):
    """Return the mean of ``model.predict`` over X with ``feature_name`` forced to each value."""
    averages = []
    for value in values:
        X_modified = X.copy()  # work on a copy so the caller's frame is untouched
        X_modified[feature_name] = value
        averages.append(model.predict(X_modified).mean())
    return averages


def plot_partial_dependence(model, X, feature_name, num_points=50):
    """Build a partial dependence plot for one input feature.

    For every candidate value of ``feature_name`` the whole of ``X`` is
    predicted with that column overwritten by the value, and the mean
    prediction is plotted. Numeric features are swept over ``num_points``
    evenly spaced values between the column's min and max in ``X`` and drawn
    as a line; categorical features use the column's unique values in ``X``
    and are drawn as bars.

    Parameters
    ----------
    model
        Pipeline exposing ``named_steps['preprocessor']`` whose fitted
        ``transformers_`` list holds the numeric columns first and the
        categorical columns second.
    X : pandas.DataFrame
        Untransformed feature frame.
    feature_name
        Column to vary; must be one of the pipeline's numeric or categorical
        columns.
    num_points : int, default 50
        Grid size for numeric features.

    Returns
    -------
    The plotly figure.

    Raises
    ------
    ValueError
        If ``model`` has no ``named_steps`` or if ``feature_name`` is not one
        of the columns handled by the preprocessor.
    """
    if not hasattr(model, 'named_steps'):
        raise ValueError("Model must be a scikit-learn pipeline with preprocessor and regressor steps")

    preprocessor = model.named_steps['preprocessor']
    numeric_features = preprocessor.transformers_[0][2]
    categorical_features = preprocessor.transformers_[1][2]

    is_numeric = feature_name in numeric_features
    if not is_numeric and feature_name not in categorical_features:
        # Previously any unknown name fell through to the categorical branch,
        # which either failed with a bare KeyError or plotted a column the
        # model never uses.
        raise ValueError(
            f"Unknown feature {feature_name!r}; expected one of "
            f"{list(numeric_features) + list(categorical_features)}"
        )

    if is_numeric:
        feature_values = np.linspace(X[feature_name].min(), X[feature_name].max(), num_points)
    else:
        feature_values = X[feature_name].unique()

    predictions = _average_prediction_per_value(model, X, feature_name, feature_values)

    fig = go.Figure()
    if is_numeric:
        fig.add_trace(go.Scatter(
            x=feature_values,
            y=predictions,
            mode='lines',
            name='Partial Dependence'
        ))
    else:
        fig.add_trace(go.Bar(
            x=feature_values,
            y=predictions,
            name='Partial Dependence'
        ))

    fig.update_layout(
        title=f'Partial Dependence Plot for {feature_name}',
        xaxis_title=feature_name,
        yaxis_title='Predicted House Value',
        xaxis_tickangle=-45 if feature_name in categorical_features else 0
    )

    return fig

In [54]:
# Load the housing data and discard every row that has a missing value.
housing_df = pd.read_csv('housing.csv').dropna()

# JSON snapshot of the cleaned frame ('split' orientation, ISO-formatted dates).
initial_data = housing_df.to_json(orient='split', date_format='iso')

# Show the cleaned frame as the cell output.
housing_df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [55]:
# Fit the pipeline once; the held-out feature rows (X_test) feed the explanation plots.
model, X_train, X_test = train_model(housing_df)


# Importance of each transformed feature in the fitted forest. Because `model`
# is a pipeline, the bar labels come from the fitted preprocessor and
# X_test.columns only serves as the fallback argument.
# The SHAP and partial-dependence plots are rendered in their own cells below.
plot_feature_importance(model, X_test.columns)



`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.



In [56]:
# Explain the prediction for the first held-out row; iloc[0:1] keeps it a
# one-row DataFrame so the pipeline's preprocessor can transform it.
plot_shap_values(model, X_test.iloc[0:1], X_test.columns)

In [57]:
# Sweep housing_median_age across its range in X_test and plot the mean
# prediction at each value.
plot_partial_dependence(model, X_test, 'housing_median_age')