In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

In [4]:
df=pd.read_csv("C:\\Users\\Sarrang\\p2p\\treated.csv")

In [5]:
df.shape

(176563, 88)

In [6]:
df.columns.get_loc('ProbabilityOfDefault')

38

In [7]:
# Assuming df is your DataFrame
# You want to exclude columns 38 and 41

# Get all column indices
all_columns = list(range(df.shape[1]))

# Exclude columns 38 and 41 which are probability of default and rating
exclude_columns = [38, 41]
included_columns = [col_idx for col_idx in all_columns if col_idx not in exclude_columns]

# Select columns using iloc
# X= df.iloc[:, included_columns].values
X= df.iloc[:, included_columns]
df_idx=df.iloc[:, included_columns]

### experimenting with rating as dependent variable 

In [8]:
# y=df.iloc[:,38].values
y=df.iloc[:,38]

In [9]:
 # excluded 'ProbabilityOfDefault'
numerical=['BidsManual',
 'AppliedAmount',
 'Amount',
 'Interest',
 'MonthlyPayment',
 'OccupationArea',
 'HomeOwnershipType',
 'IncomeFromPrincipalEmployer',
 'IncomeFromPension',
 'IncomeFromFamilyAllowance',
 'IncomeFromSocialWelfare',
 'IncomeFromLeavePay',
 'IncomeFromChildSupport',
 'IncomeOther',
 'IncomeTotal',
 'LiabilitiesTotal',
 'DebtToIncome',
 'FreeCash',
 'LossGivenDefault',
 'ExpectedReturn',
 'PrincipalOverdueBySchedule',
 'PrincipalPaymentsMade',
 'InterestAndPenaltyPaymentsMade',
 'PrincipalBalance',
 'InterestAndPenaltyBalance',
 'NoOfPreviousLoansBeforeLoan',
 'AmountOfPreviousLoansBeforeLoan',
 'PreviousEarlyRepaymentsCountBeforeLoan',
 'ApplicationSignedHour',
 'ApplicationSignedWeekday',
 'LanguageCode',
 'Age',
 'LoanDuration',
 'UseOfLoan',
 'ExistingLiabilities',
 'RefinanceLiabilities',
 'MonthlyPaymentDay',
 'CreditScoreEsMicroL',
 'LoanWaitTime',
 'PaymentToLoanDateDiff',
 'LoanTerm']

In [10]:
nloc = []

for feature in numerical:
    try:
        idx = X.columns.get_loc(feature)
        nloc.append(idx)
    except ValueError:
        print(f"Column '{feature}' not found in X columns.")


In [11]:
# X=X.values
# y=y.values

## Splitting the dataset into the Training set and Test set

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

In [13]:
from sklearn.preprocessing import PowerTransformer

# Initialize PowerTransformer
power_transformer = PowerTransformer()

# Fit the transformer on training data and transform training data
X_train.loc[:, X_train.columns[nloc]] = power_transformer.fit_transform(X_train.loc[:, X_train.columns[nloc]])

# Transform testing data using the fitted transformer from training data
X_test.loc[:, X_test.columns[nloc]] = power_transformer.transform(X_test.loc[:, X_test.columns[nloc]])

In [13]:
from interpret import show
from interpret.glassbox import ExplainableBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Define the hyperparameter search space
param_dist = {
    'max_rounds': randint(50, 200),
    'learning_rate': uniform(0.001, 0.1),
    'max_bins': randint(50, 255),
    'max_interaction_bins': randint(5, 20),
    'interactions': randint(5, 20),
    'outer_bags': randint(5, 20),
    'inner_bags': randint(0, 10),
    'random_state': randint(0, 100)
}

# Instantiate the EBM regressor
ebm = ExplainableBoostingRegressor()

# Instantiate RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=ebm,
    param_distributions=param_dist,
    n_iter=100,  # Number of parameter settings that are sampled
    scoring='neg_mean_squared_error',  # Scoring metric
    cv=3,  # Cross-validation folds
    verbose=2,  # Controls the verbosity
    n_jobs=-1,  # Use all available processors
    random_state=42  # Random seed for reproducibility
)

# Perform hyperparameter tuning
random_search.fit(X_train, y_train)

# Get best parameters and best model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

print("Best Parameters:", best_params)
print("Best Model:", best_model)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best Parameters: {'inner_bags': 3, 'interactions': 17, 'learning_rate': 0.09795368671141591, 'max_bins': 155, 'max_interaction_bins': 18, 'max_rounds': 196, 'outer_bags': 5, 'random_state': 62}
Best Model: ExplainableBoostingRegressor(inner_bags=3, interactions=17,
                             learning_rate=0.09795368671141591, max_bins=155,
                             max_interaction_bins=18, max_rounds=196,
                             outer_bags=5, random_state=62)


In [37]:
from interpret import set_visualize_provider
from interpret.provider import InlineProvider
from interpret.glassbox import ExplainableBoostingRegressor
from interpret import show

# Set visualization provider
set_visualize_provider(InlineProvider())

ebr = ExplainableBoostingRegressor(feature_names=df_idx.columns)

# Fit the model
ebr.fit(X_train, y_train)

# Generate global explanation
global_explanation = ebr.explain_global()

# Show the global explanation
show(global_explanation)

In [20]:
y_pred=ebr.predict(X_test)

In [21]:
from sklearn.metrics import r2_score
print(r2_score(y_pred,y_test))

0.9101084197981096


In [22]:
from joblib import dump
dump(ebr,'p2p_ebr_reg.joblib')

['p2p_ebr_reg.joblib']

### ebm metrics and diagrams(feature importance, pdp, etc)

In [54]:
y_pred=ebm.predict(X_test)

In [55]:
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error

# Calculate and print R2 score
r2_score_value = r2_score(y_test, y_pred)
print("R2 Score:", r2_score_value)

# Calculate and print Mean Absolute Error (MAE)
mae_value = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae_value)

# Calculate and print Mean Squared Error (MSE)
mse_value = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse_value)

# Calculate and print Root Mean Squared Error (RMSE)
rmse_value = np.sqrt(mse_value)
print("Root Mean Squared Error (RMSE):", rmse_value)


R2 Score: 0.9168816252279852
Mean Absolute Error (MAE): 0.02380461611292736
Mean Squared Error (MSE): 0.001630333953254437
Root Mean Squared Error (RMSE): 0.04037739408696947


### to explain local instances

In [None]:
# Explain local feature importance for a specific instance
instance_idx = 0  # Change this index to the instance you want to explain
local_explanation = ebm.explain_local(X_test[instance_idx], y_test[instance_idx], name='EBM')

# Replace feature indices with feature names in local explanation
local_explanation.features = {feature_names[i]: val for i, val in local_explanation.features.items()}

# Show local explanation
show(local_explanation)

In [14]:
from joblib import load
ebm=load("C:\\Users\\Sarrang\\p2p\\models\\regression\\p2p_ebr_reg.joblib")

In [40]:
show(ebr.explain_local(X_test[:5], y_test[:5]), 0)

ImportError: cannot import name 'plot_partial_dependence' from 'sklearn.inspection' (C:\Users\Sarrang\anaconda3\Lib\site-packages\sklearn\inspection\__init__.py)