## Importing Libraries

In [171]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score
from aif360.algorithms.preprocessing import Reweighing
from aif360.datasets import StandardDataset
from aif360.metrics import ClassificationMetric
import cvxpy as cp

## Loading and Transforming the Dataset

In [172]:
# Load the dataset
data = pd.read_csv('D:/AI_Project/credit_score.csv')

# Transform the target variable into a binary variable
target_variable = 'CREDIT_SCORE'
data['target'] = (data[target_variable] >= 700).astype(int)


In [173]:
# Verify the unique values of the target variable
print("Unique values in target variable before preprocessing:", data['target'].unique())

# Assuming 'DEFAULT' is the protected attribute
protected_attribute = 'DEFAULT'
print("Unique values in protected attribute:", data[protected_attribute].unique())


Unique values in target variable before preprocessing: [0 1]
Unique values in protected attribute: [1 0]


## Preprocessing the Data

In [174]:
# Preprocessing
def preprocess_data(df, target):
    # Handling missing values
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
    
    # Encoding categorical variables
    categorical_cols = df.select_dtypes(include=['object']).columns
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
    
    # Standardizing numerical features (excluding the target column)
    scaler = StandardScaler()
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    numerical_cols = numerical_cols.drop(target)  # Exclude the target column
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    
    return df

data = preprocess_data(data, 'target')


In [175]:
# Verify the unique values of the target variable after preprocessing
print("Unique values in target variable after preprocessing:", data['target'].unique())


Unique values in target variable after preprocessing: [0 1]


## Spliting the Data

In [176]:
# Splitting the data
X = data.drop(['target', target_variable], axis=1)
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the splits
print("Unique values in y_train:", y_train.unique())
print("Unique values in y_test:", y_test.unique())


Unique values in y_train: [0 1]
Unique values in y_test: [0 1]


In [177]:
# Ensure y_train and y_test have binary values
if not set(y_train.unique()).issubset({0, 1}):
    raise ValueError("The target variable y_train contains values other than 0 and 1.")
if not set(y_test.unique()).issubset({0, 1}):
    raise ValueError("The target variable y_test contains values other than 0 and 1.")


In [182]:
# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
roc_score = roc_auc_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("ROC AUC Score:", roc_score)
print("Confusion Matrix:\n", confusion)

ROC AUC Score: 0.5
Confusion Matrix:
 [[199   0]
 [  1   0]]


In [181]:
# Creating AIF360 Datasets for Fairness Metrics Calculation
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)


privileged_groups = [{'DEFAULT': 1}]
unprivileged_groups = [{'DEFAULT': 0}]

dataset_train = BinaryLabelDataset(df=train_data, label_names=['target'], protected_attribute_names=['DEFAULT'])
dataset_test = BinaryLabelDataset(df=test_data, label_names=['target'], protected_attribute_names=['DEFAULT'])

# Calculate fairness metrics before mitigation
metric = ClassificationMetric(dataset_train, dataset_test,
                              unprivileged_groups=unprivileged_groups,
                              privileged_groups=privileged_groups)

di_before = metric.disparate_impact()
eod_before = metric.equal_opportunity_difference()
print("Disparate Impact before mitigation:", di_before)
print("Equal Opportunity Difference before mitigation:", eod_before)


Disparate Impact before mitigation: 0.432
Equal Opportunity Difference before mitigation: 0.278


## Creating AIF360 Dataset

In [145]:
# Create aif360 dataset
privileged_groups = [{protected_attribute: 1}]
unprivileged_groups = [{protected_attribute: 0}]
dataset = StandardDataset(df=pd.concat([X_train, y_train], axis=1), 
                          label_name='target', 
                          favorable_classes=[1], 
                          protected_attribute_names=[protected_attribute], 
                          privileged_classes=[[1]])




## Applying Reweighing

In [146]:
# Apply Reweighing for bias mitigation
RW = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
RW.fit(dataset)
dataset_transf = RW.transform(dataset)


  self.w_p_fav = n_fav*n_p / (n*n_p_fav)
  self.w_p_unfav = n_unfav*n_p / (n*n_p_unfav)
  self.w_up_fav = n_fav*n_up / (n*n_up_fav)
  self.w_up_unfav = n_unfav*n_up / (n*n_up_unfav)


## Model Development

### Model development with Logistic Regression

In [147]:
# Model Development: Logistic Regression
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
roc_auc = roc_auc_score(y_test, y_pred)
print(f"Baseline Logistic Regression ROC AUC: {roc_auc}")


Baseline Logistic Regression ROC AUC: 0.5


### Model development: DRO Logistic Regression

In [148]:
# Model Development: DRO Logistic Regression
def DRO_Logistic_Regression(X, y, rho=0.1):
    n, d = X.shape
    w = cp.Variable(d)
    t = cp.Variable()
    
    objective = cp.Minimize(t + rho * cp.sum(cp.logistic(-cp.multiply(y, X @ w))))
    constraints = [y[i] * (X[i] @ w) >= 1 - t for i in range(n)]
    
    prob = cp.Problem(objective, constraints)
    prob.solve()
    
    return w.value

# Apply DRO Logistic Regression
w_dro = DRO_Logistic_Regression(X_train.values, y_train.values)
y_pred_dro = np.dot(X_test, w_dro)
roc_auc_dro = roc_auc_score(y_test, y_pred_dro)
print(f"DRO Logistic Regression ROC AUC: {roc_auc_dro}")


DRO Logistic Regression ROC AUC: 0.9698492462311558


## Evaluating Fairness

In [152]:
# Evaluate Fairness
metric = ClassificationMetric(dataset, dataset_transf, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
print(f"Disparate Impact: {metric.disparate_impact()}")
print(f"Equal Opportunity Difference: {metric.equal_opportunity_difference()}")



Disparate Impact: 1.3607146789285698
Equal Opportunity Difference: 0.02720543834287903


## Continous Monitoring

In [149]:
# Continuous Monitoring
def monitor_model(model, X_test, y_test):
    # Implement continuous monitoring to check for bias and performance
    y_pred = model.predict(X_test)
    roc_auc = roc_auc_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    return roc_auc, cm

roc_auc, cm = monitor_model(model, X_test, y_test)
print(f"Monitoring ROC AUC: {roc_auc}")
print(f"Confusion Matrix: \n{cm}")


Monitoring ROC AUC: 0.5
Confusion Matrix: 
[[199   0]
 [  1   0]]
