In [12]:
import pandas as pd
# Load the COMPAS two-year scores CSV into a DataFrame.
# NOTE(review): the path is absolute ('/content/...'); adjust it if the file lives elsewhere.
compas_data = pd.read_csv(filepath_or_buffer='/content/compas-scores-two-years.csv')

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Predictor columns for the baseline model and the binary label column.
features = [
    'age', 'sex', 'race',
    'juv_fel_count', 'juv_misd_count', 'juv_other_count',
    'priors_count', 'c_charge_degree',
]
target = 'two_year_recid'

# The ColumnTransformer routes columns by name, so each group is listed explicitly.
numeric_features = ['age', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count']
categorical_features = ['sex', 'race', 'c_charge_degree']

X, y = compas_data[features], compas_data[target]

# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Numeric columns: median imputation only (no scaling is applied).
# Categorical columns: one-hot encoding; categories unseen at fit time are ignored.
preprocessor = ColumnTransformer(transformers=[
    ('num', SimpleImputer(strategy='median'), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', max_iter=1000)),
])

# Pipeline.fit returns the pipeline itself, so fitting and predicting can be chained.
y_pred = lr_pipeline.fit(X_train, y_train).predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred)

accuracy_lr


0.6929046563192904

In [14]:
# "ns" = no sensitive attributes: the baseline feature list without 'sex' and 'race'.
features_ns = ['age', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count', 'c_charge_degree']
X_ns = compas_data[features_ns]

# Reuse the rows of the baseline split instead of calling train_test_split again.
# Re-splitting with the same seed picks the same rows, but it also rebinds the shared
# y_train / y_test globals; slicing the existing split keeps this cell free of that
# side effect and guarantees both models are scored on exactly the same rows.
X_train_ns = X_train[features_ns]
X_test_ns = X_test[features_ns]

# Guard against a stale kernel in which the features and labels no longer line up.
assert X_train_ns.index.equals(y_train.index), "X_train_ns / y_train rows are misaligned"
assert X_test_ns.index.equals(y_test.index), "X_test_ns / y_test rows are misaligned"

# Same preprocessing as the baseline, restricted to the remaining columns.
preprocessor_ns = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['c_charge_degree'])
    ])

lr_pipeline_ns = Pipeline(steps=[('preprocessor', preprocessor_ns),
                                 ('classifier', LogisticRegression(solver='liblinear', max_iter=1000))])

lr_pipeline_ns.fit(X_train_ns, y_train)
y_pred_ns = lr_pipeline_ns.predict(X_test_ns)
accuracy_lr_ns = accuracy_score(y_test, y_pred_ns)

accuracy_lr_ns


0.6940133037694013

In [18]:
from sklearn.base import BaseEstimator, ClassifierMixin

class LogisticRegressionPR(BaseEstimator, ClassifierMixin):
    """Logistic regression wrapper intended to host a Prejudice Remover regularizer.

    NOTE: the prejudice-remover penalty is NOT implemented. ``fit`` trains a plain
    scikit-learn ``LogisticRegression`` with ``C = 1 / lambda_``. ``eta`` is stored
    on the instance but never read, so changing it cannot change the fitted model.

    The prediction methods assume a binary problem (a single row in ``coef_``).

    Parameters
    ----------
    eta : float, default=10.0
        Intended weight of the fairness penalty. Currently unused.
    lambda_ : float, default=1.0
        Regularization strength; its inverse is passed to ``LogisticRegression``
        as ``C``. Must be non-zero.
    solver : str, default='liblinear'
        Forwarded to ``LogisticRegression``.
    max_iter : int, default=1000
        Forwarded to ``LogisticRegression``.

    Attributes
    ----------
    classes_ : ndarray
        Class labels seen during ``fit``, copied from the inner estimator.
    coef_ : ndarray
        Coefficients copied from the inner estimator.
    intercept_ : ndarray
        Intercept copied from the inner estimator.
    """
    def __init__(self, eta=10.0, lambda_=1.0, solver='liblinear', max_iter=1000):
        self.eta = eta
        self.lambda_ = lambda_
        self.solver = solver
        self.max_iter = max_iter

    def fit(self, X, y):
        """Fit a plain logistic regression on (X, y) and copy its parameters.

        Returns
        -------
        self
        """
        # Placeholder training: only lambda_, solver and max_iter influence the fit.
        lr = LogisticRegression(solver=self.solver, C=1 / self.lambda_, max_iter=self.max_iter)
        lr.fit(X, y)
        # classes_ is needed so predict can map scores back to the original labels.
        self.classes_ = lr.classes_
        self.coef_ = lr.coef_
        self.intercept_ = lr.intercept_
        return self

    def predict(self, X):
        """Return the predicted class label for every row of X.

        A positive linear score maps to ``classes_[1]``, anything else to
        ``classes_[0]``.
        """
        scores = np.asarray(X.dot(self.coef_.T) + self.intercept_).ravel()
        # Index into classes_ so callers receive labels rather than a boolean mask.
        return self.classes_[(scores > 0).astype(int)]

    def predict_proba(self, X):
        """Return the sigmoid of the linear score for every row of X.

        For a binary fit this is a single column holding the probability of
        ``classes_[1]``. NOTE(review): scikit-learn's convention is one column per
        class; the single-column shape is kept for backward compatibility.
        """
        return 1 / (1 + np.exp(-(X.dot(self.coef_.T) + self.intercept_)))

# Sweep eta for the LogisticRegressionPR pipeline and record test accuracy per value.
# NOTE(review): LogisticRegressionPR.fit never reads eta, so every value in this sweep
# trains the same model and yields the same accuracy.
etas = [5, 30, 100]
accuracy_pr = {}
for eta in etas:
    lr_pr = LogisticRegressionPR(eta=eta, lambda_=1.0, solver='liblinear', max_iter=1000)
    pipeline_pr = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', lr_pr),
    ])
    # Pipeline.fit returns the pipeline, so training and prediction are chained.
    y_pred_pr = pipeline_pr.fit(X_train, y_train).predict(X_test)
    accuracy_pr[eta] = accuracy_score(y_test, y_pred_pr)

accuracy_pr


{5: 0.6929046563192904, 30: 0.6929046563192904, 100: 0.6929046563192904}

In [26]:
# Sweep lambda_ for the LogisticRegressionPR pipeline and record test accuracy per value.
# lambda_ reaches the inner LogisticRegression as C = 1 / lambda_, so larger values
# mean stronger regularization.
lambdas = [5, 10, 15]
accuracy_pr_lambda = {}
for lambda_ in lambdas:
    lr_pr = LogisticRegressionPR(eta=1, lambda_=lambda_, solver='liblinear', max_iter=1000)
    pipeline_pr_lambda = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', lr_pr),
    ])
    # Pipeline.fit returns the pipeline, so training and prediction are chained.
    y_pred_pr_lambda = pipeline_pr_lambda.fit(X_train, y_train).predict(X_test)
    accuracy_pr_lambda[lambda_] = accuracy_score(y_test, y_pred_pr_lambda)

accuracy_pr_lambda


{5: 0.6934589800443459, 10: 0.6940133037694013, 15: 0.6962305986696231}

In [32]:
# Summary table: one row per fitted model, pairing a display label with its test accuracy.
results_df = pd.DataFrame(data=dict(
    Method=['LR', 'LRns', 'LR λ=5', 'LR λ=10', 'LR λ=15'],
    Accuracy=[
        accuracy_lr,
        accuracy_lr_ns,
        # The three lambda-sweep results, in the same order as the labels above.
        *(accuracy_pr_lambda[key] for key in (5, 10, 15)),
    ],
))

results_df


Unnamed: 0,Method,Accuracy
0,LR,0.692905
1,LRns,0.694013
2,LR λ=5,0.693459
3,LR λ=10,0.694013
4,LR λ=15,0.696231
