In [1]:
!pip install -q joblib scikit-learn pandas numpy tqdm

In [2]:
from google.colab import drive; drive.mount('/content/drive')
import os, numpy as np, pandas as pd, joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

Mounted at /content/drive


In [3]:
DATA_PATH="/content/drive/MyDrive/datasets/compas.csv"
TARGET="two_year_recid"; POS_LABEL=1; SENSITIVE="race"; PRIVILEGED="Caucasian"
OUTPUT_DIR="/content/drive/MyDrive/model_outputs/compas"
os.makedirs(OUTPUT_DIR,exist_ok=True)

In [4]:
df=pd.read_csv(DATA_PATH)

In [5]:
df.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0


In [6]:
y=(df[TARGET]==POS_LABEL).astype(int)
X=df.drop(columns=[TARGET])
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

In [8]:
num_cols=X.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols=[c for c in X.columns if c not in num_cols]
num=Pipeline([('imputer',SimpleImputer(strategy='median')),('scaler',StandardScaler())])
cat=Pipeline([('imputer',SimpleImputer(strategy='most_frequent')),('onehot',OneHotEncoder(handle_unknown='ignore'))])
preproc=ColumnTransformer([('num',num,num_cols),('cat',cat,cat_cols)])

In [9]:
models={"lr":LogisticRegression(max_iter=500,solver='liblinear'),
        "rf":RandomForestClassifier(n_estimators=100,random_state=42),
        "mlp":MLPClassifier(hidden_layer_sizes=(64,32),max_iter=300,random_state=42)}
logits_from_proba=lambda p: np.log(np.clip(p,1e-9,1-1e-9)/(1-np.clip(p,1e-9,1-1e-9)))

In [10]:
metrics=[]
for n,m in models.items():
    pipe=Pipeline([('preproc',preproc),('clf',m)])
    pipe.fit(X_train,y_train)

    y_pred=pipe.predict(X_test)
    proba=pipe.predict_proba(X_test)[:,1]
    logits=logits_from_proba(proba)

    # Calculate Metrics
    acc,f1,auc=accuracy_score(y_test,y_pred),f1_score(y_test,y_pred),roc_auc_score(y_test,proba)

    # Save Predictions and Model
    pd.DataFrame({"y_true":y_test,"y_pred":y_pred,"proba":proba,"logit":logits,SENSITIVE:X_test[SENSITIVE].values}).to_csv(f"{OUTPUT_DIR}/{n}_preds.csv",index=False)
    joblib.dump(pipe,f"{OUTPUT_DIR}/{n}.joblib")
    metrics.append([n,acc,f1,auc])



In [11]:
metrics_df = pd.DataFrame(metrics, columns=["model", "accuracy", "f1_score", "roc_auc_score"])
display(metrics_df)

Unnamed: 0,model,accuracy,f1_score,roc_auc_score
0,lr,0.981982,0.980273,0.998339
1,rf,0.97228,0.970149,0.997276
2,mlp,0.980596,0.978852,0.996355


In [12]:
pd.DataFrame(metrics,columns=["model","accuracy","f1","auc"]).to_csv(f"{OUTPUT_DIR}/metrics_summary.csv",index=False)
print("✅ COMPAS models done. Saved in",OUTPUT_DIR)

✅ COMPAS models done. Saved in /content/drive/MyDrive/model_outputs/compas
