In [2]:
# IMPORTANT: fill in the directory this notebook should run from before
# executing anything else -- the next cell passes this value to os.chdir().
home_dir = ""

In [3]:
import os
# Switch the working directory to `home_dir`. The relative `data/results/...`
# paths written later in this notebook are resolved against it (presumably the
# `main.*` project imports are too -- confirm against your kernel's sys.path).
# An unset placeholder would otherwise surface as the opaque
# "No such file or directory: ''" raised by os.chdir(''), so fail early with
# the same exception type and an actionable message.
if not home_dir:
    raise FileNotFoundError(
        "home_dir is empty - set it in the first cell before running this notebook"
    )
os.chdir(home_dir)

import pandas as pd
import numpy as np
import pickle

# Light preprocessing and fitting
import lightgbm as lgb

In [4]:
%load_ext autoreload
%autoreload 2
from main.models.utils import cv_early_stopping
from main.fairness.paramfitter import LocationScaleEMDW1
from main.loaders.loader_compas import prepare_compas, load_compas
from main.evaluation.evaluator import get_metrics

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Experiment sweep: for every seed, prepare the COMPAS split, fit a LightGBM
# classifier (number of rounds picked by cross-validation), run the fairness
# evaluation and persist the metrics, the fitted sampler and the model.

# Columns handed to `prepare_compas` as `drop_list`. The list is the same for
# every seed, so it is defined once instead of being rebuilt per iteration.
drop_features = ['two_year_recid',
                 'event',
                 'id',
                 'name',
                 'first',
                 'last',
                 'compas_screening_date',
                 'dob',
                 'c_jail_in',
                 'c_jail_out',
                 'c_case_number',
                 'age',  # Exclude age to enable categoricals
                 'priors_count',
                 'priors_count.1',
                 'screening_date',
                 'v_type_of_assessment',
                 'v_screening_date',
                 'in_custody',
                 'out_custody',
                 'start',
                 'end',
                 'decile_score',  # Drop the scores as well
                 'decile_score.1',
                 'score_text',
                 'v_decile_score',
                 'target_high',
                 ]

# One full train/evaluate/save cycle is run per seed.
seeds = [42, 1029, 3948, 103, 56, 93983838, 828, 1928838, 900, 10]

# Leading columns removed from the model inputs (the original comment calls
# them "the age indices"). Columns 1 and 2 of the *unmodified* matrices are
# still read further down to build the sensitive-attribute indicators.
age_col_idx = [0, 1, 2]

# Make sure the output folders exist before any expensive work starts;
# otherwise a missing directory is only discovered after the cross-validation
# and the final fit of the first seed, when open()/save_model() fail.
results_root = 'data/results/compas'
for sub_dir_ in ('dicts', 'params', 'models'):
    os.makedirs(os.path.join(results_root, sub_dir_), exist_ok=True)

for seed_ in seeds:
    # A copy of the list is passed so the shared definition above stays
    # untouched even if the loader modifies its argument.
    X_train, X_test, y_train, y_test, transformer = prepare_compas(drop_list=list(drop_features),
                                                                   target='is_violent_recid',
                                                                   seed=seed_)

    # Keep the full matrices: the sensitive indicators are read from them
    # after the leading columns have been removed from the model inputs.
    X_train_orig = X_train.copy()
    X_test_orig = X_test.copy()

    # Drop the age indices (single call, same result as deleting column 2,
    # then 1, then 0).
    X_train = np.delete(X_train, age_col_idx, axis=1)
    X_test = np.delete(X_test, age_col_idx, axis=1)

    # Defined inside the loop so every seed starts from a fresh dict, in case
    # one of the helpers below modifies it.
    params = {
        "objective": "binary",
        "metric": "auc",
        "min_data_in_leaf": 20,
        "learning_rate": 0.01,
        "verbose": -1
    }

    cv_results = cv_early_stopping(params=params,
                                   nfolds=3,
                                   max_rounds=1000,
                                   early_stopping_rounds=50,
                                   X_train=X_train,
                                   y_train=y_train,
                                   objective='classification')

    # Take the iteration count of the entry with the highest CV metric
    # (the metric configured above is AUC).
    best_res = np.argmax(cv_results['metric'])
    best_iter = cv_results['iterations'][best_res]

    # Re-train on the whole train dataset with the selected number of rounds
    data_train_all = lgb.Dataset(data=X_train,
                                 label=y_train)

    best_estimator = lgb.train(params=params,
                               train_set=data_train_all,
                               num_boost_round=best_iter)

    # Note: the "calib" predictions are made on the training data itself.
    preds_uncorrected_calib = best_estimator.predict(X_train)
    preds_uncorrected_test = best_estimator.predict(X_test)

    # Binary indicators (1 where the column is > 0) taken from the unmodified
    # matrices: column 2 for the "observed" attribute (train and test),
    # column 1 for the "unobserved" attribute (test only).
    sens_observed_calib = np.where(X_train_orig[:, 2] > 0, 1, 0)
    sens_observed_test = np.where(X_test_orig[:, 2] > 0, 1, 0)
    sens_unobserved_test = np.where(X_test_orig[:, 1] > 0, 1, 0)

    # NOTE(review): the original comment here read "Use a beta model", but the
    # class instantiated is LocationScaleEMDW1 -- confirm which is intended.
    sampler_model = LocationScaleEMDW1()

    metrics, nonparam, param, sampler = get_metrics(sampler_model,
                                                    preds_uncorrected_calib,
                                                    preds_uncorrected_test,
                                                    sens_observed_calib,
                                                    sens_observed_test,
                                                    sens_unobserved_test,
                                                    y_test,
                                                    mc_len=10,
                                                    min_glob=0,
                                                    cu_=0.12)

    # Save everything (one file per seed in each output folder)
    with open(f'data/results/compas/dicts/1res_dict_{seed_}.pkl', 'wb') as con_:
        pickle.dump(metrics, con_)

    with open(f'data/results/compas/params/1param_model_{seed_}.pkl', 'wb') as con_:
        pickle.dump(sampler, con_)

    best_estimator.save_model(f'data/results/compas/models/1model_seed_{seed_}.txt',
                              num_iteration=best_iter)
