In [25]:
# Echo the module docstring; in this notebook session the recorded output is
# IPython's auto-generated module description.
print(__doc__)
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.utils import check_random_state
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2 (superseded by
# RocCurveDisplay); this import presumably fails on newer releases -- confirm
# the scikit-learn version this notebook is pinned to.
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import mutual_info_score

from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

# The ACE package is imported by full module path; later cells refer to
# `ace.model.Model`.
# from ace import model
# from ace import ace
import ace.model
import ace.ace

# Render floats in pandas output with one decimal place.
pd.options.display.float_format = "{:.1f}".format

Automatically created module for IPython interactive environment


In [26]:
def retrieve_data_recid(source=None):
    """Load the COMPAS recidivism table and return features, target and bias attribute.

    The prediction task (per the original notes) is binary: whether a person
    will recidivate after first prosecution. The target is read from the
    ``two_year_recid`` column.

    Only rows whose ``race`` code is 2 or 3 are kept; the code is then shifted
    by -2 so the returned bias attribute takes the values 0 and 1. Which
    demographic groups those codes denote is not visible here -- confirm
    against the data dictionary of the CSV.

    No train / test / pool split is performed here; callers that need labeled,
    unlabeled or test subsets must split the returned arrays themselves.

    Source
    ----------
    How We Analyzed the COMPAS Recidivism Algorithm, by Jeff Larson, Surya Mattu, Lauren Kirchner and Julia Angwin, May 23, 2016
    https://www.propublica.org/article/how-we-analyzed-the-compas-recidivism-algorithm
    https://github.com/propublica/compas-analysis

    Parameters
    ----------
    source : str, path or file-like, optional
        Anything accepted by ``pandas.read_csv``. When omitted, the normalized
        CSV hosted in the Active-Learning-Performance-Benchmarking repository
        is downloaded (requires network access).

    Returns
    -------
    df_X : ndarray, shape (n_kept_rows, 9)
        Values of the nine feature columns.
    df_y : ndarray of int64, shape (n_kept_rows, )
        Target ``two_year_recid``.
    df_b : ndarray, shape (n_kept_rows, )
        Bias attribute, ``race - 2``.

    Raises
    ------
    KeyError
        Raised by pandas if an expected column is missing from the table.
    """
    if source is None:
        source = "https://raw.githubusercontent.com/WenxuanHuang/Active-Learning-Performance-Benchmarking/main/RecidivismData_Normalized.csv"

    attributes = ['MarriageStatus','age','juv_fel_count', 'juv_misd_count', 'juv_other_count','priors_count', 'days_b_screening_arrest','c_days_from_compas','c_charge_degree']
    bias = 'race'
    target = 'two_year_recid'

    data = pd.read_csv(source, sep=',')

    # Restrict the table to the two race codes being compared.
    df = data[data[bias].isin([2, 3])]

    df_X = df[attributes].values
    df_y = df[target].values.astype('int64')
    # Shift codes {2, 3} down to {0, 1}.
    df_b = df[bias].values - 2

    return (df_X, df_y, df_b)

In [27]:
def maximal_correlation_transform(ace_model, x, y):
    """Fit ``ace_model`` on ``(x, y)`` and return the transforms it stores.

    Parameters
    ----------
    ace_model : object
        Must expose ``build_model_from_xy(x, y)`` and, once that call has run,
        an ``ace`` attribute carrying ``x_transforms`` and ``y_transform``.
    x, y :
        Passed through unchanged to ``build_model_from_xy``.

    Returns
    -------
    tuple
        ``(ace_model.ace.x_transforms, ace_model.ace.y_transform)`` as they
        stand after the model has been built.
    """
    ace_model.build_model_from_xy(x, y)
    fitted = ace_model.ace
    return fitted.x_transforms, fitted.y_transform

In [28]:


# Features, target and bias attribute returned by retrieve_data_recid().
X, y, b = retrieve_data_recid()



In [29]:
# Two independent ACE models: one relating the features to the bias attribute
# (Xb), the other relating the features to the target (Xy).
Xb_ace, Xy_ace = ace.model.Model(), ace.model.Model()

# X is transposed so that each inner list handed to the model holds one
# feature column; a fresh list is built for each model.
X_Xb_Mc, b_Xb_Mc = maximal_correlation_transform(
    Xb_ace, X.T.tolist(), b.tolist()
)
X_Xy_Mc, y_Xy_Mc = maximal_correlation_transform(
    Xy_ace, X.T.tolist(), y.tolist()
)

* Starting outer iteration 000. Current err =  1.00000E+00
  Starting inner iteration 000. Current err =  1.00000E+00


  (xi - self._mean_x_in_window) ** 2 /


  Starting inner iteration 001. Current err =  8.73472E-01
  Starting inner iteration 002. Current err =  8.64213E-01
  Starting inner iteration 003. Current err =  8.63233E-01
  Starting inner iteration 004. Current err =  8.63047E-01
  Starting inner iteration 005. Current err =  8.63005E-01
  Starting inner iteration 006. Current err =  8.62982E-01
* Starting outer iteration 001. Current err =  8.57015E-01
  Starting inner iteration 000. Current err =  8.57015E-01
  Starting inner iteration 001. Current err =  8.18797E-01
  Starting inner iteration 002. Current err =  8.18692E-01
* Starting outer iteration 002. Current err =  7.92400E-01
  Starting inner iteration 000. Current err =  7.92400E-01
  Starting inner iteration 001. Current err =  7.24373E-01
* Starting outer iteration 003. Current err =  7.17737E-01
  Starting inner iteration 000. Current err =  7.17737E-01
  Starting inner iteration 001. Current err =  7.08043E-01
* Starting outer iteration 004. Current err =  7.12135E-

In [30]:
# Wrap every transform in a DataFrame (transposed, as in the feature case where
# that puts one transformed feature per column) ...
X_Xy_res = pd.DataFrame(data=np.array(X_Xy_Mc).T)
y_Xy_res = pd.DataFrame(data=np.array(y_Xy_Mc).T)
X_Xb_res = pd.DataFrame(data=np.array(X_Xb_Mc).T)
b_Xb_res = pd.DataFrame(data=np.array(b_Xb_Mc).T)

# ... then write each one to its own CSV in the working directory, without the
# index column.
for _frame, _csv_name in (
    (X_Xy_res, 'X_Xy_Mc_1117.csv'),
    (y_Xy_res, 'y_Xy_Mc_1117.csv'),
    (X_Xb_res, 'X_Xb_Mc_1117.csv'),
    (b_Xb_res, 'b_Xb_Mc_1117.csv'),
):
    _frame.to_csv(_csv_name, sep=',', index=False)