In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('NIJ_s_Recidivism_Challenge_Training_Dataset.csv')
data = data.rename(columns={'_v1':'Prior_Arrest_Episodes_PPViolationCharges', 
                            '_v2':'Prior_Conviction_Episodes_PPViolationCharges', 
                            '_v3':'Prior_Conviction_Episodes_DVCharges', 
                            '_v4':'Prior_Conviction_Episodes_GunCharges'})

In [3]:
X_col = data.columns[1: 33]
labels_col = data.columns[-4:]
data = data.dropna(subset=labels_col).dropna(subset=X_col)
X_data, label = data[X_col], data[labels_col]
# dummy encode
X_data = pd.get_dummies(X_data)

In [4]:
# 拆分数据集
X, X_test, y, y_test= train_test_split(X_data, label, test_size=0.25, random_state=0)
print(len(X), len(y), len(X_test), len(y_test))

9549 9549 3183 3183


In [44]:
# Recidivism_Within_3years, Recidivism_Arrest_Year1, Recidivism_Arrest_Year2, Recidivism_Arrest_Year3
y['Recidivism_Within_3years']

9284     False
16876     True
2213     False
1134      True
13982     True
         ...  
6799     False
4582     False
13512     True
14852     True
3822     False
Name: Recidivism_Within_3years, Length: 9549, dtype: bool

# 搞一些feature

In [58]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12732 entries, 0 to 18016
Data columns (total 53 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   ID                                            12732 non-null  object 
 1   Gender                                        12732 non-null  object 
 2   Race                                          12732 non-null  object 
 3   Age_at_Release                                12732 non-null  object 
 4   Residence_PUMA                                12732 non-null  int64  
 5   Gang_Affiliated                               12732 non-null  object 
 6   Supervision_Risk_Score_First                  12732 non-null  float64
 7   Supervision_Level_First                       12732 non-null  object 
 8   Education_Level                               12732 non-null  object 
 9   Dependents                                    12732 non-null 

# logistic regression

## balanced

In [56]:
def lr(Y):
    clf = LogisticRegression(random_state=0, C=0.001, class_weight='balanced')
    clf.fit(X, y[Y])
    pred = clf.predict(X_test)
    auc = roc_auc_score(y_test[Y], pred)
    print(Y, ': ', auc)

In [57]:
for Y in labels_col:
    lr(Y)

Recidivism_Within_3years :  0.6711639848042428
Recidivism_Arrest_Year1 :  0.6517526237166088
Recidivism_Arrest_Year2 :  0.53302514442493
Recidivism_Arrest_Year3 :  0.5145076975738803


## imbalanced

In [7]:
def lr1(Y):
    clf = LogisticRegression(random_state=0)
    clf.fit(X, y[Y])
    pred = clf.predict(X_test)
    auc = roc_auc_score(y_test[Y], pred)
    print(Y, ': ', auc)

for Y in labels_col:
    lr1(Y)

Recidivism_Within_3years :  0.6623351131945656
Recidivism_Arrest_Year1 :  0.5678953604068895
Recidivism_Arrest_Year2 :  0.5
Recidivism_Arrest_Year3 :  0.5


## tuning

In [53]:
param_grid = {'C': list(np.logspace(-3, 3, num=7, base=10))}
gs = GridSearchCV(LogisticRegression(random_state=0, class_weight='balanced'), 
                  param_grid, cv=5, scoring='roc_auc', return_train_score=True)
gs.fit(X, y['Recidivism_Arrest_Year2'])

GridSearchCV(cv=5,
             estimator=LogisticRegression(class_weight='balanced',
                                          random_state=0),
             param_grid={'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]},
             return_train_score=True, scoring='roc_auc')

In [55]:
gs.best_score_
gs.best_estimator_

LogisticRegression(C=0.001, class_weight='balanced', random_state=0)

# SVM

## balanced

In [8]:
def mysvm(Y):
    clf = SVC(class_weight='balanced')
    clf.fit(X, y[Y])
    pred = clf.predict(X_test)
    auc = roc_auc_score(y_test[Y], pred)
    print(Y, ': ', auc)

for Y in labels_col:
    mysvm(Y)

## imbalanced

In [10]:
def mysvm1(Y):
    clf = SVC()
    clf.fit(X, y[Y])
    pred = clf.predict(X_test)
    auc = roc_auc_score(y_test[Y], pred)
    print(Y, ': ', auc)

In [11]:
for Y in labels_col:
    mysvm1(Y)

Recidivism_Within_3years :  0.6415335337048875
Recidivism_Arrest_Year1 :  0.5
Recidivism_Arrest_Year2 :  0.5
Recidivism_Arrest_Year3 :  0.5


## tuning

In [52]:
param_grid = {'kernel':('linear', 'rbf'),
              'C': list(np.logspace(-3, 3, num=7, base=10))}
gs = GridSearchCV(SVC(class_weight='balanced'), param_grid, cv=5, scoring='roc_auc', return_train_score=True)
gs.fit(X, y['Recidivism_Arrest_Year2'])
gs.best_score_

KeyboardInterrupt: 

In [31]:
gs.best_estimator_

RandomForestClassifier(max_depth=30, min_samples_leaf=150, n_estimators=50)

# random forest

## balanced

In [49]:
def rf(Y):
    clf = RandomForestClassifier(n_estimators=50, min_samples_leaf=200, class_weight='balanced')
    clf = clf.fit(X, y[Y])
    pred = clf.predict(X_test)
    auc = roc_auc_score(y_test[Y], pred)
    print(Y, ': ', auc)
    
for Y in labels_col:
    rf(Y)

Recidivism_Within_3years :  0.6620022122073023
Recidivism_Arrest_Year1 :  0.6480254685547123
Recidivism_Arrest_Year2 :  0.5252724746063416
Recidivism_Arrest_Year3 :  0.5312826190767982


## imbalanced

In [48]:
def rf1(Y):
    clf = RandomForestClassifier(n_estimators=70, min_samples_leaf=200)
    clf = clf.fit(X, y[Y])
    pred = clf.predict(X_test)
    auc = roc_auc_score(y_test[Y], pred)
    print(Y, ': ', auc)

for Y in labels_col:
    rf1(Y)

Recidivism_Within_3years :  0.6284084287731796
Recidivism_Arrest_Year1 :  0.5
Recidivism_Arrest_Year2 :  0.5
Recidivism_Arrest_Year3 :  0.5


## tuning

In [38]:
param_grid = {'n_estimators': [50, 60, 70, 80, 90], 
              'min_samples_leaf': [100, 150, 200],
              'max_depth': [10, 20, 30, 40]}
gs = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='roc_auc', return_train_score=True)
gs.fit(X, y['Recidivism_Arrest_Year2'])
gs.best_score_

0.5809396335205677

In [41]:
gs.best_estimator_

RandomForestClassifier(max_depth=30, min_samples_leaf=200, n_estimators=70)

# NN

In [16]:
def nn(Y):
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=(5, 2), random_state=1)

    clf.fit(X, y[Y])
    pred = clf.predict(X_test)
    auc = roc_auc_score(y_test[Y], pred)
    print(Y, ': ', auc)

In [17]:
for Y in labels_col:
    nn(Y)

Recidivism_Within_3years :  0.6578849800341707
Recidivism_Arrest_Year1 :  0.5715621924537816
Recidivism_Arrest_Year2 :  0.5
Recidivism_Arrest_Year3 :  0.5


## tuning

In [None]:
param_grid = {'kernel':('linear', 'rbf'), 'C':[0.1, 1, 10]}
gs = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc', return_train_score=True)
gs.fit(X, y['Recidivism_Arrest_Year2'])

# 无聊

In [87]:
#############################################

In [84]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

kernel = 1.0 * RBF(1.0)
gpc = GaussianProcessClassifier(kernel=kernel, random_state=0).fit(X, y[Y])
gpc.score(X, y[Y])

0.8985234055922086

In [85]:
pred = gpc.predict(X_test)

In [86]:
roc_auc_score(y_test[Y], pred)

0.5

In [91]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X, y[Y])

KNeighborsClassifier(n_neighbors=3)

In [92]:
pred = neigh.predict(X_test)
roc_auc_score(y_test[Y], pred)

0.5012054531700988

In [93]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X, y[Y])

pred = clf.predict(X_test)
roc_auc_score(y_test[Y], pred)

0.5