imports

In [2]:
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


## Check data

In [3]:
data = pd.read_csv('./data/clean_data.csv')

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
import plotly.express as px
from sklearn.metrics import confusion_matrix
from sklearn.metrics import adjusted_rand_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [17]:
data.dropna(subset=['dob'], inplace=True)

data['c_jail_in'] = pd.to_datetime(data['c_jail_in'], format='%d/%m/%Y %H:%M', dayfirst=True)
data['c_jail_out'] = pd.to_datetime(data['c_jail_out'], format='%d/%m/%Y %H:%M', dayfirst=True)
data['dob'] = pd.to_datetime(data['dob'], format='%d/%m/%Y', dayfirst=True)
data['duration'] = abs((data['c_jail_out'] - data['c_jail_in']).dt.days) + 1
data['duration'] = data['duration'].fillna(0)
data['duration'] = data['duration'].astype('int')
df = data[data['is_recid'] != -1]
df_glob = df.drop(['id', 'sex', 'race', 'c_charge_desc', 'c_charge_degree', 'age_cat', 'name', 'c_jail_in', 'c_jail_out', 'age', 'dob', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'days_b_screening_arrest', 'c_days_from_compas', 'r_offense_date', 'r_charge_desc', 'r_jail_in','vr_offense_date','vr_charge_desc','score_text','screening_date','v_decile_score','v_score_text','priors_count.1', 'r_charge_degree', 'r_days_from_arrest', 'is_violent_recid', 'vr_charge_degree'], axis=1) 
print(df_glob.columns)

Index(['decile_score', 'priors_count', 'is_recid', 'event', 'duration',
       'cluster'],
      dtype='object')


In [18]:
sex_encoded = pd.get_dummies(data['sex'], prefix='sex')
age_encoded = pd.get_dummies(data['age_cat'], prefix='age_cat')
race_encoded = pd.get_dummies(data['race'], prefix='race')
c_degree_encoded = pd.get_dummies(data['c_charge_degree'], prefix='c_degree')
c_charge_desc_encoded = pd.get_dummies(data['c_charge_desc'], prefix='c_charge_desc')

# Concatenate the one-hot encoded columns with the original DataFrame
df_encoded = pd.concat([df_glob, sex_encoded, race_encoded, c_degree_encoded, c_charge_desc_encoded], axis=1)
data.dropna(subset=['c_jail_in', 'c_jail_out'], inplace=True)

# Convert datetime objects to timestamps in seconds
df_encoded['dob'] = data['dob'].apply(lambda x: x.timestamp())
df_encoded['duration'] = abs((data['c_jail_out'] - data['c_jail_in']).dt.days)
df_encoded.dropna(inplace=True)
# df_encoded['c_jail_in'] = data['c_jail_in'].apply(lambda x: x.timestamp())
# df_encoded['c_jail_out'] = data['c_jail_out'].apply(lambda x: x.timestamp())
# df_encoded['r_offense_date'] = data['r_offense_date'].apply(lambda x: x.timestamp())
print(df_encoded.columns)
X = df_encoded.drop('is_recid', axis=1)
# df_encoded
X

Index(['decile_score', 'priors_count', 'is_recid', 'event', 'duration',
       'cluster', 'sex_Female', 'sex_Male', 'race_African-American',
       'race_Asian',
       ...
       'c_charge_desc_Viol Injunction Protect Dom Vi',
       'c_charge_desc_Viol Pretrial Release Dom Viol',
       'c_charge_desc_Viol Prot Injunc Repeat Viol',
       'c_charge_desc_Violation License Restrictions',
       'c_charge_desc_Violation Of Boater Safety Id',
       'c_charge_desc_Violation of Injunction Order/Stalking/Cyberstalking',
       'c_charge_desc_Voyeurism', 'c_charge_desc_Workers Compensation Fraud',
       'c_charge_desc_arrest case no charge', 'dob'],
      dtype='object', length=517)


Unnamed: 0,decile_score,priors_count,event,duration,cluster,sex_Female,sex_Male,race_African-American,race_Asian,race_Caucasian,...,c_charge_desc_Viol Injunction Protect Dom Vi,c_charge_desc_Viol Pretrial Release Dom Viol,c_charge_desc_Viol Prot Injunc Repeat Viol,c_charge_desc_Violation License Restrictions,c_charge_desc_Violation Of Boater Safety Id,c_charge_desc_Violation of Injunction Order/Stalking/Cyberstalking,c_charge_desc_Voyeurism,c_charge_desc_Workers Compensation Fraud,c_charge_desc_arrest case no charge,dob
0,1.0,0.0,0.0,0,0.0,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,-716601600.0
1,1.0,0.0,0.0,0,0.0,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,-716601600.0
3,3.0,0.0,1.0,10,0.0,False,True,True,False,False,...,False,False,False,False,False,False,False,False,False,380505600.0
4,4.0,4.0,0.0,1,1.0,False,True,True,False,False,...,False,False,False,False,False,False,False,False,False,674179200.0
5,4.0,4.0,0.0,1,1.0,False,True,True,False,False,...,False,False,False,False,False,False,False,False,False,674179200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15508,3.0,0.0,0.0,1,0.0,False,True,True,False,False,...,False,False,False,False,False,False,False,False,False,733017600.0
15509,6.0,5.0,0.0,10,0.0,True,False,True,False,False,...,False,False,False,False,False,False,False,False,False,472435200.0
15510,1.0,0.0,0.0,1,1.0,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,-355104000.0
15511,2.0,3.0,0.0,1,0.0,True,False,True,False,False,...,False,False,False,False,False,False,False,False,False,406339200.0


In [19]:
def get_accuracy(conf_matrix):
    return (conf_matrix[0][0] + conf_matrix[1][1]) / (conf_matrix[0].sum() + conf_matrix[1].sum())

In [20]:
preprocessor = StandardScaler()
# preprocessor = MinMaxScaler()
reductor = PCA(n_components=2)
algo = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(14, 14, 14, 14, 14, 14, 14, 14, 14, 14), random_state=1, max_iter=500)

# Create pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('reductor', reductor),
    ('algo', algo)
])

pipeline.fit(X, df_encoded['is_recid'])

cluster_labels = pipeline.predict(X)
cluster_series = pd.Series(cluster_labels, index=X.index)

data['cluster'] = cluster_series

y_true = df_encoded['is_recid']

# Compute the confusion matrix
conf_matrix = confusion_matrix(y_true, cluster_labels)

print("Confusion Matrix:")
print(conf_matrix)

print(f"accuracy : {get_accuracy(conf_matrix)}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Confusion Matrix:
[[5859 2005]
 [3067 3376]]
accuracy : 0.6454882225484029


In [21]:
def get_optimized_table(neuron_max, layer_max):
    res = []
    for layer in range(1, layer_max + 1):
        for neuron in range(1, neuron_max + 1):
            res.append(tuple([neuron for _ in range(layer)]))
    return res

In [22]:
def optimize(X, Y, neuron_max, layer_max):
    best_config = [(0,), 0]
    for n in get_optimized_table(neuron_max, layer_max):
        for _ in range(5):
            preprocessor = StandardScaler()
            reductor = PCA(n_components=2)
            algo = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15), random_state=1, max_iter=500)

            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('reductor', reductor),
                ('algo', algo)
            ])

            pipeline.fit(X, Y)
            cluster_labels = pipeline.predict(X)
            cluster_series = pd.Series(cluster_labels, index=X.index)

            data['cluster'] = cluster_series

            # Compute the confusion matrix
            conf_matrix = confusion_matrix(Y, cluster_labels)
            if get_accuracy(conf_matrix) > best_config[1]:
                print(n)
                best_config = [n, get_accuracy(conf_matrix)]
    return best_config

In [11]:
print(optimize(X, df_encoded['is_recid'], 15, 10))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


(1,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


(1,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

(2,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

(5,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

(11,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

(14,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

(3, 3)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

(13, 13)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

[(13, 13), 0.6481442650450828]


In [23]:
best_config = (13, 13)

In [35]:
preprocessor = StandardScaler()
# preprocessor = MinMaxScaler()
reductor = PCA(n_components=2)
algo = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=best_config, random_state=1, max_iter=500)

# Create pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('reductor', reductor),
    ('algo', algo)
])

X_train, X_test, y_train, y_test = train_test_split(X, df_encoded['is_recid'], test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

y_pred_tot = pipeline.predict(X)

cluster_series = pd.Series(y_pred_tot, index=X.index)

data['prediction'] = cluster_series

data.to_csv("data/prediction_with_neural.csv")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


              precision    recall  f1-score   support

         0.0       0.66      0.74      0.70      1585
         1.0       0.62      0.53      0.57      1277

    accuracy                           0.65      2862
   macro avg       0.64      0.63      0.63      2862
weighted avg       0.64      0.65      0.64      2862

