NIJ's Recidivism Challenge

[Codebook](https://nij.ojp.gov/funding/recidivism-forecasting-challenge#19-0)

In [None]:
# Import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
import seaborn as sns

from pandas.plotting import parallel_coordinates

from sklearn.cluster import \
     (KMeans,
      AgglomerativeClustering)
from scipy.cluster.hierarchy import \
     (dendrogram,
      cut_tree)


# Data Load, Filtering, Adjusting

In [None]:
# Fetch the NIJ Recidivism Challenge dataset (CSV) from the project repository
url = (
    "https://raw.githubusercontent.com/gringler8/data5322project/main/"
    "NIJ_s_Recidivism_Challenge_Full_Dataset_20240520.csv"
)
data = pd.read_csv(url)

# Quick sanity check: the first rows, then the (rows, columns) shape
for preview in (data.head(), data.shape):
    print(preview)

In [None]:
# Columns that must be non-NA for a row to be kept
columns_to_check = [
    'Avg_Days_per_DrugTest',
    'DrugTests_THC_Positive',
    'DrugTests_Cocaine_Positive',
    'DrugTests_Meth_Positive',
    'DrugTests_Other_Positive',
    'Percent_Days_Employed',
    'Jobs_Per_Year',
    'Supervision_Risk_Score_First',
]

# Drop rows with NAs in any of those columns; .copy() gives an independent
# frame so the column assignments below do not act on a view of `data`
filtered_data = data.dropna(subset=columns_to_check).copy()

# Convert Supervision_Risk_Score_First to a categorical variable.
# Plain column assignment is deliberate: `df.loc[:, col] = values` tries to
# write into the existing column in place (pandas >= 2.0), which keeps the old
# dtype and silently discards the conversion.
filtered_data['Supervision_Risk_Score_First'] = (
    filtered_data['Supervision_Risk_Score_First'].astype('category')
)

# Remove the comma separators from these string columns and convert to float
for comma_column in ('Avg_Days_per_DrugTest', 'ID'):
    filtered_data[comma_column] = (
        filtered_data[comma_column].str.replace(',', '', regex=False).astype(float)
    )

# "N or more" columns: map each column to (cap value, top-coded label) so the
# label can be replaced by the number it starts with
column_conditions = {
    'Dependents': (3, '3 or more'),
    'Prior_Arrest_Episodes_Felony': (10, '10 or more'),
    'Prior_Arrest_Episodes_Misd': (6, '6 or more'),
    'Prior_Arrest_Episodes_Violent': (3, '3 or more'),
    'Prior_Arrest_Episodes_Property': (5, '5 or more'),
    'Prior_Arrest_Episodes_Drug': (5, '5 or more'),
    'Prior_Arrest_Episodes_PPViolationCharges': (5, '5 or more'),
    'Prior_Conviction_Episodes_Felony': (3, '3 or more'),
    'Prior_Conviction_Episodes_Misd': (4, '4 or more'),
    'Prior_Conviction_Episodes_Prop': (3, '3 or more'),
    'Prior_Conviction_Episodes_Drug': (2, '2 or more'),
    'Delinquency_Reports': (4, '4 or more'),
    'Program_Attendances': (10, '10 or more'),
    'Program_UnexcusedAbsences': (3, '3 or more'),
    'Residence_Changes': (3, '3 or more'),
}

# Add a "<column>_Numeric" integer column for each entry; the original
# column is left untouched
for column, (numeric_value, condition_str) in column_conditions.items():
    numeric_column = column + "_Numeric"
    filtered_data[numeric_column] = (
        filtered_data[column].replace(condition_str, numeric_value).astype(int)
    )

print(filtered_data.shape)


# PCA

In [None]:
# Features fed into the PCA: drug-test results and employment measures
pca_features = [
    'Avg_Days_per_DrugTest',
    'DrugTests_THC_Positive',
    'DrugTests_Cocaine_Positive',
    'DrugTests_Meth_Positive',
    'DrugTests_Other_Positive',
    'Percent_Days_Employed',
    'Jobs_Per_Year',
]
pca_data = filtered_data[pca_features]

# Center and scale each feature before the decomposition
scaler = StandardScaler()
pca_data_scaled = scaler.fit_transform(pca_data)

# Fit a PCA that keeps every component and project the scaled data onto it
pca = PCA()
pca_out = pca.fit_transform(pca_data_scaled)

# Tabulate the per-feature mean and scale the scaler applied
scaling_info = pd.DataFrame(
    {'Center': scaler.mean_, 'Scale': scaler.scale_},
    index=pca_data.columns,
)
print(scaling_info)

# Report how many components were kept and the variance share of each
print("Number of Principal Components:", pca.n_components_)

explained_variance = pca.explained_variance_ratio_
print("Explained Variance:", explained_variance)

# Plot the cumulative explained variance across components
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(8, 6))
plt.plot(cumulative_variance, marker='o')
plt.title('Explained Variance by Principal Components')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()

In [None]:
# One representative point per Supervision_Risk_Score_First value: the mean of
# every PCA feature within that group. observed=True is passed explicitly so
# that, when the key is categorical, only categories present in the data form
# groups (no all-NaN rows for the scaler to reject) and the result does not
# depend on the deprecated implicit default.
representative_points = (
    filtered_data
    .groupby('Supervision_Risk_Score_First', observed=True)[pca_data.columns]
    .mean()
)

# Apply the already-fitted scaler and PCA to the group means
representative_points_scaled = scaler.transform(representative_points)
representative_pca_out = pca.transform(representative_points_scaled)

# Biplot of two principal components, labelled by supervision risk score
i, j = 0, 1  # which components to plot
fig, ax = plt.subplots(1, 1, figsize=(8, 8))

# Scores of the group means on the two chosen components
scores_x = representative_pca_out[:, i]
scores_y = representative_pca_out[:, j]
ax.scatter(scores_x, scores_y, edgecolor='k', s=100)

# Label every point with its (integer) risk score
for score, pc1, pc2 in zip(representative_points.index, scores_x, scores_y):
    ax.annotate(str(int(score)), (pc1, pc2))

# Loading vectors: one arrow per original feature, with the feature name
# placed slightly beyond the arrow tip
for feature, load_x, load_y in zip(pca_data.columns, pca.components_[i], pca.components_[j]):
    ax.arrow(0, 0, load_x, load_y, head_width=0.03, head_length=0.05, ec='gray')
    ax.text(load_x * 1.1, load_y * 1.1, feature, color='r')

ax.set_xlabel('PC%d' % (i + 1))
ax.set_ylabel('PC%d' % (j + 1))
ax.set_title("PCA Biplot with Supervision Risk Score")
plt.show()

# **PCA/K-Means Clustering**


In [None]:
# Refit the PCA keeping only the first four components and project the scaled
# data onto them. One fit_transform is sufficient: repeating the call on the
# same data only recomputes the same scores.
pca = PCA(n_components=4)
pca_out = pca.fit_transform(pca_data_scaled)

In [None]:
# Within-cluster sum of squares (inertia) for each candidate cluster count,
# fitted on the PCA scores
candidate_cluster_counts = range(1, 15)
wcss = [
    KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    .fit(pca_out)
    .inertia_
    for n_clusters in candidate_cluster_counts
]

# Elbow plot: WCSS against the number of clusters
plt.plot(candidate_cluster_counts, wcss, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Final K-Means model on the PCA scores, with a fixed seed
kmeans_settings = {'n_clusters': 11, 'init': 'k-means++', 'random_state': 100}
pca_kmeans = KMeans(**kmeans_settings)
pca_kmeans.fit(pca_out)

In [None]:
# Start from the unscaled PCA features and attach each row's cluster label
pca_kmeans_data = pca_data.copy()
pca_kmeans_data['Cluster'] = pca_kmeans.labels_

# Attach the principal-component scores as PC1, PC2, ...
# pca_out is a positional array, while pca_data keeps the (gapped) row labels
# left after the NA filtering. pd.concat(axis=1) aligns on index labels, so the
# scores frame must be built on pca_data's index; otherwise scores land on the
# wrong rows and extra all-NaN rows appear.
component_names = ['PC%d' % (n + 1) for n in range(pca_out.shape[1])]
component_scores = pd.DataFrame(
    pca_out,
    index=pca_kmeans_data.index,
    columns=component_names,
)
pca_kmeans_data = pd.concat([pca_kmeans_data, component_scores], axis=1)

In [None]:
# Scatter of two original features, colored by K-Means cluster
x_feature, y_feature = 'Jobs_Per_Year', 'DrugTests_Other_Positive'

plt.figure(figsize=(10, 6))
scatter_ax = sns.scatterplot(
    data=pca_kmeans_data,
    x=x_feature,
    y=y_feature,
    hue='Cluster',
    palette='Paired',
)
scatter_ax.set_title('KMeans Clustering')
scatter_ax.set_xlabel(x_feature)
scatter_ax.set_ylabel(y_feature)
plt.show()

In [None]:
# Scatter of two original features, colored by K-Means cluster
x_feature, y_feature = 'Percent_Days_Employed', 'DrugTests_Other_Positive'

plt.figure(figsize=(10, 6))
scatter_ax = sns.scatterplot(
    data=pca_kmeans_data,
    x=x_feature,
    y=y_feature,
    hue='Cluster',
    palette='Paired',
)
scatter_ax.set_title('KMeans Clustering')
scatter_ax.set_xlabel(x_feature)
scatter_ax.set_ylabel(y_feature)
plt.show()

In [None]:
# Show every column when the summary table is displayed
pd.set_option('display.max_columns', None)

# Per-cluster summary of the original features (component scores excluded)
pca_summary = pca_kmeans_data.drop(['PC1', 'PC2', 'PC3', 'PC4'], axis=1)
clusters = pca_summary.groupby('Cluster')
cluster_count = clusters.size()

# Mean and standard deviation of every feature within each cluster
summary = clusters.agg(['mean', 'std'])

# Flatten the (feature, statistic) column pairs into "feature_statistic"
# names BEFORE adding any flat column: joining a plain string label such as
# 'count' with '_' would split it into characters, and the index column would
# pick up a trailing underscore.
summary.columns = ['_'.join(col) for col in summary.columns]

# Cluster sizes, aligned on the Cluster index, then Cluster becomes a column
summary['count'] = cluster_count
summary = summary.reset_index()
summary

In [None]:
# Write the per-cluster summary table to disk without the row index
summary_path = 'summary.csv'
summary.to_csv(summary_path, index=False)