## Imports, Data Loading, Filtering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV



In [None]:
# Read the challenge CSV directly from the raw GitHub URL into a DataFrame.
url = "https://raw.githubusercontent.com/gringler8/data5322project/main/NIJ_s_Recidivism_Challenge_Full_Dataset_20240520.csv"
data = pd.read_csv(filepath_or_buffer=url)

# Show (rows, columns) as a quick sanity check on what was downloaded.
print(data.shape)

(25835, 54)


### General Filtering and cleaning:

In [None]:
# --- Missing values ---------------------------------------------------------
# A blank gang affiliation becomes its own 'Unknown' level (written back onto
# `data`); any row that still has a missing value in another column is dropped.
data['Gang_Affiliated'] = data['Gang_Affiliated'].fillna('Unknown')
filtered_data = data.dropna().copy()

# --- Categorical predictors -------------------------------------------------
# Columns that are treated as categories; each one is stored as strings.
categorical_columns = [
    'Gender',
    'Race',
    'Age_at_Release',
    'Gang_Affiliated',
    'Supervision_Level_First',
    'Education_Level',
    'Prison_Offense',
    'Prison_Years',
    'Prior_Arrest_Episodes_DVCharges',
    'Prior_Arrest_Episodes_GunCharges',
    'Prior_Conviction_Episodes_Viol',
    'Prior_Conviction_Episodes_PPViolationCharges',
    'Prior_Conviction_Episodes_DomesticViolenceCharges',
    'Prior_Conviction_Episodes_GunCharges',
    'Prior_Revocations_Parole',
    'Prior_Revocations_Probation',
    'Condition_MH_SA',
    'Condition_Cog_Ed',
    'Condition_Other',
    'Violations_ElectronicMonitoring',
    'Violations_Instruction',
    'Violations_FailToReport',
    'Violations_MoveWithoutPermission',
    'Employment_Exempt',
]
filtered_data = filtered_data.astype({name: str for name in categorical_columns})

# Optional steps, currently disabled:
#   drop Gender if it is not needed
#       filtered_data = filtered_data.drop('Gender', axis=1)
#   treat Supervision_Risk_Score_First as categorical
#       filtered_data['Supervision_Risk_Score_First'] = filtered_data['Supervision_Risk_Score_First'].astype(str)

# --- Numbers stored as text -------------------------------------------------
# These columns carry thousands separators; strip the commas, then cast to float.
for comma_formatted_column in ('Avg_Days_per_DrugTest', 'ID'):
    without_commas = filtered_data[comma_formatted_column].str.replace(',', '')
    filtered_data[comma_formatted_column] = without_commas.astype(float)

# --- Count columns with an open-ended top label -----------------------------
# Maps column -> (numeric value, label). The label (e.g. '3 or more') is
# replaced by the numeric value so the whole column can be cast to int.
column_conditions = {
    'Dependents': (3, '3 or more'),
    'Prior_Arrest_Episodes_Felony': (10, '10 or more'),
    'Prior_Arrest_Episodes_Misd': (6, '6 or more'),
    'Prior_Arrest_Episodes_Violent': (3, '3 or more'),
    'Prior_Arrest_Episodes_Property': (5, '5 or more'),
    'Prior_Arrest_Episodes_Drug': (5, '5 or more'),
    'Prior_Arrest_Episodes_PPViolationCharges': (5, '5 or more'),
    'Prior_Conviction_Episodes_Felony': (3, '3 or more'),
    'Prior_Conviction_Episodes_Misd': (4, '4 or more'),
    'Prior_Conviction_Episodes_Prop': (3, '3 or more'),
    'Prior_Conviction_Episodes_Drug': (2, '2 or more'),
    'Delinquency_Reports': (4, '4 or more'),
    'Program_Attendances': (10, '10 or more'),
    'Program_UnexcusedAbsences': (3, '3 or more'),
    'Residence_Changes': (3, '3 or more'),
}
for capped_column, (cap_value, cap_label) in column_conditions.items():
    recoded = filtered_data[capped_column].replace({cap_label: cap_value})
    filtered_data[capped_column] = recoded.astype(int)

# --- Columns kept out of the modelling frame --------------------------------
# NOTE(review): presumably alternative outcome and bookkeeping fields -- confirm.
columns_to_exclude = [
    'Recidivism_Arrest_Year1',
    'Recidivism_Arrest_Year2',
    'Recidivism_Arrest_Year3',
    'Training_Sample',
    'ID',
]

# Keep the excluded columns in their own frame, then build the modelling frame
# without them.
excluded_columns_data = filtered_data.loc[:, columns_to_exclude].copy()
filtered_data_for_model = filtered_data.drop(columns_to_exclude, axis=1)


In [None]:
# Preview the first five rows of the modelling frame.
filtered_data_for_model.head(n=5)

Unnamed: 0,Gender,Race,Age_at_Release,Residence_PUMA,Gang_Affiliated,Supervision_Risk_Score_First,Supervision_Level_First,Education_Level,Dependents,Prison_Offense,...,Residence_Changes,Avg_Days_per_DrugTest,DrugTests_THC_Positive,DrugTests_Cocaine_Positive,DrugTests_Meth_Positive,DrugTests_Other_Positive,Percent_Days_Employed,Jobs_Per_Year,Employment_Exempt,Recidivism_Within_3years
0,M,BLACK,43-47,16,False,3.0,Standard,At least some college,3,Drug,...,2,612.0,0.0,0.0,0.0,0.0,0.488562,0.44761,False,False
1,M,BLACK,33-37,16,False,6.0,Specialized,Less than HS diploma,1,Violent/Non-Sex,...,2,35.666667,0.0,0.0,0.0,0.0,0.425234,2.0,False,True
2,M,BLACK,48 or older,24,False,7.0,High,At least some college,3,Drug,...,0,93.666667,0.333333,0.0,0.166667,0.0,0.0,0.0,False,True
3,M,WHITE,38-42,16,False,7.0,High,Less than HS diploma,1,Property,...,3,25.4,0.0,0.0,0.0,0.0,1.0,0.718996,False,False
4,M,WHITE,33-37,16,False,4.0,Specialized,Less than HS diploma,3,Violent/Non-Sex,...,0,23.117647,0.0,0.0,0.058824,0.0,0.203562,0.929389,False,True


### Model Filtering
Options: one-hot encoded (with or without dropping the first level), dummy-variable version (first level dropped), each in scaled and unscaled form.

In [None]:
# Numeric-only view: every numeric column plus the target
# 'Recidivism_Within_3years' (appended by hand because select_dtypes with
# np.number does not pick it up).
numerical_columns = filtered_data_for_model.select_dtypes(include=[np.number]).columns.tolist()
numerical_columns.append('Recidivism_Within_3years')
numeric_model_filter = filtered_data_for_model[numerical_columns]
print(f"Numeric Model Filter: {numeric_model_filter.shape}")

# Dummy-encoded view (unscaled): the numeric columns and target side by side
# with dummy variables for the categorical columns (first level dropped).
# Both halves get a fresh 0..n-1 index so concat lines the rows up by position.
numerical_data = filtered_data_for_model[numerical_columns].reset_index(drop=True)
categorical_data = pd.get_dummies(filtered_data_for_model[categorical_columns], drop_first=True).reset_index(drop=True)
dummy_model_filter = pd.concat([numerical_data, categorical_data], axis=1)
print(f"Dummy Model Filter: {dummy_model_filter.shape}")

# Standardised view: every feature column (target excluded) is scaled.
scaler = StandardScaler()
columns_to_scale = dummy_model_filter.columns.difference(['Recidivism_Within_3years'])

# Fit the scaler on training rows only. Fitting on every row would let the
# mean/variance of the held-out test rows leak into the training features.
# The row split below has to mirror the one used by the modelling cells
# (test_size=0.2, random_state=42): with the same row count and seed,
# train_test_split selects the same row positions.
# NOTE(review): the cross-validation folds inside GridSearchCV still share
# these statistics; wrapping the scaler and SVC in a Pipeline would remove
# that as well.
row_positions = np.arange(len(dummy_model_filter))
train_positions, _ = train_test_split(row_positions, test_size=0.2, random_state=42)
scaler.fit(dummy_model_filter.iloc[train_positions][columns_to_scale])

dummy_model_filter_scaled = dummy_model_filter.copy()
dummy_model_filter_scaled[columns_to_scale] = scaler.transform(dummy_model_filter[columns_to_scale])
print(f"Scaled Dummy Model Filter: {dummy_model_filter_scaled.shape}")

Numeric Model Filter: (16198, 25)
Dummy Model Filter: (16198, 62)
Scaled Dummy Model Filter: (16198, 62)


In [None]:
# Check the data types of all columns in the unscaled dummy-encoded frame.
# (The argument used to be a prose label instead of a variable name, which
# made this cell fail with a SyntaxError.)
print(dummy_model_filter.dtypes)


SyntaxError: invalid syntax. Perhaps you forgot a comma? (<ipython-input-1-0c6aa5bd7b83>, line 2)

In [None]:
# Collect the column labels by dtype: bool-typed first, then object-typed.
boolean_columns = filtered_data_for_model.select_dtypes(include=[bool]).columns
string_columns = filtered_data_for_model.select_dtypes(include=[object]).columns

# Print both label sets so any remaining bool/object columns are easy to spot.
for dtype_label, dtype_columns in (
    ("Boolean columns:", boolean_columns),
    ("String columns:", string_columns),
):
    print(dtype_label, dtype_columns)


Boolean columns: Index([], dtype='object')
String columns: Index([], dtype='object')


# SVM Models (base code)


In [None]:
# Base template: swap `numeric_model_filter` for whichever prepared frame a
# given SVM run should use. The raw `filtered_data_for_model` is not usable
# here because its categorical columns were cast to strings during cleaning,
# and SVC cannot be fitted on string features; the numeric-only frame can.

# Features: every column except the target.
X = numeric_model_filter.drop(columns=['Recidivism_Within_3years'])

# Target: the 3-year recidivism column.
y = numeric_model_filter['Recidivism_Within_3years']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Define parameter grids for each kernel
# Every kernel searches the same C values; poly adds the polynomial degree and
# rbf adds gamma. Each grid gets its own copy of the C list.
c_candidates = [0.1, 1, 10, 100]
param_grid_linear = {'C': list(c_candidates)}
param_grid_poly = {'C': list(c_candidates), 'degree': [2, 3, 4]}
param_grid_rbf = {'C': list(c_candidates), 'gamma': [0.1, 0.01, 0.001]}


In [None]:
# One 5-fold grid search per kernel, each over that kernel's parameter grid.
grid_linear = GridSearchCV(estimator=SVC(kernel='linear'), param_grid=param_grid_linear, cv=5)
grid_poly = GridSearchCV(estimator=SVC(kernel='poly'), param_grid=param_grid_poly, cv=5)
grid_rbf = GridSearchCV(estimator=SVC(kernel='rbf'), param_grid=param_grid_rbf, cv=5)

# Run the searches on the training split: linear, then polynomial, then RBF.
grid_linear.fit(X=X_train, y=y_train)
grid_poly.fit(X=X_train, y=y_train)
grid_rbf.fit(X=X_train, y=y_train)

NameError: name 'param_grid_linear' is not defined

In [None]:
# Pull the winning hyper-parameters and the matching best cross-validated
# score out of each fitted search.
best_params_linear, best_score_linear = grid_linear.best_params_, grid_linear.best_score_
best_params_poly, best_score_poly = grid_poly.best_params_, grid_poly.best_score_
best_params_rbf, best_score_rbf = grid_rbf.best_params_, grid_rbf.best_score_

# One (parameters, score) pair per kernel; the leading "\n" on the later
# labels puts a blank line between kernels.
search_summaries = (
    ("Best parameters (Linear Kernel):", best_params_linear,
     "Best score (Linear Kernel):", best_score_linear),
    ("\nBest parameters (Polynomial Kernel):", best_params_poly,
     "Best score (Polynomial Kernel):", best_score_poly),
    ("\nBest parameters (RBF Kernel):", best_params_rbf,
     "Best score (RBF Kernel):", best_score_rbf),
)
for params_label, params_value, score_label, score_value in search_summaries:
    print(params_label, params_value)
    print(score_label, score_value)

In [None]:
# Refit each kernel with its best hyper-parameters and record, per kernel, the
# time to fit, the time to score, and the score on the held-out test split.
fit_times = []
score_times = []
test_scores = []

# Initialize SVM models with the best parameters found by the grid searches
svc_linear = SVC(kernel='linear', **best_params_linear)
svc_poly = SVC(kernel='poly', **best_params_poly)
svc_rbf = SVC(kernel='rbf', **best_params_rbf)

# Fit and evaluate the models (order: linear, polynomial, RBF).
# perf_counter is used because it is a monotonic, high-resolution clock meant
# for measuring elapsed intervals.
for model in [svc_linear, svc_poly, svc_rbf]:
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    fit_times.append(time.perf_counter() - start_time)

    start_time = time.perf_counter()
    score = model.score(X_test, y_test)
    score_times.append(time.perf_counter() - start_time)
    # Keep the held-out score; it used to be overwritten on every iteration.
    test_scores.append(score)

# These names were printed below without ever being assigned (NameError).
linear_score, poly_score, rbf_score = test_scores

# "Best score" is the cross-validated score from the grid search (as in the
# other SVM sections of this notebook); "Test score" is the held-out score.
print("Best score (Linear Kernel):", best_score_linear)
print("Test score (Linear Kernel):", linear_score)
print("Time taken to fit (Linear Kernel):", fit_times[0])
print("Time taken to score (Linear Kernel):", score_times[0])

print("\nBest score (Polynomial Kernel):", best_score_poly)
print("Test score (Polynomial Kernel):", poly_score)
print("Time taken to fit (Polynomial Kernel):", fit_times[1])
print("Time taken to score (Polynomial Kernel):", score_times[1])

print("\nBest score (RBF Kernel):", best_score_rbf)
print("Test score (RBF Kernel):", rbf_score)
print("Time taken to fit (RBF Kernel):", fit_times[2])
print("Time taken to score (RBF Kernel):", score_times[2])


# SVM Models Full

## dummy_model_filter

In [None]:
target_column = 'Recidivism_Within_3years'

# Features are every dummy-encoded column except the target; the target is
# kept as its own Series.
X = dummy_model_filter.drop(target_column, axis=1)
y = dummy_model_filter[target_column]

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Candidate hyper-parameters per kernel. All kernels try the same C values;
# poly also varies the degree and rbf also varies gamma.
c_candidates = [0.1, 1, 10, 100]
param_grid_linear = {'C': list(c_candidates)}
param_grid_poly = {'C': list(c_candidates), 'degree': [2, 3, 4]}
param_grid_rbf = {'C': list(c_candidates), 'gamma': [0.1, 0.01, 0.001]}

# One 5-fold grid search per kernel.
grid_linear = GridSearchCV(estimator=SVC(kernel='linear'), param_grid=param_grid_linear, cv=5)
grid_poly = GridSearchCV(estimator=SVC(kernel='poly'), param_grid=param_grid_poly, cv=5)
grid_rbf = GridSearchCV(estimator=SVC(kernel='rbf'), param_grid=param_grid_rbf, cv=5)

# Run the searches on the training split: linear, then polynomial, then RBF.
grid_linear.fit(X=X_train, y=y_train)
grid_poly.fit(X=X_train, y=y_train)
grid_rbf.fit(X=X_train, y=y_train)


In [None]:
# Read the winning hyper-parameters and the matching best cross-validated
# score from each fitted search.
best_params_linear, best_score_linear = grid_linear.best_params_, grid_linear.best_score_
best_params_poly, best_score_poly = grid_poly.best_params_, grid_poly.best_score_
best_params_rbf, best_score_rbf = grid_rbf.best_params_, grid_rbf.best_score_

# Report one (parameters, score) pair per kernel; the leading "\n" on the
# later labels separates the kernels with a blank line.
search_summaries = (
    ("Best parameters (Linear Kernel):", best_params_linear,
     "Best score (Linear Kernel):", best_score_linear),
    ("\nBest parameters (Polynomial Kernel):", best_params_poly,
     "Best score (Polynomial Kernel):", best_score_poly),
    ("\nBest parameters (RBF Kernel):", best_params_rbf,
     "Best score (RBF Kernel):", best_score_rbf),
)
for params_label, params_value, score_label, score_value in search_summaries:
    print(params_label, params_value)
    print(score_label, score_value)


In [None]:
# Refit each kernel with its best hyper-parameters on the dummy-encoded data
# and record, per kernel, the time to fit, the time to score, and the score on
# the held-out test split.
fit_times = []
score_times = []
test_scores = []

# Initialize SVM models with the best parameters found by the grid searches
svc_linear = SVC(kernel='linear', **best_params_linear)
svc_poly = SVC(kernel='poly', **best_params_poly)
svc_rbf = SVC(kernel='rbf', **best_params_rbf)

# Fit and evaluate the models (order: linear, polynomial, RBF).
# perf_counter is used because it is a monotonic, high-resolution clock meant
# for measuring elapsed intervals.
for model in [svc_linear, svc_poly, svc_rbf]:
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    fit_times.append(time.perf_counter() - start_time)

    start_time = time.perf_counter()
    score = model.score(X_test, y_test)
    score_times.append(time.perf_counter() - start_time)
    # Keep the held-out score; it used to be computed and then thrown away.
    test_scores.append(score)

linear_score, poly_score, rbf_score = test_scores

# "Best score" is the cross-validated score from the grid search;
# "Test score" is the score of the refitted model on the held-out split.
print("Best score (Linear Kernel):", best_score_linear)
print("Test score (Linear Kernel):", linear_score)
print("Time taken to fit (Linear Kernel):", fit_times[0])
print("Time taken to score (Linear Kernel):", score_times[0])

print("\nBest score (Polynomial Kernel):", best_score_poly)
print("Test score (Polynomial Kernel):", poly_score)
print("Time taken to fit (Polynomial Kernel):", fit_times[1])
print("Time taken to score (Polynomial Kernel):", score_times[1])

print("\nBest score (RBF Kernel):", best_score_rbf)
print("Test score (RBF Kernel):", rbf_score)
print("Time taken to fit (RBF Kernel):", fit_times[2])
print("Time taken to score (RBF Kernel):", score_times[2])

## dummy_model_filter_scaled

In [None]:
target_column = 'Recidivism_Within_3years'

# Features are every scaled column except the target; the target is kept as
# its own Series.
X = dummy_model_filter_scaled.drop(target_column, axis='columns')
y = dummy_model_filter_scaled[target_column]

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Candidate hyper-parameters per kernel. All kernels try the same C values;
# poly also varies the degree and rbf also varies gamma.
c_candidates = [0.1, 1, 10, 100]
param_grid_linear = {'C': list(c_candidates)}
param_grid_poly = {'C': list(c_candidates), 'degree': [2, 3, 4]}
param_grid_rbf = {'C': list(c_candidates), 'gamma': [0.1, 0.01, 0.001]}

# One 5-fold grid search per kernel.
grid_linear = GridSearchCV(estimator=SVC(kernel='linear'), param_grid=param_grid_linear, cv=5)
grid_poly = GridSearchCV(estimator=SVC(kernel='poly'), param_grid=param_grid_poly, cv=5)
grid_rbf = GridSearchCV(estimator=SVC(kernel='rbf'), param_grid=param_grid_rbf, cv=5)

# Run the searches on the scaled training split: linear, polynomial, RBF.
grid_linear.fit(X=X_train, y=y_train)
grid_poly.fit(X=X_train, y=y_train)
grid_rbf.fit(X=X_train, y=y_train)


In [None]:
# Read the winning hyper-parameters and the matching best cross-validated
# score from each fitted search.
best_params_linear, best_score_linear = grid_linear.best_params_, grid_linear.best_score_
best_params_poly, best_score_poly = grid_poly.best_params_, grid_poly.best_score_
best_params_rbf, best_score_rbf = grid_rbf.best_params_, grid_rbf.best_score_

# Report one (parameters, score) pair per kernel; the leading "\n" on the
# later labels separates the kernels with a blank line.
search_summaries = (
    ("Best parameters (Linear Kernel):", best_params_linear,
     "Best score (Linear Kernel):", best_score_linear),
    ("\nBest parameters (Polynomial Kernel):", best_params_poly,
     "Best score (Polynomial Kernel):", best_score_poly),
    ("\nBest parameters (RBF Kernel):", best_params_rbf,
     "Best score (RBF Kernel):", best_score_rbf),
)
for params_label, params_value, score_label, score_value in search_summaries:
    print(params_label, params_value)
    print(score_label, score_value)


In [None]:
# Refit each kernel with its best hyper-parameters on the scaled data and
# record, per kernel, the time to fit, the time to score, and the score on the
# held-out test split.
fit_times = []
score_times = []
test_scores = []

# Initialize SVM models with the best parameters found by the grid searches
svc_linear = SVC(kernel='linear', **best_params_linear)
svc_poly = SVC(kernel='poly', **best_params_poly)
svc_rbf = SVC(kernel='rbf', **best_params_rbf)

# Fit and evaluate the models (order: linear, polynomial, RBF).
# perf_counter is used because it is a monotonic, high-resolution clock meant
# for measuring elapsed intervals.
for model in [svc_linear, svc_poly, svc_rbf]:
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    fit_times.append(time.perf_counter() - start_time)

    start_time = time.perf_counter()
    score = model.score(X_test, y_test)
    score_times.append(time.perf_counter() - start_time)
    # Keep the held-out score; it used to be computed and then thrown away.
    test_scores.append(score)

linear_score, poly_score, rbf_score = test_scores

# "Best score" is the cross-validated score from the grid search;
# "Test score" is the score of the refitted model on the held-out split.
print("Best score (Linear Kernel):", best_score_linear)
print("Test score (Linear Kernel):", linear_score)
print("Time taken to fit (Linear Kernel):", fit_times[0])
print("Time taken to score (Linear Kernel):", score_times[0])

print("\nBest score (Polynomial Kernel):", best_score_poly)
print("Test score (Polynomial Kernel):", poly_score)
print("Time taken to fit (Polynomial Kernel):", fit_times[1])
print("Time taken to score (Polynomial Kernel):", score_times[1])

print("\nBest score (RBF Kernel):", best_score_rbf)
print("Test score (RBF Kernel):", rbf_score)
print("Time taken to fit (RBF Kernel):", fit_times[2])
print("Time taken to score (RBF Kernel):", score_times[2])