In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import mlflow
import mlflow.sklearn


# Function to swap jail dates if needed
def swap_jail_dates_if_needed(row):
    """Return *row* with the jail entry/release dates in chronological order.

    Both values are parsed with ``pd.to_datetime`` for the comparison only;
    when the release date is earlier than the entry date the two ORIGINAL
    (unparsed) values are exchanged.

    Args:
        row: A record (e.g. one row handed over by ``DataFrame.apply(axis=1)``)
            holding 'current_jail_entry_date' and 'current_jail_release_date'.

    Returns:
        The same object when no swap is needed, otherwise a copy of it with
        the two date values exchanged. The input is never modified.
    """
    entry_key = 'current_jail_entry_date'
    release_key = 'current_jail_release_date'
    entry_value, release_value = row[entry_key], row[release_key]

    # A missing date parses to NaT, and comparisons against NaT are False,
    # so rows with a missing date fall through unchanged.
    if pd.to_datetime(release_value) < pd.to_datetime(entry_value):
        # Work on a copy: pandas does not support functions that mutate the
        # object passed to them by apply().
        row = row.copy()
        row[entry_key], row[release_key] = release_value, entry_value
    return row

# Load and prepare the dataset
df = pd.read_csv('current_data/Cases.csv')
# Row-wise pass: any row whose release date precedes its entry date gets the two swapped.
df = df.apply(swap_jail_dates_if_needed, axis=1)

# Create new features
df['total_juvenile_offenses'] = (
    df['juvenile_felony_count']
    + df['juvenile_misdemeanor_count']
    + df['juvenile_other_offense_count']
)
df['current_jail_entry_date'] = pd.to_datetime(df['current_jail_entry_date'])
df['current_jail_release_date'] = pd.to_datetime(df['current_jail_release_date'])
# Whole days between jail entry and release; NaN where either date is missing.
df['detention_period'] = (df['current_jail_release_date'] - df['current_jail_entry_date']).dt.days

# Assign the filled column back instead of calling fillna(inplace=True) on
# df['detention_period']: the chained in-place form operates on a temporary
# under pandas copy-on-write and would leave the NaNs in place.
df['detention_period'] = df['detention_period'].fillna(0)
# Drop rows whose target is -1 (presumably "label unknown" -- confirm against the data dictionary).
# .copy() makes the filtered frame independent, so the column assignments
# that follow do not raise SettingWithCopyWarning.
df = df[df["is_recidivist"] != -1].copy()

# Encode categorical features: one LabelEncoder per column, kept so the same
# mapping can be reused at prediction time.
categorical_features = ['sex', 'current_charge_degree']
label_encoders = {feature: LabelEncoder() for feature in categorical_features}
for feature, le in label_encoders.items():
    df[feature] = le.fit_transform(df[feature])

# Scale numerical features
# NOTE: the scaler is fitted on the full frame, i.e. before the train/test split below.
numerical_features = ['age','detention_period', 'total_juvenile_offenses', 'prior_offense_count']
scaler = StandardScaler().fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])

# Persist the encoded and scaled frame
df.to_csv('encoded.csv', index=False)

# Define target and features
y = df['is_recidivist']
X = df[['age', 'sex', 'total_juvenile_offenses', 'detention_period', 'prior_offense_count', 'current_charge_degree']]

# Split the dataset into training and testing sets (80% / 20%, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=28,
)

# Initialize and train the Random Forest Classifier; fit() returns the estimator itself
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=28,
).fit(x_train, y_train)
# Save and register the model with MLflow
def mlFlowVersioning(model, model_name):
    """Log *model* inside a fresh MLflow run and register it as *model_name*.

    The model is logged with ``mlflow.sklearn.log_model`` under the artifact
    path "model"; the resulting ``runs:/<run_id>/model`` URI is then handed
    to ``mlflow.register_model``.

    Args:
        model: The estimator passed to ``mlflow.sklearn.log_model``.
        model_name: Name passed to ``mlflow.register_model``.
    """
    with mlflow.start_run() as active_run:
        mlflow.sklearn.log_model(model, "model")
        run_id = active_run.info.run_id
        mlflow.register_model(f"runs:/{run_id}/model", model_name)

# Use "mlruns" as the MLflow tracking URI, then log and register the trained model.
mlflow.set_tracking_uri("mlruns")
mlFlowVersioning(model, "recidivist_random_forest_model")

# Evaluate the model on the held-out test split
y_pred = model.predict(x_test)

# Compute all three metrics first, then report them in the same order as before.
accuracy = accuracy_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
f1 = f1_score(y_test, y_pred, average='weighted')

print("Model Accuracy:", accuracy)
print("RMSE:", rmse)
print("F1 Score:", f1)

def predict_recidivism(input_data, model, scaler, label_encoders, categorical_features, numerical_features):
    """Predict the target class for a single raw (unencoded, unscaled) record.

    Args:
        input_data: Mapping of column name to value for ONE record; it is
            turned into a one-row DataFrame.
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        scaler: Fitted transformer whose ``transform`` is applied to the
            ``numerical_features`` columns.
        label_encoders: Dict mapping each categorical feature name to a fitted
            encoder exposing ``classes_`` and ``transform``.
        categorical_features: Names of the columns to label-encode. Columns
            absent from ``input_data`` are skipped.
        numerical_features: Names of the columns to scale; all must be present.

    Returns:
        A ``(prediction, probability)`` tuple: the predicted label for the
        record and the first row of ``model.predict_proba``.

    Raises:
        KeyError: If a numerical feature or a model input column is missing
            from ``input_data``.
    """
    input_df = pd.DataFrame([input_data])

    # Handle categorical features
    for feature in categorical_features:
        if feature not in input_df.columns:
            continue

        # Test for "numeric" rather than "dtype == object": when pandas uses
        # its dedicated string dtype, text columns are not object-typed and
        # would otherwise be sent to astype(int) and fail.
        if pd.api.types.is_numeric_dtype(input_df[feature]):
            # Already a number: assume it is correctly encoded.
            input_df[feature] = input_df[feature].astype(int)
            continue

        encoder = label_encoders[feature]
        if input_df[feature].iloc[0] not in encoder.classes_:
            print(f"Warning: The input data for {feature} contains unseen labels.")
            # Fall back to the encoder's first known class so transform() succeeds.
            input_df[feature] = encoder.classes_[0]
        input_df[feature] = encoder.transform(input_df[feature])

    # Scale numerical features
    input_df[numerical_features] = scaler.transform(input_df[numerical_features])

    # Ensure the input DataFrame has the same columns, in the same order, as
    # the training data. Prefer the names recorded on the fitted model and
    # fall back to the training column list when the model exposes none.
    model_columns = getattr(model, "feature_names_in_", None)
    if model_columns is None:
        model_columns = ['age', 'sex', 'total_juvenile_offenses', 'detention_period', 'prior_offense_count', 'current_charge_degree']
    input_df = input_df[list(model_columns)]

    # Make prediction
    prediction = model.predict(input_df)
    probability = model.predict_proba(input_df)[0]

    return prediction[0], probability

# Example usage: one raw record; 'sex' and 'current_charge_degree' are given
# as labels and encoded inside predict_recidivism.
input_data = dict(
    age=35,
    sex='Male',
    total_juvenile_offenses=8,
    detention_period=120,
    prior_offense_count=6,
    current_charge_degree='F1',
)

# Make sure these variables are defined in your main code
categorical_features = ['sex', 'current_charge_degree']
numerical_features = ['age', 'detention_period', 'total_juvenile_offenses', 'prior_offense_count']

prediction, probability = predict_recidivism(
    input_data=input_data,
    model=model,
    scaler=scaler,
    label_encoders=label_encoders,
    categorical_features=categorical_features,
    numerical_features=numerical_features,
)
print(f"Prediction (1 means recidivist, 0 means not recidivist): {prediction}")
# probability[1] is the second entry of the predict_proba row.
print(f"Probability of being a recidivist: {probability[1]:.2f}")

Registered model 'recidivist_random_forest_model' already exists. Creating a new version of this model...
Created version '55' of model 'recidivist_random_forest_model'.


Model Accuracy: 0.6860183841315917
RMSE: 0.5603406248599224
F1 Score: 0.6536500755314237
Prediction (1 means recidivist, 0 means not recidivist): 1
Probability of being a recidivist: 0.54
