In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Categorical values that the synthetic cases are sampled from
sex_options = ['Male', 'Female']
race_options = ['African-American', 'Asian', 'Caucasian', 'Hispanic', 'Native American', 'Other']
charge_degree_options = ['F1', 'F2', 'F3', 'F5', 'F6', 'F7', 'M1', 'M2', 'MO3', 'X', 'CT', 'NI0', 'TCX', 'CO3']

# Number of synthetic rows to generate
n_rows = 10000

np.random.seed(42)  # For reproducibility

# NOTE: the draw order below determines the generated values for the fixed
# seed, so keep the columns in this order. randint's upper bound is exclusive.
data = {
    'age': np.random.randint(18, 65, size=n_rows),
    'sex': np.random.choice(sex_options, size=n_rows),
    'race': np.random.choice(race_options, size=n_rows),
    'total_juvenile_offenses': np.random.randint(0, 10, size=n_rows),
    'detention_period': np.random.randint(0, 365, size=n_rows),  # in days
    'prior_offense_count': np.random.randint(0, 15, size=n_rows),
    'current_charge_degree': np.random.choice(charge_degree_options, size=n_rows)
}

df = pd.DataFrame(data)

# A row is labelled recidivist when it matches ANY of these hand-written rules
conditions = [
    (df['age'] < 25) & (df['total_juvenile_offenses'] > 2) & (df['prior_offense_count'] > 5),
    (df['age'] < 40) & (df['detention_period'] > 100) & (df['prior_offense_count'] > 3),
    (df['total_juvenile_offenses'] > 5) & (df['prior_offense_count'] > 10)
]

# 1 (recidivist) if any rule matches, otherwise 0 (non-recidivist)
df['is_recidivist'] = np.logical_or.reduce(conditions).astype(int)

# Create the output directory first so the export works on a fresh checkout
output_path = Path('current_data/new_cases.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

# Last expression in the cell so the preview is actually rendered
df.head()

In [10]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Columns standardized below, and categorical columns replaced by integer codes
numerical_columns = ['age', 'total_juvenile_offenses', 'detention_period', 'prior_offense_count']
categorical_columns = ['sex', 'race', 'current_charge_degree']

# Fit one encoder per column and keep each of them: refitting a single shared
# encoder would discard the earlier columns' class mappings, making it
# impossible to inverse_transform them or to encode later data consistently.
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder
# After the loop `label_encoder` is the encoder fitted on
# 'current_charge_degree', the last column in the list.

scaler = StandardScaler()

# Apply standardization (statistics are computed from this frame only)
# NOTE(review): if these rows are later combined with data scaled elsewhere,
# confirm that both were standardized with the same parameters.
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

In [12]:
# Load the previously encoded partition that the new cases will be appended to
encoded_data_path = 'partitions/encoded_data.csv'
encoded_df = pd.read_csv(encoded_data_path)


In [16]:
# Stack the new cases under the existing partition and renumber the index.
# Columns present in only one of the two frames are filled with NaN.
frames_to_stack = [encoded_df, df]
combined_df = pd.concat(frames_to_stack, axis=0, ignore_index=True)


In [22]:
# Preview the last five rows, i.e. the most recently appended cases
combined_df.tail(n=5)

Unnamed: 0.1,Unnamed: 0,Case_id,name,sex,date_of_birth,age,race,juvenile_felony_count,risk_decile_score,juvenile_misdemeanor_count,...,recidivism_risk_decile_score,risk_score_category,screening_date,violence_risk_decile_score,violence_risk_score_category,age_bins,charge_category,sentence_type,total_juvenile_offenses,detention_period
20996,,,,1,,-0.597326,0,,,,...,,,,,,,,,-0.186498,-0.134233
20997,,,,1,,-0.893986,0,,,,...,,,,,,,,,0.517269,-0.48648
20998,,,,0,,-1.709802,5,,,,...,,,,,,,,,0.165385,0.846346
20999,,,,0,,-1.190647,3,,,,...,,,,,,,,,-0.538382,-0.410318
21000,,,,0,,-0.819821,2,,,,...,,,,,,,,,-1.594033,-0.048551


In [25]:
# Persist the combined frame without the index column.
# NOTE(review): this overwrites the same file that was read into `encoded_df`,
# so every run of the notebook appends another batch of rows to it.
combined_df.to_csv(path_or_buf='partitions/encoded_data.csv', index=False)
