# Statistical Analysis of Prior Commitments Dataset

This notebook performs comprehensive statistical analysis on criminal prior commitments data, including:

- **Descriptive Statistics**: Overview of the dataset structure and basic metrics
- **Categorical Analysis**: Distribution of offense types, relationships, and prison status
- **Time-Based Analysis**: Temporal patterns in offenses and time served
- **Statistical Tests**: Chi-square, t-tests, correlation analysis, and Kruskal-Wallis tests
- **Recidivism Analysis**: Patterns of repeat offenses

---

## Setup and Dependencies

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency, ttest_ind, mannwhitneyu, kruskal
import warnings
# Suppress every warning for the rest of the session to keep cell output compact.
# NOTE(review): this is a blanket filter, so library warnings are hidden as well.
warnings.simplefilter('ignore')

# Plot defaults applied to all figures created below
sns.set_style("whitegrid")
plt.rc('figure', figsize=(12, 6))

print("Libraries loaded successfully!")

## Load Dataset

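The dataset (`prior_commitments.csv`) should be placed in the `data/` folder at the project root. The default path below (`../data/prior_commitments.csv`) assumes the notebook is run from a directory one level below the project root.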

In [None]:
# Location of the raw CSV, relative to the notebook's working directory.
# Change this when running the notebook from a different directory.
DATA_PATH = '../data/prior_commitments.csv'

print("Loading dataset...")
df = pd.read_csv(DATA_PATH)

# df.shape is (number of rows, number of columns)
n_records, n_columns = df.shape
print(f"\nDataset loaded successfully!")
print(f"Total records: {n_records:,}")
print(f"Total columns: {n_columns}")

---
## 1. Descriptive Statistics

Initial exploration of the dataset structure, data types, and basic statistics.

In [None]:
# Dataset structure: column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
print("Dataset Info:")
df.info()

In [None]:
# Show the leading rows so the column layout can be checked by eye
print("First few rows:")
df.head(5)

In [None]:
# List every column label exactly as it was read from the file
print("Column names:")
print(list(df.columns))

In [None]:
# Count and percentage of missing values per column; only columns that
# actually have missing entries are printed
print("Missing values per column:")
n_rows = len(df)
missing = df.isna().sum()
missing_pct = missing.div(n_rows).mul(100).round(2)
missing_df = pd.DataFrame({'Missing Count': missing, 'Missing %': missing_pct})
has_gaps = missing_df['Missing Count'].gt(0)
print(missing_df.loc[has_gaps])

In [None]:
# Summary statistics table; describe() is called with its default settings
print("Basic statistics for numerical columns:")
numeric_summary = df.describe()
numeric_summary

---
## 2. Categorical Variable Analysis

Examining the distribution of categorical variables in the dataset.

In [None]:
# Categorical columns to profile; names that are absent from df are skipped
categorical_cols = ['sentencing county', 'offense', 'offense description',
                    'offense category', 'in prison', 'relationship']

banner = '=' * 50
for col in (name for name in categorical_cols if name in df.columns):
    # Frequencies are sorted descending, so head(10) is the ten most common values
    counts = df[col].value_counts()
    print(f"\n{banner}")
    print(col.upper())
    print(banner)
    print("\nTop 10 values:")
    print(counts.head(10))
    print(f"\nUnique values: {df[col].nunique()}")

---
## 3. Time-Based Analysis

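Analyzing temporal patterns in the data, including offense dates and time served. Here "time served" is computed as the number of days between the offense begin date and the release date.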

In [None]:
# Parse the date columns that exist in df. Values that cannot be parsed become
# NaT (errors='coerce') instead of raising.
date_cols = ['offense begin date', 'offense end date', 'release date']
present_date_cols = [name for name in date_cols if name in df.columns]
for col in present_date_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce')
    print(f"Converted '{col}' to datetime")

In [None]:
# Extract the year from 'offense begin date' and chart offense counts per year
if 'offense begin date' in df.columns:
    df['offense_year'] = df['offense begin date'].dt.year

    print("Offenses by Year (Last 20 years):")
    # Keep the 20 most recent years that have at least one record
    yearly_counts = df['offense_year'].value_counts().sort_index().tail(20)
    # .dt.year is float whenever the column contains NaT, which would label
    # years as "2015.0". value_counts() has already dropped the missing
    # years, so the index can be cast to int safely.
    yearly_counts.index = yearly_counts.index.astype(int)
    print(yearly_counts)

    if yearly_counts.empty:
        # A bar plot of an empty Series raises, so report instead of plotting
        print("No valid offense begin dates available - skipping plot.")
    else:
        # Visualization
        plt.figure(figsize=(14, 6))
        yearly_counts.plot(kind='bar', color='steelblue', edgecolor='black')
        plt.title('Number of Offenses by Year', fontsize=14)
        plt.xlabel('Year')
        plt.ylabel('Count')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

In [None]:
# Derive the number of days between offense begin date and release date,
# then summarise and plot its distribution
if {'offense begin date', 'release date'}.issubset(df.columns):
    elapsed = df['release date'] - df['offense begin date']
    df['days_served'] = elapsed.dt.days

    print("Time Served Statistics (in days):")
    print(df['days_served'].describe())

    # Cap values at the 95th percentile so extreme values do not stretch the x-axis
    cap_95 = df['days_served'].quantile(0.95)
    capped_days = df['days_served'].dropna().clip(upper=cap_95)

    # Visualization
    plt.figure(figsize=(12, 5))
    capped_days.hist(bins=50, color='steelblue', edgecolor='black')
    plt.title('Distribution of Days Served (95th percentile cap)', fontsize=14)
    plt.xlabel('Days Served')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()

---
## 4. Chi-Square Tests

Testing for associations between categorical variables using Chi-square tests of independence.

In [None]:
# Test 1: Offense Category vs Relationship Type
# Chi-square test of independence on the cross-tabulation of the two columns.
if 'offense category' in df.columns and 'relationship' in df.columns:
    print("Chi-Square Test: Offense Category vs Relationship Type")
    print("="*60)

    contingency_table = pd.crosstab(df['offense category'], df['relationship'])

    # chi2_contingency raises on an empty table, and a table with a single
    # row or column has 0 degrees of freedom, so there is nothing to test.
    if min(contingency_table.shape) < 2:
        print(f"\nSkipped: need at least two observed levels of each variable "
              f"(table shape is {contingency_table.shape}).")
    else:
        chi2, p_value, dof, expected = chi2_contingency(contingency_table)

        print(f"\nChi-square statistic: {chi2:.4f}")
        print(f"P-value: {p_value:.4e}")
        print(f"Degrees of freedom: {dof}")
        print(f"\nResult: {'Statistically Significant' if p_value < 0.05 else 'Not Statistically Significant'} at α=0.05")

        # The chi-square approximation is usually considered reliable only when
        # expected cell counts are at least 5, so report how many cells fall short.
        low_expected = int((expected < 5).sum())
        if low_expected > 0:
            print(f"Caution: {low_expected} of {expected.size} cells have an expected count below 5; "
                  "the chi-square approximation may be unreliable.")

        print("\nContingency Table:")
        display(contingency_table)

In [None]:
# Test 2: Offense Category vs In Prison Status
if {'offense category', 'in prison'}.issubset(df.columns):
    print("Chi-Square Test: Offense Category vs In Prison Status")
    print("="*60)

    # Keep only rows whose 'in prison' value is neither missing nor an empty string
    in_prison = df['in prison']
    df_filtered = df[in_prison.notna() & in_prison.ne('')]

    if not df_filtered.empty:
        contingency_table2 = pd.crosstab(df_filtered['offense category'], df_filtered['in prison'])
        chi2_2, p_value_2, dof_2, expected_2 = chi2_contingency(contingency_table2)

        verdict_2 = 'Statistically Significant' if p_value_2 < 0.05 else 'Not Statistically Significant'
        print(f"\nChi-square statistic: {chi2_2:.4f}")
        print(f"P-value: {p_value_2:.4e}")
        print(f"Degrees of freedom: {dof_2}")
        print(f"\nResult: {verdict_2} at α=0.05")

---
## 5. T-Tests

Comparing mean time served between different offense categories using independent samples t-tests.

In [None]:
# Compare mean days served between two offense categories with an
# independent-samples t-test (Welch's variant).
if 'days_served' in df.columns and 'offense category' in df.columns:
    print("T-Test: Days Served - Drug Crimes vs Property Crimes")
    print("="*60)

    drug_crimes = df[df['offense category'] == 'Drug Crimes']['days_served'].dropna()
    property_crimes = df[df['offense category'] == 'Property Crimes']['days_served'].dropna()

    # A variance estimate needs at least two observations in each group
    if len(drug_crimes) >= 2 and len(property_crimes) >= 2:
        # equal_var=False selects Welch's t-test, which does not assume the two
        # groups share a common variance; the pooled-variance default is
        # unreliable when group sizes and spreads differ.
        t_stat, p_value_t = ttest_ind(drug_crimes, property_crimes, equal_var=False)

        for label, sample in (('Drug Crimes', drug_crimes), ('Property Crimes', property_crimes)):
            print(f"\n{label}:")
            print(f"  - Sample size: {len(sample):,}")
            print(f"  - Mean days served: {sample.mean():.2f}")
            print(f"  - Std deviation: {sample.std():.2f}")

        print(f"\nT-statistic: {t_stat:.4f}")
        print(f"P-value: {p_value_t:.4e}")
        print(f"\nResult: {'Statistically Significant difference' if p_value_t < 0.05 else 'No statistically significant difference'} at α=0.05")
    else:
        # Make the skip visible instead of printing only the header; this also
        # surfaces category labels that do not match the data.
        print(f"\nSkipped: not enough observations "
              f"(Drug Crimes n={len(drug_crimes)}, Property Crimes n={len(property_crimes)}).")

---
## 6. Correlation Analysis

Examining relationships between numerical variables, particularly the relationship between prior commitments and time served.

In [None]:
# Count prior commitments (rows) per individual, keyed by 'cdcno'
if 'cdcno' in df.columns:
    prior_counts = df.groupby('cdcno').size().reset_index(name='prior_commitment_count')

    print("Prior Commitment Count Statistics:")
    print(prior_counts['prior_commitment_count'].describe())

    # Attach the per-person count to every row by lookup instead of merging.
    # Re-running a merge would create 'prior_commitment_count_x'/'_y'
    # columns, after which any check for 'prior_commitment_count' fails.
    # Plain column assignment is idempotent and leaves df's index untouched.
    # Rows with a missing 'cdcno' get NaN, as they would with a left merge.
    counts_by_id = prior_counts.set_index('cdcno')['prior_commitment_count']
    df['prior_commitment_count'] = df['cdcno'].map(counts_by_id)

In [None]:
# Correlation between prior commitments and time served
if {'prior_commitment_count', 'days_served'}.issubset(df.columns):
    print("Correlation Analysis: Prior Commitments vs Days Served")
    print("="*60)

    pair_cols = ['prior_commitment_count', 'days_served']

    # Pearson correlation matrix of the two columns
    correlation = df[pair_cols].corr()
    print("\nPearson Correlation Matrix:")
    print(correlation)

    # Spearman (rank-based, non-parametric) on rows where both values are present
    df_corr = df[pair_cols].dropna()
    spearman_corr, spearman_p = stats.spearmanr(df_corr[pair_cols[0]], df_corr[pair_cols[1]])

    spearman_verdict = ('Statistically Significant correlation' if spearman_p < 0.05
                        else 'No statistically significant correlation')
    print(f"\nSpearman Correlation: {spearman_corr:.4f}")
    print(f"P-value: {spearman_p:.4e}")
    print(f"\nResult: {spearman_verdict} at α=0.05")

---
## 7. Kruskal-Wallis Test

Non-parametric test to compare days served across multiple offense categories (alternative to one-way ANOVA when assumptions are violated).

In [None]:
# Kruskal-Wallis H-test of days served across all offense categories
if {'days_served', 'offense category'}.issubset(df.columns):
    print("Kruskal-Wallis Test: Days Served across Offense Categories")
    print("="*60)

    # One sample of non-missing days_served per category, in order of first
    # appearance; categories without any usable value are left out
    categories = df['offense category'].dropna().unique()
    samples_by_cat = {
        cat: df.loc[df['offense category'] == cat, 'days_served'].dropna()
        for cat in categories
    }
    non_empty = {cat: sample for cat, sample in samples_by_cat.items() if len(sample) > 0}
    group_names = list(non_empty.keys())
    groups = list(non_empty.values())

    # The test needs at least two groups to compare
    if len(groups) > 1:
        h_stat, p_value_kw = kruskal(*groups)

        kw_verdict = ('Statistically Significant difference' if p_value_kw < 0.05
                      else 'No statistically significant difference')
        print(f"\nNumber of groups: {len(groups)}")
        print(f"H-statistic: {h_stat:.4f}")
        print(f"P-value: {p_value_kw:.4e}")
        print(f"\nResult: {kw_verdict} at α=0.05")

        # Median days served per category, largest first
        print("\nMedian Days Served by Category:")
        median_by_cat = (
            df.groupby('offense category')['days_served']
            .median()
            .sort_values(ascending=False)
        )
        print(median_by_cat)

---
## 8. Distribution Analysis

Visualizing the distribution of key categorical variables.

In [None]:
# Frequency table and horizontal bar chart of offense categories
if 'offense category' in df.columns:
    print("Offense Category Distribution")
    print("="*60)

    # Percentages are taken over all rows of df, so rows with a missing
    # category still count in the denominator
    total_rows = len(df)
    offense_dist = df['offense category'].value_counts()
    offense_pct = offense_dist.div(total_rows).mul(100).round(2)

    dist_df = pd.DataFrame({'Count': offense_dist, 'Percentage': offense_pct})
    print(dist_df)

    # Visualization
    plt.figure(figsize=(12, 6))
    offense_dist.plot.barh(color='steelblue', edgecolor='black')
    plt.title('Distribution of Offense Categories', fontsize=14)
    plt.xlabel('Count')
    plt.ylabel('Offense Category')
    plt.tight_layout()
    plt.show()

In [None]:
# Frequency table of relationship types and bar chart of the ten most common
if 'relationship' in df.columns:
    print("Relationship Type Distribution")
    print("="*60)

    # Percentages are taken over all rows of df, so rows with a missing
    # relationship still count in the denominator
    total_rows = len(df)
    relationship_dist = df['relationship'].value_counts()
    relationship_pct = relationship_dist.div(total_rows).mul(100).round(2)

    dist_df = pd.DataFrame({'Count': relationship_dist, 'Percentage': relationship_pct})
    print(dist_df)

    # Visualization: value_counts() is sorted descending, so head(10) is the top ten
    top_relationships = relationship_dist.head(10)
    plt.figure(figsize=(10, 6))
    top_relationships.plot.bar(color='coral', edgecolor='black')
    plt.title('Top 10 Relationship Types', fontsize=14)
    plt.xlabel('Relationship Type')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

---
## 9. Recidivism Analysis

Analyzing patterns of repeat offenses among individuals in the dataset.

In [None]:
# Per-individual summary: number of records, mean days served and the most
# frequent offense category, followed by the distribution of record counts.
if 'cdcno' in df.columns:
    print("Recidivism Analysis")
    print("="*60)

    def _modal_category(values):
        """Return the most frequent non-null value, or None if there is none."""
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else None

    by_person = df.groupby('cdcno')
    recidivism_data = pd.DataFrame({
        # size() counts every row for the person. Counting non-null
        # 'offense begin date' values would silently drop records whose
        # date is missing or could not be parsed.
        'total_offenses': by_person.size(),
        # Optional inputs: fall back to an empty column rather than raising
        # KeyError when an upstream column was never created.
        'avg_days_served': (by_person['days_served'].mean()
                            if 'days_served' in df.columns else np.nan),
        'most_common_offense': (by_person['offense category'].agg(_modal_category)
                                if 'offense category' in df.columns else None),
    }).reset_index()

    print(f"\nTotal unique individuals: {len(recidivism_data):,}")
    print(f"Average offenses per person: {recidivism_data['total_offenses'].mean():.2f}")
    print(f"Median offenses per person: {recidivism_data['total_offenses'].median():.2f}")
    print(f"Max offenses by single individual: {recidivism_data['total_offenses'].max()}")

    print("\nDistribution of repeat offenses:")
    # Number of individuals at each of the ten smallest offense counts
    repeat_dist = recidivism_data['total_offenses'].value_counts().sort_index().head(10)
    print(repeat_dist)

    if repeat_dist.empty:
        # A bar plot of an empty Series raises, so report instead of plotting
        print("No individuals with a valid 'cdcno' - skipping plot.")
    else:
        # Visualization
        plt.figure(figsize=(12, 5))
        repeat_dist.plot(kind='bar', color='darkgreen', edgecolor='black')
        plt.title('Distribution of Number of Offenses per Individual', fontsize=14)
        plt.xlabel('Number of Offenses')
        plt.ylabel('Number of Individuals')
        plt.tight_layout()
        plt.show()

---
## 10. Summary Report

Consolidated summary of key findings from the analysis.

In [None]:
print("="*80)
print("SUMMARY REPORT")
print("="*80)


def _format_date_range(dates):
    """Return 'YYYY-MM-DD to YYYY-MM-DD' for the valid dates, or 'N/A' if none.

    The values are re-parsed with errors='coerce', so this works even if the
    column was never converted to datetime, and NaT never reaches strftime
    (which would raise).
    """
    valid = pd.to_datetime(dates, errors='coerce').dropna()
    if valid.empty:
        return 'N/A'
    return f"{valid.min().strftime('%Y-%m-%d')} to {valid.max().strftime('%Y-%m-%d')}"


def _top_value_or_na(values):
    """Return the most frequent non-null value, or 'N/A' when all are missing."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else 'N/A'


# Create summary dictionary; every entry falls back to 'N/A' when its source
# column is absent from df
summary = {
    'Total Records': f"{len(df):,}",
    'Unique Individuals': f"{df['cdcno'].nunique():,}" if 'cdcno' in df.columns else 'N/A',
    'Date Range': _format_date_range(df['offense begin date']) if 'offense begin date' in df.columns else 'N/A',
    'Most Common Offense Category': _top_value_or_na(df['offense category']) if 'offense category' in df.columns else 'N/A',
    'Average Days Served': f"{df['days_served'].mean():.2f}" if 'days_served' in df.columns else 'N/A',
    'Median Days Served': f"{df['days_served'].median():.2f}" if 'days_served' in df.columns else 'N/A'
}

for key, value in summary.items():
    print(f"{key}: {value}")

print("\n" + "="*80)
print("ANALYSIS COMPLETE")
print("="*80)