In [1]:
# Use the explicit %pip magic: it installs into the environment of the running
# kernel and does not depend on IPython's automagic setting being enabled.
%pip install -U scikit-learn



In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib
from datetime import datetime
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


**Data cleaning and preprocessing functions**

In [3]:
def clean_fine(value):
    """Convert a raw fine value to a float.

    Missing values and the 'MA' placeholder map to 0.0. Otherwise the value is
    stringified, the '$', 'MA' and ',' fragments are removed, surrounding
    whitespace is stripped, and the remainder is parsed as a float.

    Args:
        value: Raw cell content (string, number, or a missing marker).

    Returns:
        float: The parsed amount, or 0.0 when nothing numeric is left.

    Raises:
        ValueError: If text remains after cleaning that float() cannot parse.
    """
    # Always return a float so callers never see a mix of int 0 and float values.
    if pd.isna(value):
        return 0.0
    cleaned = str(value).replace('$', '').replace('MA', '').replace(',', '').strip()
    # A bare 'MA' (or an empty cell) collapses to '' after cleaning.
    return float(cleaned) if cleaned else 0.0

def convert_yes_no_to_bool(value):
    """Convert a yes/no style value to a boolean.

    Args:
        value: A string such as 'Yes' / 'No', a missing marker, or any other
            object.

    Returns:
        bool: For strings, True only when the text equals 'yes' ignoring case
        and surrounding whitespace. Missing scalars (None, NaN, pd.NA) give
        False. Anything else falls back to ``bool(value)``.
    """
    if isinstance(value, str):
        return value.strip().lower() == 'yes'
    # bool(float('nan')) is True, so missing cells must be caught explicitly;
    # otherwise every blank flag would be counted as a "yes".
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return False
    return bool(value)

def load_and_clean_data(filepath):
    """Load the traffic violation dataset and normalise its fine columns.

    Steps performed:
      * read the CSV at ``filepath`` with pandas;
      * parse 'Date Of Stop' as ``%m/%d/%Y`` dates;
      * run 'Fine' and 'Contr.Acc Fine' through ``clean_fine``;
      * add 'Total_Fine', which is 'Contr.Acc Fine' on rows flagged in
        'Contributed To Accident' and 'Fine' on every other row.

    Args:
        filepath: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The loaded frame with the cleaned and added columns.
    """
    df = pd.read_csv(filepath)
    df['Date Of Stop'] = pd.to_datetime(df['Date Of Stop'], format='%m/%d/%Y')
    df['Fine'] = df['Fine'].apply(clean_fine)
    df['Contr.Acc Fine'] = df['Contr.Acc Fine'].apply(clean_fine)

    # The flag may arrive as real booleans or as 'Yes'/'No' text; comparing with
    # `== True` only recognises the former. Missing flags are masked out
    # explicitly so they always select the plain 'Fine'.
    raw_flag = df['Contributed To Accident']
    contributed = raw_flag.notna() & raw_flag.apply(convert_yes_no_to_bool).astype(bool)

    # Vectorised selection: keep 'Contr.Acc Fine' where flagged, else use 'Fine'.
    df['Total_Fine'] = df['Contr.Acc Fine'].where(contributed, df['Fine'])
    return df


**Feature engineering and preparation**

In [4]:
def prepare_features_for_prediction(df):
    """Build the scaled feature matrix, target and fitted transformers.

    The input frame is copied, not modified. Every transformer (TF-IDF, KMeans,
    label encoders, scaler) is fitted on the full frame passed in, so any
    train/test split done afterwards sees statistics from all rows.

    Args:
        df: Frame containing the yes/no flag columns listed in
            ``boolean_columns`` plus 'Description', 'Time Of Stop', 'Latitude',
            'Longitude', 'Driver State', 'DL State', 'Violation Type',
            'VehicleType', 'SubAgency', 'Make', 'Color', 'Manufacture Year',
            'Points' and 'Fine'.

    Returns:
        dict with the keys:
            'features': scaled feature DataFrame, indexed like ``df``;
            'target': 'Fine' floor-divided by 10 ('Fine_Category');
            'label_encoders': fitted LabelEncoder per categorical column;
            'tfidf': fitted TfidfVectorizer;
            'scaler': fitted StandardScaler;
            'location_model': fitted KMeans;
            'feature_names': column order of 'features';
            'boolean_columns': names of the yes/no columns that were converted.
    """
    features_df = df.copy()

    # Define boolean columns
    boolean_columns = [
        'Accident', 'Personal Injury', 'Property Damage',
        'Fatal', 'Work Zone', 'Alcohol', 'HAZMAT',
        'Commercial License', 'Commercial Vehicle'
    ]

    # Convert boolean columns
    for col in boolean_columns:
        features_df[col] = features_df[col].apply(convert_yes_no_to_bool)

    # Process text descriptions. The TF-IDF frame reuses the input index so the
    # later column-wise concat lines rows up by label instead of by accident.
    tfidf = TfidfVectorizer(max_features=250, stop_words='english')
    description_features = tfidf.fit_transform(features_df['Description'].fillna(''))
    description_df = pd.DataFrame(
        description_features.toarray(),
        columns=[f'desc_{i}' for i in range(description_features.shape[1])],
        index=features_df.index
    )

    # Time features
    features_df['Time_Hour'] = pd.to_datetime(features_df['Time Of Stop']).dt.hour

    # Location clustering. Only rows with both coordinates can be clustered, so
    # the labels are written back through the same mask; rows without a
    # location get the sentinel -1 rather than breaking the assignment with a
    # length mismatch.
    has_location = features_df[['Latitude', 'Longitude']].notna().all(axis=1)
    location_data = features_df.loc[has_location, ['Latitude', 'Longitude']]
    kmeans_loc = KMeans(n_clusters=5, random_state=42)
    features_df['Location_Cluster'] = -1
    features_df.loc[has_location, 'Location_Cluster'] = kmeans_loc.fit_predict(location_data)

    # Additional feature engineering
    for col in boolean_columns:
        features_df[f'{col}_Flag'] = features_df[col].astype(int)

    # Severity counts only the first seven flags; the two commercial flags are
    # folded into Is_Commercial instead.
    features_df['Severity_Score'] = features_df[[f'{col}_Flag' for col in boolean_columns[:7]]].sum(axis=1)
    features_df['Is_Commercial'] = (features_df['Commercial License'] | features_df['Commercial Vehicle']).astype(int)
    features_df['Is_Local'] = (features_df['Driver State'] == features_df['DL State']).astype(int)

    # Encode categorical features
    categorical_columns = ['Violation Type', 'VehicleType', 'SubAgency', 'Make', 'Color']
    le_dict = {}
    for col in categorical_columns:
        le = LabelEncoder()
        features_df[f'{col}_Encoded'] = le.fit_transform(features_df[col].astype(str))
        le_dict[col] = le

    # Vehicle age calculation (relative to the year in which this code runs)
    current_year = pd.Timestamp.now().year
    features_df['Vehicle_Age'] = current_year - pd.to_numeric(features_df['Manufacture Year'], errors='coerce')

    # Combine features
    base_features = [
        'Time_Hour', 'Location_Cluster', 'Severity_Score', 'Is_Commercial',
        'Is_Local', 'Points', 'Vehicle_Age', 'Violation Type_Encoded',
        'VehicleType_Encoded', 'SubAgency_Encoded', 'Make_Encoded', 'Color_Encoded'
    ] + [f'{col}_Flag' for col in boolean_columns[:7]]

    X = pd.concat([features_df[base_features], description_df], axis=1)
    # NOTE(review): the target is derived from 'Fine', not from a combined
    # total-fine column -- confirm that this is the intended label.
    features_df['Fine_Category'] = features_df['Fine'] // 10

    # Scale features (remaining NaNs are replaced by 0 first)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X.fillna(0))

    return {
        # Keep the input index so features and target stay aligned by label.
        'features': pd.DataFrame(X_scaled, columns=X.columns, index=X.index),
        'target': features_df['Fine_Category'],
        'label_encoders': le_dict,
        'tfidf': tfidf,
        'scaler': scaler,
        'location_model': kmeans_loc,
        'feature_names': list(X.columns),
        'boolean_columns': boolean_columns
    }

**Model training**

In [5]:
class TrafficFinePredictorTrainer:
    """Train a random-forest classifier on the traffic violation data.

    The fitted model and the preprocessing objects needed to reuse it are
    collected in ``model_dict`` once ``train_model`` has run.
    """

    def __init__(self, data_path):
        """Remember where the data lives; nothing is loaded yet.

        Args:
            data_path: Location passed to ``load_and_clean_data`` at training
                time.
        """
        self.data_path = data_path
        # Filled by train_model(); stays None until training has completed.
        self.model_dict = None

    def train_model(self):
        """Train the traffic fine prediction model.

        Loads and cleans the data, builds features, holds out 20% of the rows,
        fits a RandomForestClassifier on the rest and scores it on the
        hold-out.

        Returns:
            dict: The bundle also stored on ``self.model_dict``. It holds the
            fitted 'model', the preprocessing objects ('label_encoders',
            'tfidf', 'scaler', 'location_model'), 'boolean_columns',
            'feature_names' (column order the model was trained on),
            'trained_date', the model's 'classes' and 'test_accuracy' (mean
            accuracy on the hold-out split).
        """
        df = load_and_clean_data(self.data_path)
        features_dict = prepare_features_for_prediction(df)

        X = features_dict['features']
        y = features_dict['target']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=30
        )

        model = RandomForestClassifier(
            n_estimators=1500,
            max_depth=550,
            min_samples_split=50,
            min_samples_leaf=20,
            class_weight='balanced',
            random_state=30
        )

        model.fit(X_train, y_train)

        # Put the hold-out split to use: record how the fitted model performs
        # on rows it has not seen, so the saved bundle carries its own quality
        # figure.
        test_accuracy = model.score(X_test, y_test)

        self.model_dict = {
            'model': model,
            'label_encoders': features_dict['label_encoders'],
            'tfidf': features_dict['tfidf'],
            'scaler': features_dict['scaler'],
            'location_model': features_dict['location_model'],
            'boolean_columns': features_dict['boolean_columns'],
            # Needed at prediction time to rebuild columns in training order.
            'feature_names': features_dict['feature_names'],
            'trained_date': datetime.now(),
            'classes': model.classes_,
            'test_accuracy': test_accuracy
        }

        return self.model_dict



**Save the trained model**

In [6]:
def save_model(self, path='fine_prediction_model.joblib'):
    """Write the trained model bundle to disk with joblib.

    Args:
        self: Trainer instance whose ``model_dict`` should be saved.
        path: Destination file; defaults to 'fine_prediction_model.joblib'.

    Nothing is written when ``self.model_dict`` is None, i.e. before training.
    """
    if self.model_dict is not None:
        joblib.dump(self.model_dict, path)


# This function is defined outside the class body, so it must be attached
# explicitly; without this binding `trainer.save_model()` raises AttributeError.
TrafficFinePredictorTrainer.save_model = save_model

In [None]:
# Build the trainer for the Maryland violations CSV, fit the model, then
# persist the resulting bundle using save_model's default path.
trainer = TrafficFinePredictorTrainer(data_path="Maryland_Traffic_Violation.csv")
model_dict = trainer.train_model()
trainer.save_model()