<a href="https://colab.research.google.com/github/Priyadharshan-SC/Catalyst-Crew/blob/main/ROAD_SAFETY.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
# 'xgboost' is not available, so we use scikit-learn's GradientBoostingClassifier as a substitute (a similar algorithm, not an exact equivalent)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

def train_on_new_dataset(file_name="/content/Road.csv",
                         target_column="Accident_severity"):
    """
    Load a CSV dataset, preprocess it, and train/evaluate a Gradient
    Boosting classifier on it.

    The function reports progress and the final test accuracy via
    ``print``; it does not return the model.

    Args:
        file_name: Path of the CSV file to load (str or path-like).
            Defaults to the Colab upload location of 'Road.csv'.
        target_column: Name of the column to predict. Defaults to
            'Accident_severity'.

    Returns:
        None. On a load error, a missing target column, or a dataset with
        no labelled rows, an error message is printed and the function
        returns early.

    Raises:
        Errors raised by scikit-learn while splitting or fitting (for
        example when the dataset is too small to split) are not caught
        here and propagate to the caller.
    """
    # Local import so the block stays self-contained; only used to derive a
    # short display name for messages.
    from pathlib import Path

    display_name = Path(file_name).name

    # --- 1. Load Data ---
    try:
        df = pd.read_csv(file_name)
    except FileNotFoundError:
        print("--- ERROR ---")
        print(f"File not found: '{file_name}'")
        print(f"Please upload the '{display_name}' file from the Kaggle link first.")
        return
    except Exception as e:
        # Script-level boundary: any other read/parse failure is reported
        # to the user instead of surfacing as a traceback.
        print(f"Error loading CSV: {e}")
        return
    print(f"Successfully loaded '{file_name}'.")

    # --- 2. Preprocessing ---

    if target_column not in df.columns:
        print(f"Error: Target column '{target_column}' not found in the file.")
        return

    # Rows without a label cannot be used for supervised training; drop
    # them so a missing value is never treated as a class of its own.
    missing_target = df[target_column].isna()
    if missing_target.any():
        print(f"Dropping {int(missing_target.sum())} rows with a missing '{target_column}' value.")
        df = df.loc[~missing_target]

    if df.empty:
        print(f"Error: No rows with a '{target_column}' value to train on.")
        return

    # Encode the target variable (e.g., "Slight Injury", "Serious Injury")
    # as integer class ids.
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(df[target_column])

    # 'Time' is a string 'HH:MM:SS' that needs feature engineering, so it is
    # dropped for this baseline model. errors='ignore' makes the drop a
    # no-op when the column is absent.
    X = df.drop(columns=[target_column]).drop(columns=['Time'], errors='ignore')

    # Automatically find numeric and categorical feature names
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns

    print(f"Found {len(numeric_features)} numeric features.")
    print(f"Found {len(categorical_features)} categorical features.")

    # Numeric transformer: fills missing values with the median
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median'))
    ])

    # Categorical transformer: fills missing values with the most frequent
    # value and then one-hot encodes. Categories unseen during fit are
    # ignored at predict time instead of raising.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Bundle preprocessing for numeric and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'  # Columns of any other dtype are kept as-is
    )

    # --- 3. Define the Gradient Boosting Model ---

    model = GradientBoostingClassifier(
        n_estimators=150,       # Number of boosting stages (trees)
        max_depth=5,            # Max depth of each tree
        learning_rate=0.1,
        subsample=0.8,          # Fraction of rows used to fit each tree
        max_features=0.8,       # Fraction of features considered at each split
        random_state=42
    )

    # --- 4. Create and Train the Full Pipeline ---

    clf_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    # A stratified split needs at least two samples of every class; fall
    # back to a plain random split instead of crashing on a rare class.
    class_counts = pd.Series(y_encoded).value_counts()
    can_stratify = bool(class_counts.min() >= 2)
    if not can_stratify:
        print("Warning: a class has fewer than 2 samples; splitting without stratification.")

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y_encoded,
        test_size=0.2,
        random_state=42,
        stratify=y_encoded if can_stratify else None,
    )

    print(f"\nTraining the model on '{display_name}'...")
    clf_pipeline.fit(X_train, y_train)

    # --- 5. Evaluate the Model ---
    y_pred = clf_pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Accuracy band this baseline is expected to land in.
    target_low, target_high = 0.82, 0.88

    print("\n--- Model Evaluation ---")
    print(f"Target Accuracy Range: {target_low:.2%} - {target_high:.2%}")
    print(f"Final Model Accuracy: {accuracy * 100:.2f}%")

    if target_low <= accuracy <= target_high:
        print("✅ Success! Accuracy is within the target range.")
    else:
        print("\nThis is your baseline accuracy. Now you can tune it!")
        print("To improve accuracy, try increasing 'n_estimators' (e.g., to 250) or 'max_depth' (e.g., to 7).")
        print("If accuracy is too high (overfitting), try decreasing them.")

# --- Run the script ---
# Train and evaluate only when this file is executed directly (as a script
# or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    train_on_new_dataset()

Successfully loaded '/content/Road.csv'.
Found 2 numeric features.
Found 28 categorical features.

Training the model on 'Road.csv'...

--- Model Evaluation ---
Target Accuracy Range: 82.00% - 88.00%
Final Model Accuracy: 84.54%
✅ Success! Accuracy is within the target range.
