In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# -------------------------
# Data Preprocessing for Binary Classification
# -------------------------

# Columns to load from the CSV: the numeric feature columns plus the
# "Class" target. Adjust this list if the feature set changes.
cols_to_keep = [
    "Severity", "Start_Lat", "Start_Lng", "End_Lat", "End_Lng",
    "Distance(mi)", "Temperature(F)", "Wind_Chill(F)", "Humidity(%)",
    "Pressure(in)", "Visibility(mi)", "Wind_Speed(mph)", "Precipitation(in)",
    "Class",  # Target column for binary classification
]

# Load the CSV file using only the selected columns
df = pd.read_csv("Classifying_accidents-train.csv", usecols=cols_to_keep)

# Display an initial snapshot of the data
print("Initial Data Snapshot:")
print(df.head())

# --- Clean Numeric Columns ---
# Every column except the target is coerced to a numeric dtype (entries that
# cannot be parsed become NaN) and its missing values are then replaced with
# the column median.
numeric_cols = [col for col in cols_to_keep if col != "Class"]
for col in numeric_cols:
    numeric_values = pd.to_numeric(df[col], errors='coerce')
    median_val = numeric_values.median()
    # Assign the filled Series back to the frame instead of calling
    # `df[col].fillna(..., inplace=True)`: the in-place call operates on an
    # intermediate object, triggers a FutureWarning, and no longer updates
    # `df` once pandas treats that intermediate as a copy (pandas 3.0).
    df[col] = numeric_values.fillna(median_val)
    print(f"Filled missing values in '{col}' with median: {median_val}")

# --- Process Target Variable for Binary Classification ---
# Reduce the "Class" target to the labels 0 and 1.
#   * Non-numeric labels: the first distinct value encountered in the data
#     becomes 0 and every other distinct value becomes 1 (with exactly two
#     classes this is simply first -> 0, second -> 1).
#   * Numeric labels with more than two distinct values: values strictly above
#     the median become 1, the rest 0.
#   * Numeric labels with at most two distinct values are left unchanged.
# NOTE(review): the 0/1 assignment for text labels depends on row order, and a
# missing label counts as its own distinct value — confirm this is intended.
#
# The dtype test uses `is_numeric_dtype` rather than `dtype == object` so that
# text labels stored with a dedicated string dtype (the default for text
# columns in pandas 3.0) or as a categorical are still encoded instead of
# falling through to the numeric branch.
if not pd.api.types.is_numeric_dtype(df["Class"]):
    unique_classes = df["Class"].unique()
    print("Unique classes before mapping:", unique_classes)
    # Position 0 maps to 0; every later position maps to 1.
    mapping = {cls: int(position > 0) for position, cls in enumerate(unique_classes)}
    df["Class"] = df["Class"].map(mapping)
    print("Mapping for Class:", mapping)
else:
    # If already numeric but with more than 2 unique values, threshold by the median:
    if df["Class"].nunique() > 2:
        threshold = df["Class"].median()
        df["Class"] = (df["Class"] > threshold).astype(int)
        print("Applied thresholding on numeric target using median.")

print("\nProcessed Data Snapshot:")
print(df.head())

# -------------------------
# Model Training (Binary Classification)
# -------------------------

# Feature matrix: every column except the target; label vector: "Class".
X, y = df.drop(columns=["Class"]), df["Class"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Build the Random Forest with default hyper-parameters and fit it on the
# training portion (`fit` returns the fitted estimator itself).
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows: overall accuracy followed by the per-class report.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nRandom Forest Classifier Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Initial data snapshot:
          ID  Severity  Start_Lat   Start_Lng    End_Lat     End_Lng  \
0  A-2129372         2  38.770702  -90.275398        NaN         NaN   
1  A-1443861         2  44.971004  -93.419037        NaN         NaN   
2  A-4620777         2  45.843022 -121.061216  45.841329 -121.052174   
3  A-2704247         2  35.300686  -80.806839        NaN         NaN   
4  A-4843909         2  34.064580 -117.796451  34.070030 -117.807601   

   Distance(mi)        Street         City            County  ... Station  \
0         0.000   Pershall Rd  Saint Louis  St. Louis County  ...   False   
1         0.000       I-394 E      Hopkins          Hennepin  ...   False   
2         0.451   Highway 142   Goldendale         Klickitat  ...   False   
3         0.000      Nevin Rd    Charlotte       Mecklenburg  ...   False   
4         0.741       CA-71 N       Pomona       Los Angeles  ...   False   

    Stop Traffic_Calming Traffic_Signal Turning_Loop Sunrise_Sunset  \
0  False  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)


Filled missing values in numeric column 'End_Lng' with median: -88.0270915
Filled missing values in numeric column 'Distance(mi)' with median: 0.034
Filled missing values in numeric column 'Temperature(F)' with median: 64.0
Filled missing values in numeric column 'Wind_Chill(F)' with median: 62.0
Filled missing values in numeric column 'Humidity(%)' with median: 67.0
Filled missing values in numeric column 'Pressure(in)' with median: 29.86
Filled missing values in numeric column 'Visibility(mi)' with median: 10.0
Filled missing values in numeric column 'Wind_Speed(mph)' with median: 7.0
Filled missing values in numeric column 'Precipitation(in)' with median: 0.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna("Unknown", inplace=True)


Filled missing values in categorical column 'ID' with 'Unknown'
Filled missing values in categorical column 'Street' with 'Unknown'
Filled missing values in categorical column 'City' with 'Unknown'
Filled missing values in categorical column 'County' with 'Unknown'
Filled missing values in categorical column 'State' with 'Unknown'
Filled missing values in categorical column 'Zipcode' with 'Unknown'
Filled missing values in categorical column 'Country' with 'Unknown'
Filled missing values in categorical column 'Timezone' with 'Unknown'
Filled missing values in categorical column 'Airport_Code' with 'Unknown'
Filled missing values in categorical column 'Weather_Timestamp' with 'Unknown'
Filled missing values in categorical column 'Wind_Direction' with 'Unknown'
Filled missing values in categorical column 'Weather_Condition' with 'Unknown'
Filled missing values in categorical column 'Sunrise_Sunset' with 'Unknown'
Filled missing values in categorical column 'Civil_Twilight' with 'Unknown'

  df = df.replace({True: 1, False: 0})



Categorical features for encoding: ['ID', 'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp', 'Wind_Direction', 'Weather_Condition', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight']


: 