In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# -------------------------
# Data Preprocessing
# -------------------------
# Read the accident training set from the working directory
# (point this at a different path if the file lives elsewhere).
df = pd.read_csv("Classifying_accidents-train.csv")

# Echo the first rows so the raw schema is visible before any cleaning.
print("Initial data snapshot:", df.head(), sep="\n")

# --- Missing Values Handling ---
# Numeric columns: impute missing entries with the column median.
# The filled Series is assigned back to the column on purpose:
# `df[col].fillna(..., inplace=True)` mutates an intermediate object
# (chained assignment), which pandas has deprecated and which stops
# updating `df` at all under pandas 3.0 / Copy-on-Write.
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    median_val = df[col].median()
    df[col] = df[col].fillna(median_val)
    print(f"Filled missing values in numeric column '{col}' with median: {median_val}")

# Categorical (object) columns: mark missing entries with an explicit
# "Unknown" category so they survive one-hot encoding later on.
object_cols = df.select_dtypes(include=['object']).columns
for col in object_cols:
    df[col] = df[col].fillna("Unknown")
    print(f"Filled missing values in categorical column '{col}' with 'Unknown'")

# --- Boolean Conversion ---
# Cast boolean columns to integers (True -> 1, False -> 0).
# An explicit dtype cast replaces `df.replace({True: 1, False: 0})`, which
# relied on silent downcasting (deprecated in pandas 2.2) and scanned every
# cell of the frame. Only columns whose dtype is bool are converted; object
# columns that merely contain True/False values are left for the
# categorical encoder.
bool_cols = df.select_dtypes(include=['bool']).columns
df = df.astype({col: 'int64' for col in bool_cols})

# --- Target Variable Handling ---
# The label is expected in a column named "Class" (per the original note it
# holds source labels such as Source1 / Source2). When the column is absent,
# target_column stays None and downstream steps treat the data as unlabeled.
target_column = 'Class' if 'Class' in df.columns else None
if target_column is not None:
    # Replace each distinct label with an integer code (e.g., Source1 -> 0, Source2 -> 1)
    df['Class'] = pd.factorize(df['Class'])[0]

# --- Feature Encoding ---
# Upper bound on the number of distinct values a categorical column may have
# and still be one-hot encoded. Identifier-like or free-text columns (one
# value per row, or close to it) would otherwise add roughly one dummy column
# per row, exhausting memory and giving the model nothing it can generalize
# from. Tune this threshold to the dataset.
MAX_ONE_HOT_CATEGORIES = 50

# Separate features from the target column (if it exists)
if target_column:
    features = df.drop(columns=[target_column])
else:
    features = df.copy()

# Identify categorical features (object type) for one-hot encoding
categorical_features = features.select_dtypes(include=['object']).columns.tolist()
print("\nCategorical features for encoding:", categorical_features)

# Drop categorical columns whose cardinality exceeds the threshold instead of
# expanding them into thousands of sparse indicator columns.
category_counts = features[categorical_features].nunique()
high_cardinality_features = category_counts[
    category_counts > MAX_ONE_HOT_CATEGORIES
].index.tolist()
if high_cardinality_features:
    print(
        f"Dropping high-cardinality features (> {MAX_ONE_HOT_CATEGORIES} unique values):",
        high_cardinality_features,
    )
    features = features.drop(columns=high_cardinality_features)
    categorical_features = [
        name for name in categorical_features if name not in high_cardinality_features
    ]

# One-hot encode the remaining categorical features
# (drop_first=True avoids the dummy variable trap)
features_encoded = pd.get_dummies(features, columns=categorical_features, drop_first=True)

# --- Reassemble the DataFrame ---
# Start from the encoded features; when a target exists, attach it and
# reorder the columns so the target ends up as the final column.
df_processed = features_encoded
if target_column:
    features_encoded[target_column] = df[target_column]
    cols = [name for name in features_encoded.columns if name != target_column]
    cols.append(target_column)
    df_processed = features_encoded[cols]

# Preview the first rows of the processed frame
print("\nProcessed data preview:", df_processed.head(), sep="\n")

# -------------------------
# Plotting the Data with matplotlib
# -------------------------

# 1. Histogram of the encoded target "Class" column (skipped when absent)
if 'Class' in df_processed.columns:
    class_codes = df_processed['Class']
    lowest, highest = class_codes.min(), class_codes.max()

    plt.figure(figsize=(8, 6))
    # Bin edges sit at half-integers so every integer class code is
    # centred in a bin of its own.
    bins = np.arange(lowest, highest + 2) - 0.5
    plt.hist(class_codes, bins=bins, edgecolor='black')
    plt.title('Distribution of Class')
    plt.xlabel('Class')
    plt.ylabel('Count')
    plt.xticks(range(lowest, highest + 1))
    plt.show()

# 2. Plot a correlation heatmap of numeric features
plt.figure(figsize=(12, 10))
# Restrict the matrix to numeric-dtype columns, as the heading states.
# Where one-hot indicators are stored as booleans (newer pandas), this
# excludes them; correlating all of them is quadratic in the column count
# and makes the axis labels unreadable.
corr = df_processed.select_dtypes(include=[np.number]).corr()
# Pin the colour scale to the full correlation range so the diverging
# colormap is centred on zero and figures are comparable between runs.
plt.imshow(corr, cmap='coolwarm', interpolation='none', vmin=-1, vmax=1)
plt.colorbar()
tick_positions = range(len(corr.columns))
plt.xticks(tick_positions, corr.columns, rotation=90)
plt.yticks(tick_positions, corr.columns)
plt.title("Correlation Heatmap")
plt.show()

# 3. Scatter of accident start coordinates; drawn only when both the
#    Start_Lat and Start_Lng columns are present.
if {'Start_Lat', 'Start_Lng'}.issubset(df_processed.columns):
    start_lat = df_processed['Start_Lat']
    start_lng = df_processed['Start_Lng']

    plt.figure(figsize=(8, 6))
    # Semi-transparent markers keep dense regions distinguishable
    plt.scatter(start_lat, start_lng, alpha=0.5)
    plt.title('Scatter plot of Start Latitude vs. Start Longitude')
    plt.xlabel('Start Latitude')
    plt.ylabel('Start Longitude')
    plt.show()

# -------------------------
# Model Training with Random Forest (optional, not plotted)
# -------------------------
# Runs only when a target column was identified during preprocessing.
if target_column:
    y = df_processed[target_column]
    X = df_processed.drop(columns=[target_column])

    # Hold out 20% of the rows for evaluation; the fixed seed keeps the
    # split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit a Random Forest with default hyperparameters and a fixed seed
    clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

    # Score the held-out rows and report plain accuracy
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("\nRandom Forest Classifier Accuracy:", accuracy)

# -------------------------
# Model Training with Random Forest
# -------------------------
# The guarded training section earlier in this file already built X / y,
# the 80/20 split, the fitted `clf`, `y_pred` and `accuracy` with the same
# data and the same random_state. Training a second, identical model here
# only doubled the runtime, and the unguarded `drop(columns=['Class'])`
# raised KeyError on data without a 'Class' column. Reuse the existing
# result and skip cleanly when there is no target.
if target_column:
    print("\nRandom Forest Classifier Accuracy:", accuracy)
else:
    print("\nNo 'Class' column found; skipping Random Forest evaluation.")

# Optionally, display the first few rows of the processed data once more
print("\nFinal processed data preview:")
print(df_processed.head())


Initial data snapshot:
          ID  Severity  Start_Lat   Start_Lng    End_Lat     End_Lng  \
0  A-2129372         2  38.770702  -90.275398        NaN         NaN   
1  A-1443861         2  44.971004  -93.419037        NaN         NaN   
2  A-4620777         2  45.843022 -121.061216  45.841329 -121.052174   
3  A-2704247         2  35.300686  -80.806839        NaN         NaN   
4  A-4843909         2  34.064580 -117.796451  34.070030 -117.807601   

   Distance(mi)        Street         City            County  ... Station  \
0         0.000   Pershall Rd  Saint Louis  St. Louis County  ...   False   
1         0.000       I-394 E      Hopkins          Hennepin  ...   False   
2         0.451   Highway 142   Goldendale         Klickitat  ...   False   
3         0.000      Nevin Rd    Charlotte       Mecklenburg  ...   False   
4         0.741       CA-71 N       Pomona       Los Angeles  ...   False   

    Stop Traffic_Calming Traffic_Signal Turning_Loop Sunrise_Sunset  \
0  False  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)


Filled missing values in numeric column 'End_Lng' with median: -88.0270915
Filled missing values in numeric column 'Distance(mi)' with median: 0.034
Filled missing values in numeric column 'Temperature(F)' with median: 64.0
Filled missing values in numeric column 'Wind_Chill(F)' with median: 62.0
Filled missing values in numeric column 'Humidity(%)' with median: 67.0
Filled missing values in numeric column 'Pressure(in)' with median: 29.86
Filled missing values in numeric column 'Visibility(mi)' with median: 10.0
Filled missing values in numeric column 'Wind_Speed(mph)' with median: 7.0
Filled missing values in numeric column 'Precipitation(in)' with median: 0.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna("Unknown", inplace=True)


Filled missing values in categorical column 'ID' with 'Unknown'
Filled missing values in categorical column 'Street' with 'Unknown'
Filled missing values in categorical column 'City' with 'Unknown'
Filled missing values in categorical column 'County' with 'Unknown'
Filled missing values in categorical column 'State' with 'Unknown'
Filled missing values in categorical column 'Zipcode' with 'Unknown'
Filled missing values in categorical column 'Country' with 'Unknown'
Filled missing values in categorical column 'Timezone' with 'Unknown'
Filled missing values in categorical column 'Airport_Code' with 'Unknown'
Filled missing values in categorical column 'Weather_Timestamp' with 'Unknown'
Filled missing values in categorical column 'Wind_Direction' with 'Unknown'
Filled missing values in categorical column 'Weather_Condition' with 'Unknown'
Filled missing values in categorical column 'Sunrise_Sunset' with 'Unknown'
Filled missing values in categorical column 'Civil_Twilight' with 'Unknown'

  df = df.replace({True: 1, False: 0})



Categorical features for encoding: ['ID', 'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp', 'Wind_Direction', 'Weather_Condition', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight']
