In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# -------------------------
# Data Preprocessing for Binary Classification
# -------------------------

# Define only the essential columns (adjust as needed)
# Define the essential columns for training (including the target "Class")
cols_to_keep = [
    "ID","Start_Lat","Start_Lng","Severity","Timezone","County","State","Airport_Code","Temperature(F)","Wind_Chill(F)","Humidity(%)","Visibility(mi)","Wind_Speed(mph)","Weather_Condition","Bump",
    "Crossing","Give_Way","Junction","No_Exit","Railway","Roundabout","Station","Stop","Traffic_Calming","Traffic_Signal","Sunrise_Sunset","Civil_Twilight","Nautical_Twilight","Astronomical_Twilight",
    "Class"  # Target column (with values "Source1" or "Source2")
]

# Load the CSV file using only the selected columns
df = pd.read_csv("Classifying_accidents-train.csv", usecols=cols_to_keep)

# Display an initial snapshot of the data
print("Initial Data Snapshot:")
print(df.head())

# --- Clean Numeric Columns ---
# Convert numeric columns (except the target) to numbers and fill missing values with the median
numeric_cols = [col for col in cols_to_keep if col != "Class"]
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    median_val = df[col].median()
    df[col].fillna(median_val, inplace=True)
    print(f"Filled missing values in '{col}' with median: {median_val}")

# --- Process Target Variable for Binary Classification ---
# For the target "Class", map the values to 0 and 1.
# If there are exactly two unique classes, map the first to 0 and the second to 1.
# Otherwise, you can decide which class(es) to mark as 0 or 1.
if df["Class"].dtype == object:
    unique_classes = df["Class"].unique()
    print("Unique classes before mapping:", unique_classes)
    if len(unique_classes) == 2:
        mapping = {unique_classes[0]: 0, unique_classes[1]: 1}
    else:
        # Example: assign the first unique class as 0, and all others as 1.
        mapping = {unique_classes[0]: 0}
        for cls in unique_classes[1:]:
            mapping[cls] = 1
    df["Class"] = df["Class"].map(mapping)
    print("Mapping for Class:", mapping)
else:
    # If already numeric but with more than 2 unique values, threshold by the median:
    if df["Class"].nunique() > 2:
        threshold = df["Class"].median()
        df["Class"] = (df["Class"] > threshold).astype(int)
        print("Applied thresholding on numeric target using median.")

print("\nProcessed Data Snapshot:")
print(df.head())

# -------------------------
# Model Training (Binary Classification)
# -------------------------

# Separate features and target
X = df.drop(columns=["Class"])
y = df["Class"]

# Split the data into 80% training and 20% testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Initialize and train the Random Forest Classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test set and evaluate
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
#print("\nRandom Forest Classifier Accuracy:", accuracy)
#print("\nClassification Report:")
#print(classification_report(y_test, y_pred))



Initial Data Snapshot:
          ID  Severity  Start_Lat   Start_Lng            County State  \
0  A-2129372         2  38.770702  -90.275398  St. Louis County    MO   
1  A-1443861         2  44.971004  -93.419037          Hennepin    MN   
2  A-4620777         2  45.843022 -121.061216         Klickitat    WA   
3  A-2704247         2  35.300686  -80.806839       Mecklenburg    NC   
4  A-4843909         2  34.064580 -117.796451       Los Angeles    CA   

     Timezone Airport_Code  Temperature(F)  Wind_Chill(F)  ...  Roundabout  \
0  US/Central         KSTL            69.0           69.0  ...       False   
1  US/Central         KFCM            75.0           75.0  ...       False   
2  US/Pacific         KDLS            31.0           31.0  ...       False   
3  US/Eastern         KJQF            82.4            NaN  ...       False   
4  US/Pacific         KPOC            77.0           77.0  ...       False   

   Station   Stop Traffic_Calming  Traffic_Signal  Sunrise_Sunset  \


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)


Filled missing values in 'ID' with median: nan
Filled missing values in 'Start_Lat' with median: 35.823078
Filled missing values in 'Start_Lng' with median: -87.66289499999998
Filled missing values in 'Severity' with median: 2.0
Filled missing values in 'Timezone' with median: nan
Filled missing values in 'County' with median: nan
Filled missing values in 'State' with median: nan
Filled missing values in 'Airport_Code' with median: nan
Filled missing values in 'Temperature(F)' with median: 64.0
Filled missing values in 'Wind_Chill(F)' with median: 62.0
Filled missing values in 'Humidity(%)' with median: 67.0
Filled missing values in 'Visibility(mi)' with median: 10.0
Filled missing values in 'Wind_Speed(mph)' with median: 7.0
Filled missing values in 'Weather_Condition' with median: nan
Filled missing values in 'Bump' with median: 0.0
Filled missing values in 'Crossing' with median: 0.0
Filled missing values in 'Give_Way' with median: 0.0
Filled missing values in 'Junction' with median

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always beha

Filled missing values in 'Sunrise_Sunset' with median: nan
Filled missing values in 'Civil_Twilight' with median: nan
Filled missing values in 'Nautical_Twilight' with median: nan
Filled missing values in 'Astronomical_Twilight' with median: nan
Unique classes before mapping: ['Source2' 'Source1']
Mapping for Class: {'Source2': 0, 'Source1': 1}

Processed Data Snapshot:
   ID  Severity  Start_Lat   Start_Lng  County  State  Timezone  Airport_Code  \
0 NaN         2  38.770702  -90.275398     NaN    NaN       NaN           NaN   
1 NaN         2  44.971004  -93.419037     NaN    NaN       NaN           NaN   
2 NaN         2  45.843022 -121.061216     NaN    NaN       NaN           NaN   
3 NaN         2  35.300686  -80.806839     NaN    NaN       NaN           NaN   
4 NaN         2  34.064580 -117.796451     NaN    NaN       NaN           NaN   

   Temperature(F)  Wind_Chill(F)  ...  Roundabout  Station   Stop  \
0            69.0           69.0  ...       False    False  False   
1 

In [6]:
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nRandom Forest Classifier Accuracy:", accuracy)


Random Forest Classifier Accuracy: 0.8359509422736964


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

# -------------------------
# Step 1: Load & Preprocess Training Data
# -------------------------

# Define the essential columns for training (including the target "Class")
train_cols = [
    "ID","Severity","Start_Lat","Start_Lng","Timezone","County","State","Airport_Code","Temperature(F)","Wind_Chill(F)","Humidity(%)","Visibility(mi)","Wind_Speed(mph)","Weather_Condition","Bump",
    "Crossing","Give_Way","Junction","No_Exit","Railway","Roundabout","Station","Stop","Traffic_Calming","Traffic_Signal","Sunrise_Sunset","Civil_Twilight","Nautical_Twilight","Astronomical_Twilight",
    "Class"  # Target column (with values "Source1" or "Source2")
]


# Load the training data using only the selected columns
train_df = pd.read_csv("Classifying_accidents-train.csv", usecols=train_cols)
print("Training Data Snapshot:")
print(train_df.head())

# List of features to use (all numeric columns except ID and the target)
numeric_features = [
    "Severity","Start_Lat","Start_Lng","Temperature(F)","Wind_Chill(F)","Humidity(%)","Visibility(mi)","Wind_Speed(mph)"
]


# Convert feature columns to numeric and fill missing values with the median
for col in numeric_features:
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
    median_val = train_df[col].median()
    train_df[col].fillna(median_val, inplace=True)
    print(f"Filled missing values in training column '{col}' with median: {median_val}")

# Process target variable: Map "Source1" -> 0 and "Source2" -> 1
if train_df["Class"].dtype == object:
    train_df["Class"] = train_df["Class"].map({"Source1": 0, "Source2": 1})
    print("\nTarget mapping applied: {'Source1': 0, 'Source2': 1}")

print("\nProcessed Training Data Snapshot:")
print(train_df.head())

# -------------------------
# Step 2: Balance the Training Data
# -------------------------

# Check the distribution before balancing
print("\nBefore balancing:")
print(train_df["Class"].value_counts())

# Separate majority and minority classes
df_majority = train_df[train_df["Class"] == 0]
df_minority = train_df[train_df["Class"] == 1]

# Downsample the majority class to match the minority class size
df_majority_downsampled = resample(df_majority,
                                   replace=False,    # sample without replacement
                                   n_samples=len(df_minority),
                                   random_state=42)

# Combine the downsampled majority class with the minority class
train_balanced = pd.concat([df_majority_downsampled, df_minority])
# Shuffle the balanced dataset
train_balanced = train_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print("\nAfter balancing:")
print(train_balanced["Class"].value_counts())

# -------------------------
# Step 3: Model Training (Binary Classification)
# -------------------------

# Extract features and target from the balanced dataset
X_balanced = train_balanced[numeric_features]
y_balanced = train_balanced["Class"]

# Split the balanced data into training (80%) and validation (20%) sets
X_train, X_val, y_train, y_val = train_test_split(X_balanced, y_balanced, test_size=0.20, random_state=42)

# Initialize and train the Random Forest Classifier
# Initialize the Random Forest with balanced class weights
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

# Fit the classifier on the training data
clf.fit(X_train, y_train)

# Evaluate on the validation set
y_val_pred = clf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("\nValidation Accuracy:", val_accuracy)
print("\nValidation Classification Report:")
print(classification_report(y_val, y_val_pred))

# -------------------------
# Step 4: Load & Preprocess Test Data, Predict & Create Submission File
# -------------------------

# Define the essential columns for test data (target column is missing)
test_cols = [
    "ID","Severity","Start_Lat","Start_Lng","Timezone","County","State","Airport_Code","Temperature(F)","Wind_Chill(F)","Humidity(%)","Visibility(mi)","Wind_Speed(mph)","Weather_Condition","Bump",
    "Crossing","Give_Way","Junction","No_Exit","Railway","Roundabout","Station","Stop","Traffic_Calming","Traffic_Signal","Sunrise_Sunset","Civil_Twilight","Nautical_Twilight","Astronomical_Twilight"
]

# Load the test data
test_df = pd.read_csv("Classifying_accidents - test.csv", usecols=test_cols)
print("\nTest Data Snapshot:")
print(test_df.head())

# Process the test data: convert to numeric and fill missing values
for col in numeric_features:
    test_df[col] = pd.to_numeric(test_df[col], errors='coerce')
    median_val = test_df[col].median()
    test_df[col].fillna(median_val, inplace=True)
    print(f"Filled missing values in test column '{col}' with median: {median_val}")

# Extract test features
X_test = test_df[numeric_features]

# Predict the binary class for test data
test_predictions = clf.predict(X_test)

# Create a submission DataFrame with "ID" and "Source"
submission = pd.DataFrame({
    "ID": test_df["ID"],
    "Source": test_predictions
})

print("\nSubmission Preview:")
print(submission.head())

# Save the submission file
submission.to_csv("submission.csv", index=False)
print("\nSubmission file saved as 'submission.csv'.")



Training Data Snapshot:
          ID  Severity            County State    Timezone Airport_Code  \
0  A-2129372         2  St. Louis County    MO  US/Central         KSTL   
1  A-1443861         2          Hennepin    MN  US/Central         KFCM   
2  A-4620777         2         Klickitat    WA  US/Pacific         KDLS   
3  A-2704247         2       Mecklenburg    NC  US/Eastern         KJQF   
4  A-4843909         2       Los Angeles    CA  US/Pacific         KPOC   

   Temperature(F)  Wind_Chill(F)  Humidity(%)  Visibility(mi)  ...  \
0            69.0           69.0         81.0             8.0  ...   
1            75.0           75.0         76.0            10.0  ...   
2            31.0           31.0         76.0            10.0  ...   
3            82.4            NaN         51.0            10.0  ...   
4            77.0           77.0         22.0            10.0  ...   

   Roundabout Station   Stop  Traffic_Calming  Traffic_Signal  Sunrise_Sunset  \
0       False   False  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(median_val, inplace=True)


Filled missing values in training column 'Wind_Speed(mph)' with median: 7.0

Target mapping applied: {'Source1': 0, 'Source2': 1}

Processed Training Data Snapshot:
          ID  Severity            County State    Timezone Airport_Code  \
0  A-2129372         2  St. Louis County    MO  US/Central         KSTL   
1  A-1443861         2          Hennepin    MN  US/Central         KFCM   
2  A-4620777         2         Klickitat    WA  US/Pacific         KDLS   
3  A-2704247         2       Mecklenburg    NC  US/Eastern         KJQF   
4  A-4843909         2       Los Angeles    CA  US/Pacific         KPOC   

   Temperature(F)  Wind_Chill(F)  Humidity(%)  Visibility(mi)  ...  \
0            69.0           69.0         81.0             8.0  ...   
1            75.0           75.0         76.0            10.0  ...   
2            31.0           31.0         76.0            10.0  ...   
3            82.4           62.0         51.0            10.0  ...   
4            77.0           77.0  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(median_val, inplace=True)


Filled missing values in test column 'Wind_Speed(mph)' with median: 7.0

Submission Preview:
          ID  Source
0  A-1253417       0
1  A-1363407       0
2  A-4050525       0
3  A-3179230       1
4  A-7304136       0

Submission file saved as 'submission.csv'.


In [None]:
print(train_df["Class"].value_counts())

Class
0    2162816
1    1652686
Name: count, dtype: int64
