In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# -------------------------
# Data Preprocessing for Binary Classification
# -------------------------

# Define only the essential columns (including the target "Class")
cols_to_keep = [
    "Severity", "Start_Lat", "Start_Lng", "End_Lat", "End_Lng", "Timezone", "County", "State", "Airport_Code",
    "Temperature(F)", "Wind_Chill(F)", "Humidity(%)", "Pressure(in)", "Visibility(mi)", "Wind_Speed(mph)", "Precipitation(in)",
    "Weather_Condition", "Bump", "Crossing", "Give_Way", "Junction", "No_Exit", "Railway", "Roundabout", "Station", "Stop",
    "Traffic_Calming", "Traffic_Signal", "Sunrise_Sunset", "Civil_Twilight", "Nautical_Twilight", "Astronomical_Twilight",
    "Class"  # Target column
]

# Load the dataset using only the selected columns
df = pd.read_csv("Classifying_accidents-train.csv", usecols=cols_to_keep)

# Convert boolean features to integers
bool_cols = ["Bump", "Crossing", "Give_Way", "Junction", "No_Exit", "Railway", "Roundabout", "Station", "Stop", 
             "Traffic_Calming", "Traffic_Signal"]
df[bool_cols] = df[bool_cols].astype(int)

# Convert categorical features to numeric using Label Encoding
categorical_cols = ["Timezone", "County", "State", "Airport_Code", "Weather_Condition", "Sunrise_Sunset", 
                    "Civil_Twilight", "Nautical_Twilight", "Astronomical_Twilight"]
label_encoders = {}
for col in categorical_cols:
    df[col] = df[col].astype(str)  # Ensure all values are strings
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoder for future use

# Handle missing values by filling numeric columns with the median
for col in df.columns:
    if df[col].dtype in [np.float64, np.int64]:
        df[col].fillna(df[col].median(), inplace=True)
    else:
        df[col].fillna(df[col].mode()[0], inplace=True)

# Convert target variable "Class" to binary values
if df["Class"].dtype == object:
    unique_classes = df["Class"].unique()
    mapping = {unique_classes[0]: 0, unique_classes[1]: 1} if len(unique_classes) == 2 else {cls: i for i, cls in enumerate(unique_classes)}
    df["Class"] = df["Class"].map(mapping)

# -------------------------
# Model Training (80/20 Rule for Train-Test Split)
# -------------------------

# Separate features and target
X = df.drop(columns=["Class"])
y = df["Class"]

# Split data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Train a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
clf.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = clf.predict(X_test)

#print("\nRandom Forest Classifier Accuracy:", accuracy)
#print("\nClassification Report:")
#print(classification_report(y_test, y_pred))



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [12]:
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nRandom Forest Classifier Accuracy:", accuracy)


Random Forest Classifier Accuracy: 1.0


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

# -------------------------
# Step 1: Load & Preprocess Training Data
# -------------------------

# Load dataset
train_df = pd.read_csv("Classifying_accidents-train.csv")

# Drop non-informative columns
drop_cols = ["ID", "Street", "City", "Zipcode", "Country", "Weather_Timestamp"]
train_df.drop(columns=drop_cols, inplace=True, errors='ignore')

# Convert boolean features to integers
bool_cols = ["Amenity", "Bump", "Crossing", "Give_Way", "Junction", "No_Exit",
             "Railway", "Roundabout", "Station", "Stop", "Traffic_Calming", 
             "Traffic_Signal", "Turning_Loop"]
train_df[bool_cols] = train_df[bool_cols].astype(int)

# Convert categorical features to numeric using Label Encoding
categorical_cols = ["State", "County", "Timezone", "Airport_Code", "Wind_Direction",
                    "Weather_Condition", "Sunrise_Sunset", "Civil_Twilight", 
                    "Nautical_Twilight", "Astronomical_Twilight"]
label_encoders = {}
for col in categorical_cols:
    train_df[col] = train_df[col].astype(str)  # Ensure all values are strings
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    label_encoders[col] = le  # Store encoder for later use in test data

# Handle missing values
for col in train_df.columns:
    if train_df[col].dtype in [np.float64, np.int64]:  # Numeric columns
        median_val = train_df[col].median()
        train_df[col].fillna(median_val, inplace=True)
    else:  # Categorical columns
        mode_val = train_df[col].mode()[0]
        train_df[col].fillna(mode_val, inplace=True)

# Process target variable: Map "Source1" -> 0 and "Source2" -> 1
train_df["Class"] = train_df["Class"].map({"Source1": 0, "Source2": 1})

# -------------------------
# Step 2: Balance the Training Data
# -------------------------

# Check class distribution
print("\nBefore balancing:")
print(train_df["Class"].value_counts())

# Separate majority and minority classes
df_majority = train_df[train_df["Class"] == 0]
df_minority = train_df[train_df["Class"] == 1]

# Downsample the majority class to match the minority class size
df_majority_downsampled = resample(df_majority, replace=False,
                                   n_samples=len(df_minority), random_state=42)

# Combine the downsampled majority class with the minority class
train_balanced = pd.concat([df_majority_downsampled, df_minority]).sample(frac=1, random_state=42)

print("\nAfter balancing:")
print(train_balanced["Class"].value_counts())

# -------------------------
# Step 3: Model Training
# -------------------------

# Extract features and target
X_balanced = train_balanced.drop(columns=["Class"])
y_balanced = train_balanced["Class"]

# Split into training (80%) and validation (20%) sets
X_train, X_val, y_train, y_val = train_test_split(X_balanced, y_balanced, test_size=0.20, random_state=42)

# Train a Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
clf.fit(X_train, y_train)

# Evaluate model
y_val_pred = clf.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nValidation Classification Report:")
print(classification_report(y_val, y_val_pred))

# -------------------------
# Step 4: Load & Preprocess Test Data
# -------------------------

test_df = pd.read_csv("Classifying_accidents - test.csv")

# Drop unnecessary columns (ensure 'ID' is kept for submission)
drop_cols = ["Street", "City", "Zipcode", "Country", "Weather_Timestamp"]
test_df.drop(columns=drop_cols, inplace=True, errors='ignore')

# Convert boolean features to integers
test_df[bool_cols] = test_df[bool_cols].astype(int)

# Apply the same label encoding to categorical features as training data
for col in categorical_cols:
    test_df[col] = test_df[col].astype(str)  # Ensure all values are strings
    test_df[col] = label_encoders[col].transform(test_df[col])

# Handle missing values in test data (same strategy as training)
for col in test_df.columns:
    if test_df[col].dtype in [np.float64, np.int64]:  # Numeric columns
        median_val = test_df[col].median()
        test_df[col].fillna(median_val, inplace=True)
    else:  # Categorical columns
        mode_val = test_df[col].mode()[0]
        test_df[col].fillna(mode_val, inplace=True)

# Extract features for prediction
X_test = test_df.drop(columns=["ID"])

# Predict on test data
test_predictions = clf.predict(X_test)

# Create a submission DataFrame
submission = pd.DataFrame({
    "ID": test_df["ID"],  # Ensure 'ID' is preserved for submission
    "Source": test_predictions
})

# Save the submission file
submission.to_csv("submission.csv", index=False)
print("\nSubmission file saved as 'submission.csv'.")





KeyboardInterrupt: 

In [None]:
print(train_df["Class"].value_counts())

Class
0    2162816
1    1652686
Name: count, dtype: int64
