In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from tqdm import tqdm  # Progress bar

# Load training data
df_train = pd.read_csv("Classifying_accidents-train.csv")

# Drop high-cardinality columns & unnecessary features
# Drop irrelevant columns
drop_cols = ["ID", "Zipcode", "Country", "Start_Lat", "Start_Lng", "End_Lat", "End_Lng", "Weather_Timestamp", 
             "Street", "City", "County", "Nautical_Twilight", "Astronomical_Twilight", "Turning_Loop", "Amenity",
             "Bump", "Give_Way", "No_Exit", "Railway", "Civil_Twilight", "Roundabout", "Traffic_Calming","Timezone",  "Airport_Code", ]

df_train.drop(columns=drop_cols, inplace=True)

# Map target column
df_train["Class"] = df_train["Class"].map({"Source1": 0, "Source2": 1})

# Define feature sets
cat_features = ["State", "Wind_Direction", "Weather_Condition", 
                "Sunrise_Sunset", "Crossing",
                "Junction", "Station", "Stop", 
                 "Traffic_Signal"]

num_features = ["Severity", "Distance(mi)", "Temperature(F)", "Humidity(%)", "Pressure(in)", 
                "Visibility(mi)", "Wind_Speed(mph)", "Precipitation(in)"]


# Numerical transformation
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Categorical transformation
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=True))
])

# Combine transformers
preprocessor = ColumnTransformer([
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features)
], n_jobs=-1)

# Optimized Model
model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=50, max_features="sqrt", n_jobs=-1, random_state=42))
])

# Train-test split
X_train = df_train.drop(columns=["Class"])
y_train = df_train["Class"]
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.001, random_state=42)

# Reduce dataset for initial training (optional for debugging)
X_train = X_train.sample(n=100000, random_state=42)
y_train = y_train.loc[X_train.index]

# Fit model with progress bar
with tqdm(total=len(X_train), desc="Training Progress", unit="sample") as pbar:
    model.fit(X_train, y_train)
    pbar.update(len(X_train))

# Evaluate model
accuracy = model.score(X_test, y_test) * 100
print(f"Model Accuracy: {accuracy:.2f}%")

# Load test data
df_test = pd.read_csv("Classifying_accidents - test.csv")
test_ids = df_test["ID"]
df_test.drop(columns=drop_cols, inplace=True)

# Predict
test_preds = model.predict(df_test)

# Create submission file
submission = pd.DataFrame({"ID": test_ids, "Source": test_preds})
submission.to_csv("submission.csv", index=False)

Training Progress: 100%|██████████| 100000/100000 [00:08<00:00, 12336.10sample/s]


Model Accuracy: 94.08%


In [52]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from tqdm import tqdm  # Progress bar

# Load training data
df_train = pd.read_csv("Classifying_accidents-train.csv")

# Drop high-cardinality columns & unnecessary features
drop_cols = ["ID", "Zipcode", "Country", "Start_Lat", "Start_Lng", "End_Lat", "End_Lng", "Weather_Timestamp", 
             "Street", "City", "County", "Nautical_Twilight", "Astronomical_Twilight", "Turning_Loop", "Amenity",
             "Bump", "Give_Way", "No_Exit", "Railway", "Civil_Twilight", "Roundabout", "Traffic_Calming", "Timezone", "Airport_Code"]

df_train.drop(columns=drop_cols, inplace=True)

# Map target column
df_train["Class"] = df_train["Class"].map({"Source1": 0, "Source2": 1})

# Define feature sets
cat_features = ["State", "Wind_Direction", "Weather_Condition", 
                "Sunrise_Sunset", "Crossing",
                "Junction", "Station", "Stop", 
                "Traffic_Signal"]

num_features = ["Severity", "Distance(mi)", "Temperature(F)", "Humidity(%)", "Pressure(in)", 
                "Visibility(mi)", "Wind_Speed(mph)", "Precipitation(in)"]

# Numerical transformation
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Train-test split
X_train = df_train.drop(columns=["Class"])
y_train = df_train["Class"]
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.20, random_state=42)

# Target encoding for categorical features
encoder = TargetEncoder()
X_train[cat_features] = encoder.fit_transform(X_train[cat_features], y_train)
X_test[cat_features] = encoder.transform(X_test[cat_features])

# Preprocess data
preprocessor = ColumnTransformer([
    ("num", num_transformer, num_features)
], n_jobs=-1)

X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Define improved parameter grid for randomized search
param_grid = {
    'n_estimators': [1000, 2000],
    'learning_rate': [0.01, 0.05],
    'num_leaves': [50, 70],
    'max_depth': [15],
    'min_child_samples': [10, 20],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Perform randomized search with more iterations and cross-validation folds
random_search = RandomizedSearchCV(lgb.LGBMClassifier(random_state=42, n_jobs=-1, device='gpu'), param_grid, n_iter=5, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search.fit(X_train_transformed, y_train)

# Get best model
best_model = random_search.best_estimator_

# Evaluate model
accuracy = best_model.score(X_test_transformed, y_test) * 100
print(f"Best Model Accuracy: {accuracy:.2f}%")

# Load test data
df_test = pd.read_csv("Classifying_accidents - test.csv")
test_ids = df_test["ID"]
df_test.drop(columns=drop_cols, inplace=True)

# Apply target encoding to test data
X_test_final = df_test.copy()
X_test_final[cat_features] = encoder.transform(X_test_final[cat_features])
X_test_final = preprocessor.transform(X_test_final)

# Predict
test_preds = best_model.predict(X_test_final)

# Create submission file
submission = pd.DataFrame({"ID": test_ids, "Source": test_preds})
submission.to_csv("submission.csv", index=False)


[LightGBM] [Info] Number of positive: 1322109, number of negative: 1730292
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1015
[LightGBM] [Info] Number of data points in the train set: 3052401, number of used features: 8
[LightGBM] [Info] Using GPU Device: gfx1103, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 6 dense feature groups (23.29 MB) transferred to GPU in 0.028753 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.433137 -> initscore=-0.269062
[LightGBM] [Info] Start training from score -0.269062




Best Model Accuracy: 93.20%




In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from tqdm import tqdm  # Progress bar

# Load training data
df_train = pd.read_csv("Classifying_accidents-train.csv")

# Drop high-cardinality columns & unnecessary features
drop_cols = ["ID", "Zipcode", "Country", "Start_Lat", "Start_Lng", "End_Lat", "End_Lng", "Weather_Timestamp", 
             "Street", "City", "County", "Nautical_Twilight", "Astronomical_Twilight", "Turning_Loop", "Amenity",
             "Bump", "Give_Way", "No_Exit", "Railway", "Civil_Twilight", "Roundabout", "Traffic_Calming", "Timezone", "Airport_Code"]

df_train.drop(columns=drop_cols, inplace=True)

# Map target column
df_train["Class"] = df_train["Class"].map({"Source1": 0, "Source2": 1})

# Define feature sets
cat_features = ["State", "Wind_Direction", "Weather_Condition", 
                "Sunrise_Sunset", "Crossing",
                "Junction", "Station", "Stop", 
                "Traffic_Signal"]

num_features = ["Severity", "Distance(mi)", "Temperature(F)", "Humidity(%)", "Pressure(in)", 
                "Visibility(mi)", "Wind_Speed(mph)", "Precipitation(in)"]

# Numerical transformation
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Categorical transformation
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=True))
])

# Combine transformers
preprocessor = ColumnTransformer([
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features)
], n_jobs=-1)

# Train-test split
X_train = df_train.drop(columns=["Class"])
y_train = df_train["Class"]
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.001, random_state=42)

# Reduce dataset for initial training (optional for debugging)
X_train = X_train.sample(n=100000, random_state=42)
y_train = y_train.loc[X_train.index]

# Preprocess data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Define LightGBM model
model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.05, max_depth=-1, random_state=42, n_jobs=-1)

# Fit model with progress bar
with tqdm(total=len(X_train), desc="Training Progress", unit="sample") as pbar:
    model.fit(X_train_transformed, y_train)
    pbar.update(len(X_train))

# Evaluate model
accuracy = model.score(X_test_transformed, y_test) * 100
print(f"Model Accuracy: {accuracy:.2f}%")

# Load test data
df_test = pd.read_csv("Classifying_accidents - test.csv")
test_ids = df_test["ID"]
df_test.drop(columns=drop_cols, inplace=True)

# Preprocess test data
X_test_final = preprocessor.transform(df_test)

# Predict
test_preds = model.predict(X_test_final)

# Create submission file
submission = pd.DataFrame({"ID": test_ids, "Source": test_preds})
submission.to_csv("submission.csv", index=False)