In [None]:
import pandas as pd
import numpy as np

# Load the CSV file (adjust the filename as needed)
df = pd.read_csv("accidents.csv")

# Display initial data snapshot and basic info
print("Initial data snapshot:")
print(df.head())
print("\nData info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# --- Missing Values Handling ---
# The filled Series is assigned back to the frame in both loops below.
# Calling fillna(inplace=True) on the df[col] selection is chained assignment:
# pandas deprecates it, and with Copy-on-Write enabled it only updates a
# temporary object and leaves df untouched.

# For numeric columns, fill missing values with the median
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    # The median is NaN when every value in the column is missing; in that
    # case the fill is a no-op and the column stays empty.
    median_val = df[col].median()
    df[col] = df[col].fillna(median_val)
    print(f"Filled missing values in numeric column '{col}' with median: {median_val}")

# For object (categorical) columns, fill missing values with a placeholder
object_cols = df.select_dtypes(include=['object']).columns
for col in object_cols:
    df[col] = df[col].fillna("Unknown")
    print(f"Filled missing values in categorical column '{col}' with 'Unknown'")

# --- Boolean Conversion ---
# Convert any boolean columns to integers (True -> 1, False -> 0).
# Genuine bool-dtype columns are cast explicitly: using DataFrame.replace for
# this depends on pandas silently downcasting the replaced result back to an
# integer dtype, which is deprecated and would otherwise leave object columns.
bool_cols = df.select_dtypes(include=['bool']).columns
if not bool_cols.empty:
    df = df.astype({col: "int64" for col in bool_cols})

# Object columns may still carry stray True/False values (for example a
# boolean column whose gaps were filled with "Unknown"); map those to 1/0 too
# so they are treated the same way as before.
mixed_cols = df.select_dtypes(include=['object']).columns
if not mixed_cols.empty:
    df[mixed_cols] = df[mixed_cols].replace({True: 1, False: 0})

# --- Inspecting Columns ---
# List the distinct values found in every column; handy when choosing an
# encoding strategy for each one.
print("\nUnique values per column:")
for name, values in df.items():
    print(f"{name}: {values.unique()}")

# --- Target Variable Handling ---
# The target is assumed to live in the "Class" column; change the name here if
# your dataset uses a different one. target_column stays None when it is absent.
target_column = 'Class' if 'Class' in df.columns else None

if target_column is not None:
    # factorize returns (codes, uniques); only the integer codes are kept, so
    # each distinct label is replaced by its numeric code.
    codes, _ = pd.factorize(df[target_column])
    df[target_column] = codes

# --- Feature Encoding ---
# Separate features and (optionally) the target column
if target_column:
    features = df.drop(columns=[target_column])
else:
    features = df.copy()

# Identify categorical features (object types) for one-hot encoding
categorical_features = features.select_dtypes(include=['object']).columns.tolist()
print("\nCategorical features identified for encoding:", categorical_features)

# One-hot encode categorical features (drop_first=True avoids dummy variable trap).
# dtype=int pins the indicator columns to 0/1: recent pandas versions default
# to bool, which would put True/False back into the data after the boolean
# conversion step and into the saved CSV.
features_encoded = pd.get_dummies(
    features,
    columns=categorical_features,
    drop_first=True,
    dtype=int,
)

# --- Reassemble Data ---
# Without a target there is nothing to re-attach: the encoded features are the
# final result. Otherwise the target is added back and placed last.
if not target_column:
    df_processed = features_encoded
else:
    features_encoded[target_column] = df[target_column]
    # Iterating a DataFrame yields its column labels; keep every feature in
    # its current order and append the target at the end.
    feature_names = [name for name in features_encoded if name != target_column]
    df_processed = features_encoded[feature_names + [target_column]]

# Show the first rows of the processed frame
print("\nProcessed data preview:")
print(df_processed.head())

# Persist the processed frame to a new CSV file (row index is not written)
df_processed.to_csv("accidents_processed.csv", index=False)


array([2, 1, 4, 3])

In [None]:
# Preview the first rows of df (DataFrame.head defaults to five rows).
# Note: this is `df`, not the one-hot encoded `df_processed` built by the
# preprocessing cell — presumably that cell has already been run; confirm.
df.head()
