# Data Cleaning

________________________________________
### 1. Initialize libraries and load dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import zscore

# Read the Excel workbook, skipping the "DIY" column.
# `usecols` accepts a callable that is evaluated per column name; a column
# is kept only when the callable returns True.
df = pd.read_excel(
    "Dirty_Dataset_with_Log.xlsx",
    usecols=lambda name: name != "DIY",
)

print("Dataset loaded successfully.")
display(df.head())

# Report the dimensions of the loaded frame.
n_rows, n_cols = df.shape
print(f"Number of Rows: {n_rows}")
print(f"Number of Columns: {n_cols}\n")

_______________
### 2. Find and remove redundant rows (rows whose values are all identical)

In [None]:
# Flag every row that has an exact copy elsewhere in the frame.
# keep=False marks ALL members of a duplicate group (not just the repeats),
# so the display shows each group in full.
is_repeated = df.duplicated(keep=False)
full_duplicates = df.loc[is_repeated]

print("\nFULL redundant rows:")
display(full_duplicates)

# Keep the first occurrence of each fully identical row and drop the rest,
# then renumber the index so it is contiguous again.
deduplicated = df.drop_duplicates(keep="first")
df_missing = deduplicated.reset_index(drop=True)

print("\nAfter removing redundant rows:", df_missing.shape)

_____________
### 3. Missing Values Summary

In [None]:
# Summarise where values are missing in the de-duplicated frame.

# Boolean mask of missing cells, computed once and reused below.
na_mask = df_missing.isna()
has_missing = na_mask.any(axis=1)

# Rows with at least one missing value (copied so the helper column added
# below does not modify df_missing).
missing_rows = df_missing[has_missing].copy()

# For each of those rows, list the names of the columns that are missing.
# Derived from the boolean mask instead of a row-wise apply that calls
# pd.isna on every cell individually.
missing_rows["missing_columns"] = pd.Series(
    [
        df_missing.columns[flags].tolist()
        for flags in na_mask[has_missing].to_numpy()
    ],
    index=missing_rows.index,
    dtype=object,
)

# Show only key columns
missing_info = missing_rows[["vehicle_id", "timestamp", "missing_columns"]]

# Lift the row/column display limits only for this one display call.
# option_context restores whatever values were active before (rather than
# resetting to the pandas defaults) and does so even if display() raises.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(missing_info)

# Total count of missing cells across the whole frame.
total_missing = na_mask.sum().sum()
print("Total missing values in cleaned dataset:", total_missing)

_______________
### Fill in Missing Values using KNN Imputation

In [None]:
from sklearn.impute import KNNImputer

# Columns that are cast back to whole numbers after imputation
# (KNNImputer returns floating-point values for every column).
int_cols = [
    "lidar_points",
    "radar_objects",
    "camera_objects",
    "latency_ms",
    "throughput_kbps",
    "collision_detected",
]

# Columns that stay floating-point after imputation.
float_cols = [
    "gps_latitude",
    "gps_longitude",
    "packet_drop_rate",
    "packet_delivery_ratio",
    "obstacle_detection_accuracy",
    "decision_accuracy",
]

# Full feature set handed to the imputer: integer columns first, then floats.
numeric_cols = [*int_cols, *float_cols]

print("Integer columns:", int_cols)
print("Float columns:", float_cols)

# Work on a copy so df_missing keeps its original gaps.
df_knn = df_missing.copy()

# Fill each missing value from its 5 nearest neighbours.
# NOTE(review): the columns are passed to the imputer unscaled; columns with
# a wide numeric range presumably dominate the neighbour distance - confirm
# whether scaling is wanted before imputation.
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(df_knn[numeric_cols])
df_knn[numeric_cols] = imputed_values

# Restore integer dtypes: round the listed columns to 0 decimals, then cast.
# Float columns are left as they are.
df_knn = (
    df_knn
    .round(dict.fromkeys(int_cols, 0))
    .astype(dict.fromkeys(int_cols, int))
)

# Persist the imputed dataset.
df_knn.to_csv("Cleaned_Dataset.csv", index=False)
print("KNN-filled dataset saved as Cleaned_Dataset.csv.")


### Write a report of the filled values

In [None]:
# Build a per-cell report of every value that KNN imputation filled in:
# one report row per (row, column) cell that was missing in df_missing,
# together with the value found at the same location in df_knn.

# 1. Identify missing locations BEFORE imputation.
#    A single boolean mask replaces a Python loop with one .loc lookup per
#    cell. np.nonzero walks the mask in row-major order, so entries come out
#    row by row and, within a row, in numeric_cols order.
was_missing = df_missing[numeric_cols].isna().to_numpy()
row_pos, col_pos = np.nonzero(was_missing)

row_labels = df_missing.index[row_pos].tolist()
col_labels = [numeric_cols[j] for j in col_pos]
missing_locs = list(zip(row_labels, col_labels))

# 2. Assemble the report with explicit columns, so that an empty result
#    (nothing was missing) still yields a frame - and a CSV - with headers.
imputation_report = pd.DataFrame({
    "row_index": row_labels,
    "vehicle_id": df_missing["vehicle_id"].to_numpy()[row_pos],
    "timestamp": df_missing["timestamp"].to_numpy()[row_pos],
    "column_imputed": col_labels,
    # value after KNN imputation
    "filled_value": [df_knn.at[row, col] for row, col in missing_locs],
})

# 3. Display the report.
# NOTE: despite the "Before vs After" wording, only the filled ("after")
# value is reported; the "before" value of every listed cell was missing.
print("KNN Imputation Report (Before vs After):")
display(imputation_report)

# Save report
imputation_report.to_csv("KNN_Imputed_Values_Report.csv", index=False)
print("Saved: KNN_Imputed_Values_Report.csv")


____________
### Cleaned Dataset Information

In [None]:
# Structural overview of the imputed frame (info() prints directly).
print("Cleaned Dataset Info:")
df_knn.info()

# Descriptive statistics of the imputed frame.
print("\nSummary Statistics:")
summary_stats = df_knn.describe()
display(summary_stats)