# 🚀 Internship Task 1: ETL Pipeline for Pothole Detection Data
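This notebook demonstrates an ETL (Extract-Transform-Load) pipeline built with `pandas` and `scikit-learn` on a YOLOv10 model output dataset (`YOLOv10results.csv`): it loads the CSV, splits it into training and test sets, standardizes the features, and saves the scaled data together with the fitted scaler.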

## 📥 Step 1: Extract — Load the Dataset

In [None]:
import pandas as pd

# Extract: read the YOLOv10 results CSV into a DataFrame. The path is
# relative, so the file is looked up in the current working directory.
df = pd.read_csv('YOLOv10results.csv')

# Basic inspection
print("First 5 rows of the dataset:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would append a stray
# "None" line after the summary.
df.info()

# Per-column count of missing values.
print("\nMissing values:")
print(df.isnull().sum())

## 🔧 Step 2: Transform — Preprocess and Scale the Data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Normalise the headers: strip surrounding whitespace from every column name
# so the exact-name lookups below match.
df.columns = df.columns.str.strip()

# The target is the 'metrics/mAP50(B)' column; the features are every other
# column except 'epoch'.
y = df['metrics/mAP50(B)']
X = df.drop(columns=['epoch', 'metrics/mAP50(B)'])

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same fitted
# scaler to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("[OK] Data transformed and scaled.")
print("X_train_scaled shape:", X_train_scaled.shape)

## 💾 Step 3: Load — Save the Transformed Data and Scaler

In [None]:
import joblib

# Wrap the scaled feature arrays in DataFrames that carry the feature column
# names, so the CSV files are written with a header row.
feature_names = X.columns
X_train_df = pd.DataFrame(X_train_scaled, columns=feature_names)
X_test_df = pd.DataFrame(X_test_scaled, columns=feature_names)

# Output file name -> object to write, in the order the files are saved.
csv_outputs = {
    'X_train_scaled.csv': X_train_df,
    'X_test_scaled.csv': X_test_df,
    'y_train.csv': y_train,
    'y_test.csv': y_test,
}

# Write every dataset without its index column.
for csv_name, frame in csv_outputs.items():
    frame.to_csv(csv_name, index=False)

# Persist the fitted scaler alongside the data.
joblib.dump(scaler, 'scaler.pkl')

print("[OK] Files saved successfully.")