#️⃣ CELL 1: 📘 Title + Description (Markdown)
"""
# 🧠 Industrial Waste Compliance - Data Preprocessing Notebook

This notebook performs:
1. **Dataset Loading & Inspection**
2. **Handling Missing Values**
3. **Encoding Categorical Features**
4. **Feature Scaling**
5. **Dimensionality Reduction (PCA)**
6. **Visualizations for Better Understanding**

Let's begin! 🚀
"""


In [None]:
#️⃣ CELL 2: STEP 1 - Load & Inspect the Dataset

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw dataset; the path is relative to the directory the notebook runs from.
df = pd.read_csv("../data/raw_data.csv")

# Unpack the shape once so the summary line reads cleanly.
n_rows, n_columns = df.shape
print("🔹 Dataset Loaded Successfully!")
print(f"📦 Shape: {n_rows} rows × {n_columns} columns\n")

print("🔸 Columns in Dataset:")
print(df.columns.tolist())

print("\n🔸 Data Types:")
print(df.dtypes)

print("\n🔹 First 5 Rows of Data:")
display(df.head(5))

# Per-column count of null entries in the raw frame (isna is the same check as isnull).
print("\n🔸 Missing Values per Column:")
display(df.isna().sum())

# Second input: loaded here without inspection and processed alongside `df` by the
# cells that follow.
df1 = pd.read_csv("../data/selected_features.csv")


In [None]:
#️⃣ CELL 3: 🔍 EDA Visualization - Data Overview



# Histograms: one panel per int64/float64 column of the raw frame.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
numeric_view = df[num_cols]
numeric_view.hist(bins=20, figsize=(12, 8), color='skyblue', edgecolor='black')
plt.suptitle("Distribution of Numerical Columns")
plt.show()

# Horizontal count plots: one figure per object/bool column.
# NOTE(review): recent seaborn releases appear to warn when `palette` is passed
# without `hue` — confirm against the installed seaborn version.
cat_cols = df.select_dtypes(include=['object', 'bool']).columns
for name in cat_cols:
    plt.figure(figsize=(6, 3))
    axis = sns.countplot(y=df[name], palette="coolwarm")
    axis.set_title(f"Count Plot - {name}")
    plt.show()


In [None]:
#️⃣ CELL 4: STEP 2 - Handle Missing Values

def _fill_missing(frame, numeric_cols, categorical_cols):
    """Impute missing values in *frame*, modifying it in place.

    Numeric columns are filled with their per-column median. Categorical
    columns are filled with their most frequent value, or with the string
    "Unknown" when the column has no observed values at all.

    Columns are written back with ``frame[col] = ...`` rather than
    ``frame[col].fillna(..., inplace=True)``: the latter mutates a temporary
    Series, which pandas warns about and which does not update the parent
    frame when Copy-on-Write is active.
    """
    if len(numeric_cols) > 0:
        medians = frame[numeric_cols].median()
        frame[numeric_cols] = frame[numeric_cols].fillna(medians)

    for col in categorical_cols:
        if not frame[col].isnull().any():
            continue
        modes = frame[col].mode()  # computed once; empty when every value is missing
        fill_value = modes.iloc[0] if not modes.empty else "Unknown"
        frame[col] = frame[col].fillna(fill_value)


# Separate numerical and categorical columns for both frames
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object', 'bool']).columns
num_cols1 = df1.select_dtypes(include=['int64', 'float64']).columns
cat_cols1 = df1.select_dtypes(include=['object', 'bool']).columns

print("🔹 Numerical Columns:", num_cols.tolist())
print("🔸 Categorical Columns:", cat_cols.tolist())

# Apply the same imputation rules to the raw frame and the selected-feature frame
_fill_missing(df, num_cols, cat_cols)
_fill_missing(df1, num_cols1, cat_cols1)

print("\n✅ Missing values after cleaning:")
display(df.isnull().sum())

# Only claim success when nothing is left: a numeric column with no observed
# values has a NaN median, and dtypes outside the selections above are not touched.
remaining_missing = int(df.isnull().sum().sum()) + int(df1.isnull().sum().sum())
if remaining_missing == 0:
    print("\n🎯 All missing values handled successfully!")
else:
    print(f"\n⚠️ {remaining_missing} missing values remain after cleaning (columns with no observed values or unhandled dtypes).")


In [None]:
#️⃣ CELL 5: STEP 3 - Encode Categorical Variables

from sklearn.preprocessing import LabelEncoder

def _label_encode(frame, columns):
    """Label-encode *columns* of *frame* in place and return the fitted encoders.

    Each column gets its own ``LabelEncoder`` so the value -> code mapping stays
    available afterwards (``encoders[col].classes_`` /
    ``encoders[col].inverse_transform``). Sharing one encoder across columns
    would overwrite the mapping on every refit, leaving only the last one.

    Values are cast to ``str`` before fitting so mixed-type columns sort and
    encode consistently.
    """
    encoders = {}
    for col in columns:
        encoder = LabelEncoder()
        frame[col] = encoder.fit_transform(frame[col].astype(str))
        encoders[col] = encoder
    return encoders


# Identify categorical columns again (earlier cells may have changed dtypes)
cat_cols = df.select_dtypes(include=['object', 'bool']).columns
cat_cols1 = df1.select_dtypes(include=['object', 'bool']).columns
print("🔸 Categorical Columns to Encode:", cat_cols.tolist())

# Encode both frames; the dicts map column name -> fitted encoder.
# df and df1 are fitted independently, so codes are not comparable across the two frames.
label_encoders = _label_encode(df, cat_cols)
label_encoders1 = _label_encode(df1, cat_cols1)

print("\n✅ Encoding completed successfully!")

print("\n🔹 Encoded Data Preview:")
display(df.head())

print("\n🔸 Data Types after Encoding:")
display(df.dtypes)

print(df.shape)

# Save cleaned dataset
# df.to_csv("../data/cleaned_compliance_data.csv", index=False)


In [None]:
#️⃣ CELL 6: 📈 Visualization - Correlation Heatmap

# Pairwise correlation of every column in the encoded raw frame, drawn without
# per-cell annotations.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df.corr(), cmap="viridis", annot=False, ax=heatmap_ax)
heatmap_ax.set_title("Correlation Heatmap - Encoded Data")
plt.show()

# From here on `df` refers to the selected-feature frame. This is a rebind, not a
# copy: `df` and `df1` are the same object after this line.
df = df1


In [None]:
#️⃣ CELL 7: STEP 4 - Feature Scaling + PCA

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Label column; every other column of `df` is treated as a feature.
# NOTE: `df` is whatever the preceding cells left bound to that name — in this
# notebook the heatmap cell rebinds it to the selected-feature frame (`df1`).
target_col = "Perfect_Waste_Decomposition_System"

# Set to True to write the scaled and PCA datasets to disk. Off by default, so
# running the cell writes no files and the status messages below say so.
SAVE_OUTPUTS = False
SCALED_OUTPUT_PATH = "../data/cleaned_compliance_data_scaled.csv"
PCA_OUTPUT_PATH = "../data/cleaned_compliance_data_pca.csv"

X = df.drop(columns=[target_col])
y = df[target_col]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Scaled dataset before PCA, with the (unscaled) target re-attached.
# `y.values` is used so the target is attached by position, not by index.
df_scaled = pd.DataFrame(X_scaled, columns=X.columns)
df_scaled[target_col] = y.values
if SAVE_OUTPUTS:
    df_scaled.to_csv(SCALED_OUTPUT_PATH, index=False)
    print("✅ Scaled dataset saved: cleaned_compliance_data_scaled.csv")
else:
    print("ℹ️ Scaled dataset built in memory only (set SAVE_OUTPUTS = True to write cleaned_compliance_data_scaled.csv)")

# Apply PCA (retain 95% variance): a float n_components asks scikit-learn to keep
# as many components as needed to explain that fraction of the variance.
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Create PCA DataFrame with columns PCA_1 .. PCA_k plus the target
pca_columns = [f"PCA_{i+1}" for i in range(X_pca.shape[1])]
df_pca = pd.DataFrame(X_pca, columns=pca_columns)
df_pca[target_col] = y.values

if SAVE_OUTPUTS:
    df_pca.to_csv(PCA_OUTPUT_PATH, index=False)
    print("✅ PCA dataset saved: cleaned_compliance_data_pca.csv")
else:
    print("ℹ️ PCA dataset built in memory only (set SAVE_OUTPUTS = True to write cleaned_compliance_data_pca.csv)")

print(f"🔹 Original Features: {X.shape[1]}")
print(f"🔸 PCA Components Retained: {X_pca.shape[1]}")
print(f"📦 Final PCA Dataset Shape: {df_pca.shape[0]} rows × {df_pca.shape[1]} columns")

display(df_pca.head())


In [None]:
#️⃣ CELL 8: 🧩 Visualization - PCA Explained Variance

# Bars: variance explained by each retained component.
# Red line: running (cumulative) total of those ratios.
variance_ratio = pca.explained_variance_ratio_
component_ids = range(1, len(variance_ratio) + 1)

variance_fig, variance_ax = plt.subplots(figsize=(8, 5))
variance_ax.bar(component_ids, variance_ratio, color='steelblue')
variance_ax.plot(component_ids, variance_ratio.cumsum(), color='red', marker='o')
variance_ax.set_xlabel("PCA Component")
variance_ax.set_ylabel("Variance Explained")
variance_ax.set_title("PCA Explained Variance Ratio")
plt.show()


In [None]:
#️⃣ CELL 9: 🌈 Visualization - PCA 2D Projection

# Scatter of the first two principal components, coloured by the target.
# PCA was fitted with a variance threshold rather than a fixed component count,
# so it may retain a single component; in that case there is no PCA_2 column and
# the plot is skipped instead of failing with a KeyError.
if "PCA_2" not in df_pca.columns:
    print("⚠️ Skipping 2D projection: PCA retained fewer than two components.")
else:
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=df_pca['PCA_1'], y=df_pca['PCA_2'], hue=df_pca[target_col], palette="cool")
    plt.title("PCA - 2D Feature Space (First Two Components)")
    plt.show()


#️⃣ CELL 10: ✅ Final Summary (Markdown)
"""
# ✅ Summary

- Missing values were filled using **median (numeric)** and **mode (categorical)**.
- Categorical variables were encoded using **Label Encoding**.
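- The selected-feature dataset (`selected_features.csv`) was **standardized** with `StandardScaler`.
- PCA was then applied to the standardized features, keeping enough components to explain **95% of the variance**.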
- Several **visualizations** were created to understand:
  - Data distribution
  - Missing values (per-column counts before and after cleaning)
  - Correlations
  - PCA variance and projection

🚀 Preprocessing complete! Ready for model training.
"""
