# 🧹 Waste Dataset Preprocessing Notebook
This notebook loads the **raw dataset** and performs preprocessing steps:
- Map Yes/No to 1/0 for binary columns
- Label encode categorical features
- Identify the numeric features that are candidates for scaling (scaling itself is optional and is not applied in this notebook)
- Save the processed dataset

Finally, it prints a **clean summary** for verification.

In [16]:

import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler


## 📂 Load Raw Dataset

In [17]:

# Read the raw CSV, then report where it came from, its shape and its first rows.
input_path = "../data/raw_data.csv"
df = pd.read_csv(filepath_or_buffer=input_path)

print(
    f"✅ Raw dataset loaded from: {input_path}",
    f"Shape: {df.shape}",
    sep="\n",
)
display(df.head(n=5))


✅ Raw dataset loaded from: ../data/raw_data.csv
Shape: (2000, 29)


Unnamed: 0,Company_ID,Industry_Type,Company_Size,Location_Region,Total_Waste_Generated_kg_per_month,Biodegradable_Waste_%,Recyclable_Waste_%,Hazardous_Waste_%,Waste_Segregation_Level_%,Decomposition_Technique,...,Methane_Emissions_tons_per_year,Water_Pollution_Index,Soil_Pollution_Index,Employee_Training_in_Waste_Management,Digital_Waste_Tracking,Frequency_of_Waste_Audit_per_year,Eco_Friendly_Raw_Materials_%,CSR_Initiatives_on_Waste,Public_Sustainability_Reports,Perfect_Waste_Decomposition_System
0,C0766,IT,Large,Semi-Urban,35304.97,36,41,23,77,Recycling,...,15.27,22,1,Yes,Yes,12,82,Yes,Yes,1
1,C0674,Food,Large,Urban,38969.64,64,33,3,76,Incineration,...,48.49,21,11,Yes,Yes,8,85,Yes,Yes,1
2,C0826,Manufacturing,Large,Semi-Urban,44949.69,34,29,37,68,Landfill,...,47.29,27,17,Yes,Yes,7,84,Yes,Yes,1
3,C0699,Textile,Medium,Semi-Urban,9272.5,27,46,27,98,Composting,...,36.3,29,19,Yes,Yes,9,60,Yes,Yes,1
4,C1247,Food,Medium,Urban,5582.05,47,48,5,21,Landfill,...,216.52,65,63,Yes,No,5,37,Yes,No,0


## 🔄 Step 1: Map Yes/No to 1/0 for Binary Columns

In [18]:

# Columns whose raw values are the strings "Yes"/"No".
binary_columns = [
    "Hazardous_Waste_Treatment_Compliance", "Waste_to_Energy_Usage",
    "ISO_14001_Certified", "Zero_Waste_Landfill_Certified",
    "Employee_Training_in_Waste_Management", "Digital_Waste_Tracking",
    "CSR_Initiatives_on_Waste", "Public_Sustainability_Reports"
]
yes_no_to_int = {"Yes": 1, "No": 0}

for col in binary_columns:
    current = df[col]

    # Re-running this cell must be a no-op: pushing an already-converted 0/1
    # column through the Yes/No dict would turn every value into NaN.
    if current.dropna().isin([0, 1]).all():
        continue

    converted = current.map(yes_no_to_int)

    # Series.map returns NaN for any value that is not a key of the dict, so a
    # typo or case variant ("yes", " No") would silently become a missing
    # value. Values that were already missing are left missing.
    unexpected = current[converted.isna() & current.notna()]
    if not unexpected.empty:
        bad_values = sorted(repr(value) for value in unexpected.unique())
        raise ValueError(
            f"Column {col!r} contains values other than 'Yes'/'No': "
            f"{', '.join(bad_values)}"
        )

    df[col] = converted

print("✅ Binary columns converted to 1/0")


✅ Binary columns converted to 1/0


## 🏷️ Step 2: Label Encode Categorical Columns

In [19]:

# String-valued columns that are replaced by integer codes.
categorical_cols = [
    "Industry_Type",
    "Company_Size",
    "Location_Region",
    "Decomposition_Technique",
    "Govt_Compliance_Status",
]

# One fitted encoder is kept per column so the codes can be translated back later.
label_encoders = {}
for feature in categorical_cols:
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(df[feature])
    df[feature] = encoded_values
    label_encoders[feature] = encoder

print("✅ Categorical columns label-encoded")
print("Encoded columns:", categorical_cols)


✅ Categorical columns label-encoded
Encoded columns: ['Industry_Type', 'Company_Size', 'Location_Region', 'Decomposition_Technique', 'Govt_Compliance_Status']


## 📏 Step 3: Identify Numeric Features

In [20]:

# The target, the identifier and the already-encoded columns are never scaled.
target_col = "Perfect_Waste_Decomposition_System"
exclude_cols = [target_col, "Company_ID"] + binary_columns + categorical_cols

# Select by dtype family instead of comparing against the literal names
# 'int64'/'float64', which silently skips other numeric widths
# (e.g. int32 / float32). Column order is preserved.
numeric_cols = [
    col
    for col in df.select_dtypes(include="number").columns
    if col not in exclude_cols
]

print("🔹 Numeric columns to be scaled (if needed):")
print(numeric_cols)


🔹 Numeric columns to be scaled (if needed):
['Total_Waste_Generated_kg_per_month', 'Biodegradable_Waste_%', 'Recyclable_Waste_%', 'Hazardous_Waste_%', 'Waste_Segregation_Level_%', 'Decomposition_Efficiency_%', 'Recycling_Rate_%', 'Environmental_Fines_Count', 'CO2_Emissions_tons_per_year', 'Methane_Emissions_tons_per_year', 'Water_Pollution_Index', 'Soil_Pollution_Index', 'Frequency_of_Waste_Audit_per_year', 'Eco_Friendly_Raw_Materials_%']


In [21]:
# Show the dtype of each column as the frame currently stands.
column_dtypes = df.dtypes
print(column_dtypes)


Company_ID                                object
Industry_Type                              int64
Company_Size                               int64
Location_Region                            int64
Total_Waste_Generated_kg_per_month       float64
Biodegradable_Waste_%                      int64
Recyclable_Waste_%                         int64
Hazardous_Waste_%                          int64
Waste_Segregation_Level_%                  int64
Decomposition_Technique                    int64
Decomposition_Efficiency_%                 int64
Recycling_Rate_%                           int64
Hazardous_Waste_Treatment_Compliance       int64
Waste_to_Energy_Usage                      int64
ISO_14001_Certified                        int64
Zero_Waste_Landfill_Certified              int64
Govt_Compliance_Status                     int64
Environmental_Fines_Count                  int64
CO2_Emissions_tons_per_year              float64
Methane_Emissions_tons_per_year          float64
Water_Pollution_Inde

In [22]:
# Count the missing values in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)


Company_ID                               0
Industry_Type                            0
Company_Size                             0
Location_Region                          0
Total_Waste_Generated_kg_per_month       0
Biodegradable_Waste_%                    0
Recyclable_Waste_%                       0
Hazardous_Waste_%                        0
Waste_Segregation_Level_%                0
Decomposition_Technique                  0
Decomposition_Efficiency_%               0
Recycling_Rate_%                         0
Hazardous_Waste_Treatment_Compliance     0
Waste_to_Energy_Usage                    0
ISO_14001_Certified                      0
Zero_Waste_Landfill_Certified            0
Govt_Compliance_Status                   0
Environmental_Fines_Count                0
CO2_Emissions_tons_per_year              0
Methane_Emissions_tons_per_year          0
Water_Pollution_Index                    0
Soil_Pollution_Index                     0
Employee_Training_in_Waste_Management    0
Digital_Was

## 💾 Step 4: Save Processed Dataset

In [23]:

# Write the processed frame to disk, then print a short summary for verification.
output_path = "../data/processed_data.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)

banner = "=" * 60
print(banner)
print("🎯 Preprocessing Complete!")
print(banner)
print(f"✅ Processed dataset saved at: {output_path}")
print(f"🔹 Final Shape: {df.shape}")

print("\n🔸 Target Distribution:")
target_counts = df[target_col].value_counts()
print(target_counts.to_string())

print("\n🔸 Sample Records:")
display(df.head())
print(banner)


🎯 Preprocessing Complete!
✅ Processed dataset saved at: ../data/processed_data.csv
🔹 Final Shape: (2000, 29)

🔸 Target Distribution:
Perfect_Waste_Decomposition_System
1    1000
0    1000

🔸 Sample Records:


Unnamed: 0,Company_ID,Industry_Type,Company_Size,Location_Region,Total_Waste_Generated_kg_per_month,Biodegradable_Waste_%,Recyclable_Waste_%,Hazardous_Waste_%,Waste_Segregation_Level_%,Decomposition_Technique,...,Methane_Emissions_tons_per_year,Water_Pollution_Index,Soil_Pollution_Index,Employee_Training_in_Waste_Management,Digital_Waste_Tracking,Frequency_of_Waste_Audit_per_year,Eco_Friendly_Raw_Materials_%,CSR_Initiatives_on_Waste,Public_Sustainability_Reports,Perfect_Waste_Decomposition_System
0,C0766,3,0,1,35304.97,36,41,23,77,5,...,15.27,22,1,1,1,12,82,1,1,1
1,C0674,2,0,2,38969.64,64,33,3,76,2,...,48.49,21,11,1,1,8,85,1,1,1
2,C0826,4,0,1,44949.69,34,29,37,68,3,...,47.29,27,17,1,1,7,84,1,1,1
3,C0699,6,1,1,9272.5,27,46,27,98,1,...,36.3,29,19,1,1,9,60,1,1,1
4,C1247,2,1,2,5582.05,47,48,5,21,3,...,216.52,65,63,1,0,5,37,1,0,0


