In [10]:

import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [20]:

input_path = "../data/raw_data.csv"
df = pd.read_csv(input_path)
print(f"Raw dataset loaded from: {input_path}")
print(f"Shape: {df.shape}")
display(df.head())


Raw dataset loaded from: ../data/raw_data.csv
Shape: (2000, 29)


Unnamed: 0,Company_ID,Industry_Type,Company_Size,Location_Region,Total_Waste_Generated_kg_per_month,Biodegradable_Waste_%,Recyclable_Waste_%,Hazardous_Waste_%,Waste_Segregation_Level_%,Decomposition_Technique,...,Methane_Emissions_tons_per_year,Water_Pollution_Index,Soil_Pollution_Index,Employee_Training_in_Waste_Management,Digital_Waste_Tracking,Frequency_of_Waste_Audit_per_year,Eco_Friendly_Raw_Materials_%,CSR_Initiatives_on_Waste,Public_Sustainability_Reports,Perfect_Waste_Decomposition_System
0,C0640,Pharma,Medium,Rural,17332.66,42,48,10,67,Mixed,...,28.83,2,25,Yes,Yes,11,86,Yes,Yes,1
1,C1460,Textile,Medium,Rural,15774.52,20,40,40,70,Landfill,...,253.86,97,82,Yes,Yes,4,30,Yes,Yes,0
2,C1839,Food,Small,Rural,3487.97,45,35,20,57,Composting,...,226.16,99,56,No,Yes,4,17,No,No,0
3,C1646,Automobile,Small,Semi-Urban,520.2,12,51,37,70,Incineration,...,138.41,50,82,Yes,No,4,24,Yes,Yes,0
4,C0149,Food,Small,Rural,1861.76,62,37,1,83,Anaerobic Digestion,...,20.16,20,3,Yes,Yes,11,68,Yes,Yes,1


Step 1: Map Yes/No to 1/0 for Binary Columns

In [12]:

binary_columns = [
    "Hazardous_Waste_Treatment_Compliance", "Waste_to_Energy_Usage",
    "ISO_14001_Certified", "Zero_Waste_Landfill_Certified",
    "Employee_Training_in_Waste_Management", "Digital_Waste_Tracking",
    "CSR_Initiatives_on_Waste", "Public_Sustainability_Reports"
]

for col in binary_columns:
    df[col] = df[col].map({"Yes": 1, "No": 0})

print(" Binary columns converted to 1/0")


 Binary columns converted to 1/0


  Step 2: Label Encode Categorical Columns

In [21]:

categorical_cols = [
    "Industry_Type", "Company_Size", "Location_Region",
    "Decomposition_Technique", "Govt_Compliance_Status"
]

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Save encoder if needed later

print("Categorical columns label-encoded")
print("Encoded columns:", categorical_cols)
print(label_encoders)


Categorical columns label-encoded
Encoded columns: ['Industry_Type', 'Company_Size', 'Location_Region', 'Decomposition_Technique', 'Govt_Compliance_Status']
{'Industry_Type': LabelEncoder(), 'Company_Size': LabelEncoder(), 'Location_Region': LabelEncoder(), 'Decomposition_Technique': LabelEncoder(), 'Govt_Compliance_Status': LabelEncoder()}


 Step 3: Identify Numeric Features

In [14]:

target_col = "Perfect_Waste_Decomposition_System"
exclude_cols = [target_col, "Company_ID"] + binary_columns + categorical_cols
numeric_cols = [col for col in df.columns if col not in exclude_cols and df[col].dtype in ['int64', 'float64']]

print(" Numeric columns to be scaled (if needed):")
print(numeric_cols)


 Numeric columns to be scaled (if needed):
['Total_Waste_Generated_kg_per_month', 'Biodegradable_Waste_%', 'Recyclable_Waste_%', 'Hazardous_Waste_%', 'Waste_Segregation_Level_%', 'Decomposition_Efficiency_%', 'Recycling_Rate_%', 'Environmental_Fines_Count', 'CO2_Emissions_tons_per_year', 'Methane_Emissions_tons_per_year', 'Water_Pollution_Index', 'Soil_Pollution_Index', 'Frequency_of_Waste_Audit_per_year', 'Eco_Friendly_Raw_Materials_%']


In [15]:
print(df.dtypes)


Company_ID                                object
Industry_Type                              int64
Company_Size                               int64
Location_Region                            int64
Total_Waste_Generated_kg_per_month       float64
Biodegradable_Waste_%                      int64
Recyclable_Waste_%                         int64
Hazardous_Waste_%                          int64
Waste_Segregation_Level_%                  int64
Decomposition_Technique                    int64
Decomposition_Efficiency_%                 int64
Recycling_Rate_%                           int64
Hazardous_Waste_Treatment_Compliance       int64
Waste_to_Energy_Usage                      int64
ISO_14001_Certified                        int64
Zero_Waste_Landfill_Certified              int64
Govt_Compliance_Status                     int64
Environmental_Fines_Count                  int64
CO2_Emissions_tons_per_year              float64
Methane_Emissions_tons_per_year          float64
Water_Pollution_Inde

In [16]:
print(df.isnull().sum())


Company_ID                               0
Industry_Type                            0
Company_Size                             0
Location_Region                          0
Total_Waste_Generated_kg_per_month       0
Biodegradable_Waste_%                    0
Recyclable_Waste_%                       0
Hazardous_Waste_%                        0
Waste_Segregation_Level_%                0
Decomposition_Technique                  0
Decomposition_Efficiency_%               0
Recycling_Rate_%                         0
Hazardous_Waste_Treatment_Compliance     0
Waste_to_Energy_Usage                    0
ISO_14001_Certified                      0
Zero_Waste_Landfill_Certified            0
Govt_Compliance_Status                   0
Environmental_Fines_Count                0
CO2_Emissions_tons_per_year              0
Methane_Emissions_tons_per_year          0
Water_Pollution_Index                    0
Soil_Pollution_Index                     0
Employee_Training_in_Waste_Management    0
Digital_Was

 Step 4: Save Processed Dataset

In [17]:

os.makedirs("../data", exist_ok=True)
output_path = "../data/processed_data.csv"
df.to_csv(output_path, index=False)

print("="*60)
print(" Preprocessing Complete!")
print("="*60)
print(f" Processed dataset saved at: {output_path}")
print(f" Final Shape: {df.shape}")
print("\n Target Distribution:")
print(df[target_col].value_counts().to_string())
print("\n Sample Records:")
display(df.head())
print("="*60)


 Preprocessing Complete!
 Processed dataset saved at: ../data/processed_data.csv
 Final Shape: (2000, 29)

 Target Distribution:
Perfect_Waste_Decomposition_System
1    1000
0    1000

 Sample Records:


Unnamed: 0,Company_ID,Industry_Type,Company_Size,Location_Region,Total_Waste_Generated_kg_per_month,Biodegradable_Waste_%,Recyclable_Waste_%,Hazardous_Waste_%,Waste_Segregation_Level_%,Decomposition_Technique,...,Methane_Emissions_tons_per_year,Water_Pollution_Index,Soil_Pollution_Index,Employee_Training_in_Waste_Management,Digital_Waste_Tracking,Frequency_of_Waste_Audit_per_year,Eco_Friendly_Raw_Materials_%,CSR_Initiatives_on_Waste,Public_Sustainability_Reports,Perfect_Waste_Decomposition_System
0,C0640,5,1,0,17332.66,42,48,10,67,4,...,28.83,2,25,1,1,11,86,1,1,1
1,C1460,6,1,0,15774.52,20,40,40,70,3,...,253.86,97,82,1,1,4,30,1,1,0
2,C1839,2,2,0,3487.97,45,35,20,57,1,...,226.16,99,56,0,1,4,17,0,0,0
3,C1646,0,2,1,520.2,12,51,37,70,2,...,138.41,50,82,1,0,4,24,1,1,0
4,C0149,2,2,0,1861.76,62,37,1,83,0,...,20.16,20,3,1,1,11,68,1,1,1


