In [31]:
# STEP 1: Load & Inspect the Dataset

import pandas as pd

# 🧠 Load your dataset (make sure selected_features.csv is in the same folder)
df = pd.read_csv("../data/selected_features.csv")

# ✅ Basic Info
print("🔹 Dataset Loaded Successfully!")
print(f"📦 Shape: {df.shape[0]} rows × {df.shape[1]} columns\n")

# 🧾 Show column names
print("🔸 Columns in Dataset:")
display(df.columns.tolist())

# 📊 Data types
print("\n🔸 Data Types:")
display(df.dtypes)

# 👀 Preview first 5 rows
print("\n🔹 First 5 Rows of Data:")
display(df.head())

# 🧩 Check for missing values
print("\n🔸 Missing Values per Column:")
display(df.isnull().sum())


🔹 Dataset Loaded Successfully!
📦 Shape: 2000 rows × 16 columns

🔸 Columns in Dataset:


['Waste_Segregation_Level_%',
 'Decomposition_Efficiency_%',
 'Recycling_Rate_%',
 'Hazardous_Waste_Treatment_Compliance',
 'Govt_Compliance_Status',
 'ISO_14001_Certified',
 'Zero_Waste_Landfill_Certified',
 'Waste_to_Energy_Usage',
 'CO2_Emissions_tons_per_year',
 'Methane_Emissions_tons_per_year',
 'Environmental_Fines_Count',
 'Employee_Training_in_Waste_Management',
 'Digital_Waste_Tracking',
 'Frequency_of_Waste_Audit_per_year',
 'Eco_Friendly_Raw_Materials_%',
 'Perfect_Waste_Decomposition_System']


🔸 Data Types:


Waste_Segregation_Level_%                float64
Decomposition_Efficiency_%               float64
Recycling_Rate_%                         float64
Hazardous_Waste_Treatment_Compliance      object
Govt_Compliance_Status                    object
ISO_14001_Certified                       object
Zero_Waste_Landfill_Certified             object
Waste_to_Energy_Usage                     object
CO2_Emissions_tons_per_year              float64
Methane_Emissions_tons_per_year          float64
Environmental_Fines_Count                float64
Employee_Training_in_Waste_Management     object
Digital_Waste_Tracking                    object
Frequency_of_Waste_Audit_per_year        float64
Eco_Friendly_Raw_Materials_%             float64
Perfect_Waste_Decomposition_System         int64
dtype: object


🔹 First 5 Rows of Data:


Unnamed: 0,Waste_Segregation_Level_%,Decomposition_Efficiency_%,Recycling_Rate_%,Hazardous_Waste_Treatment_Compliance,Govt_Compliance_Status,ISO_14001_Certified,Zero_Waste_Landfill_Certified,Waste_to_Energy_Usage,CO2_Emissions_tons_per_year,Methane_Emissions_tons_per_year,Environmental_Fines_Count,Employee_Training_in_Waste_Management,Digital_Waste_Tracking,Frequency_of_Waste_Audit_per_year,Eco_Friendly_Raw_Materials_%,Perfect_Waste_Decomposition_System
0,69.4382,87.24546,82.116349,Yes,Compliant,Yes,Yes,Yes,87.714154,29.319789,0.0,Yes,Yes,10.709578,90.038123,1
1,71.554375,52.160461,26.127419,No,Non-Compliant,No,Yes,No,332.183476,258.573055,1.000129,Yes,Yes,3.599928,24.953132,0
2,55.167802,40.387312,14.138098,No,Pending,Yes,Yes,Yes,591.3803,225.735466,9.071733,No,Yes,4.21748,18.029772,0
3,68.368581,58.358841,47.014491,Yes,Non-Compliant,Yes,Yes,Yes,,142.7943,4.216982,Yes,No,3.900029,24.640033,0
4,79.004505,87.941618,86.535538,Yes,Compliant,Yes,Yes,Yes,90.123255,19.584022,0.0,Yes,No,11.31857,66.948111,1



🔸 Missing Values per Column:


Waste_Segregation_Level_%                40
Decomposition_Efficiency_%               40
Recycling_Rate_%                         40
Hazardous_Waste_Treatment_Compliance     40
Govt_Compliance_Status                   40
ISO_14001_Certified                      40
Zero_Waste_Landfill_Certified            40
Waste_to_Energy_Usage                    40
CO2_Emissions_tons_per_year              40
Methane_Emissions_tons_per_year          40
Environmental_Fines_Count                40
Employee_Training_in_Waste_Management    40
Digital_Waste_Tracking                   40
Frequency_of_Waste_Audit_per_year        40
Eco_Friendly_Raw_Materials_%             40
Perfect_Waste_Decomposition_System        0
dtype: int64

In [32]:
# STEP 2: Handle Missing Values

# 1️⃣ Separate numerical and categorical columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object', 'bool']).columns

print("🔹 Numerical Columns:", num_cols.tolist())
print("🔸 Categorical Columns:", cat_cols.tolist())

# 2️⃣ Fill missing numerical values with median
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# 3️⃣ Fill missing categorical values with mode (most frequent) or 'Unknown'
for col in cat_cols:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].mode()[0] if not df[col].mode().empty else "Unknown", inplace=True)

# 4️⃣ Check again for missing values
print("\n✅ Missing values after cleaning:")
display(df.isnull().sum())

# 5️⃣ Optional: show confirmation
print("\n🎯 All missing values handled successfully!")


🔹 Numerical Columns: ['Waste_Segregation_Level_%', 'Decomposition_Efficiency_%', 'Recycling_Rate_%', 'CO2_Emissions_tons_per_year', 'Methane_Emissions_tons_per_year', 'Environmental_Fines_Count', 'Frequency_of_Waste_Audit_per_year', 'Eco_Friendly_Raw_Materials_%', 'Perfect_Waste_Decomposition_System']
🔸 Categorical Columns: ['Hazardous_Waste_Treatment_Compliance', 'Govt_Compliance_Status', 'ISO_14001_Certified', 'Zero_Waste_Landfill_Certified', 'Waste_to_Energy_Usage', 'Employee_Training_in_Waste_Management', 'Digital_Waste_Tracking']

✅ Missing values after cleaning:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0] if not df[col].mode().empty else "Unknown", inplace=True)


Waste_Segregation_Level_%                0
Decomposition_Efficiency_%               0
Recycling_Rate_%                         0
Hazardous_Waste_Treatment_Compliance     0
Govt_Compliance_Status                   0
ISO_14001_Certified                      0
Zero_Waste_Landfill_Certified            0
Waste_to_Energy_Usage                    0
CO2_Emissions_tons_per_year              0
Methane_Emissions_tons_per_year          0
Environmental_Fines_Count                0
Employee_Training_in_Waste_Management    0
Digital_Waste_Tracking                   0
Frequency_of_Waste_Audit_per_year        0
Eco_Friendly_Raw_Materials_%             0
Perfect_Waste_Decomposition_System       0
dtype: int64


🎯 All missing values handled successfully!


In [33]:
# STEP 3: Encode Categorical Variables

from sklearn.preprocessing import LabelEncoder

display(df.head())


# 1️⃣ Identify categorical columns again
cat_cols = df.select_dtypes(include=['object', 'bool']).columns
print("🔸 Categorical Columns to Encode:")
display(cat_cols)

# 2️⃣ Initialize label encoder
le = LabelEncoder()

# 3️⃣ Encode each categorical column
for col in cat_cols:
    df[col] = le.fit_transform(df[col].astype(str))

print("\n✅ Encoding completed successfully!")

# 4️⃣ Verify encoding result
print("\n🔹 Encoded Data Preview:")
display(df.head())

# 5️⃣ Check data types after encoding
print("\n🔸 Data Types after Encoding:")
display(df.dtypes)
print(df.shape)

df.to_csv("../data/cleaned_compliance_data.csv", index=False)


Unnamed: 0,Waste_Segregation_Level_%,Decomposition_Efficiency_%,Recycling_Rate_%,Hazardous_Waste_Treatment_Compliance,Govt_Compliance_Status,ISO_14001_Certified,Zero_Waste_Landfill_Certified,Waste_to_Energy_Usage,CO2_Emissions_tons_per_year,Methane_Emissions_tons_per_year,Environmental_Fines_Count,Employee_Training_in_Waste_Management,Digital_Waste_Tracking,Frequency_of_Waste_Audit_per_year,Eco_Friendly_Raw_Materials_%,Perfect_Waste_Decomposition_System
0,69.4382,87.24546,82.116349,Yes,Compliant,Yes,Yes,Yes,87.714154,29.319789,0.0,Yes,Yes,10.709578,90.038123,1
1,71.554375,52.160461,26.127419,No,Non-Compliant,No,Yes,No,332.183476,258.573055,1.000129,Yes,Yes,3.599928,24.953132,0
2,55.167802,40.387312,14.138098,No,Pending,Yes,Yes,Yes,591.3803,225.735466,9.071733,No,Yes,4.21748,18.029772,0
3,68.368581,58.358841,47.014491,Yes,Non-Compliant,Yes,Yes,Yes,281.603205,142.7943,4.216982,Yes,No,3.900029,24.640033,0
4,79.004505,87.941618,86.535538,Yes,Compliant,Yes,Yes,Yes,90.123255,19.584022,0.0,Yes,No,11.31857,66.948111,1


🔸 Categorical Columns to Encode:


Index(['Hazardous_Waste_Treatment_Compliance', 'Govt_Compliance_Status',
       'ISO_14001_Certified', 'Zero_Waste_Landfill_Certified',
       'Waste_to_Energy_Usage', 'Employee_Training_in_Waste_Management',
       'Digital_Waste_Tracking'],
      dtype='object')


✅ Encoding completed successfully!

🔹 Encoded Data Preview:


Unnamed: 0,Waste_Segregation_Level_%,Decomposition_Efficiency_%,Recycling_Rate_%,Hazardous_Waste_Treatment_Compliance,Govt_Compliance_Status,ISO_14001_Certified,Zero_Waste_Landfill_Certified,Waste_to_Energy_Usage,CO2_Emissions_tons_per_year,Methane_Emissions_tons_per_year,Environmental_Fines_Count,Employee_Training_in_Waste_Management,Digital_Waste_Tracking,Frequency_of_Waste_Audit_per_year,Eco_Friendly_Raw_Materials_%,Perfect_Waste_Decomposition_System
0,69.4382,87.24546,82.116349,1,0,1,1,1,87.714154,29.319789,0.0,1,1,10.709578,90.038123,1
1,71.554375,52.160461,26.127419,0,1,0,1,0,332.183476,258.573055,1.000129,1,1,3.599928,24.953132,0
2,55.167802,40.387312,14.138098,0,2,1,1,1,591.3803,225.735466,9.071733,0,1,4.21748,18.029772,0
3,68.368581,58.358841,47.014491,1,1,1,1,1,281.603205,142.7943,4.216982,1,0,3.900029,24.640033,0
4,79.004505,87.941618,86.535538,1,0,1,1,1,90.123255,19.584022,0.0,1,0,11.31857,66.948111,1



🔸 Data Types after Encoding:


Waste_Segregation_Level_%                float64
Decomposition_Efficiency_%               float64
Recycling_Rate_%                         float64
Hazardous_Waste_Treatment_Compliance       int64
Govt_Compliance_Status                     int64
ISO_14001_Certified                        int64
Zero_Waste_Landfill_Certified              int64
Waste_to_Energy_Usage                      int64
CO2_Emissions_tons_per_year              float64
Methane_Emissions_tons_per_year          float64
Environmental_Fines_Count                float64
Employee_Training_in_Waste_Management      int64
Digital_Waste_Tracking                     int64
Frequency_of_Waste_Audit_per_year        float64
Eco_Friendly_Raw_Materials_%             float64
Perfect_Waste_Decomposition_System         int64
dtype: object

(2000, 16)


In [34]:
# # STEP 4: Feature Scaling + PCA

# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA
# import pandas as pd

# # 🧠 Separate target column before scaling
# target_col = "Perfect_Waste_Decomposition_System"
# X = df.drop(columns=[target_col])
# y = df[target_col]

# # 1️⃣ Scale the features
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # 1a️⃣ Save scaled features + target before PCA
# df_scaled = pd.DataFrame(X_scaled, columns=X.columns)
# df_scaled[target_col] = y.values
# df_scaled.to_csv("../data/cleaned_compliance_data_scaled.csv", index=False)
# print("✅ Scaled dataset saved: cleaned_compliance_data_scaled.csv")

# # 2️⃣ Apply PCA - keep 95% variance
# pca = PCA(n_components=0.95, random_state=42)
# X_pca = pca.fit_transform(X_scaled)

# # 3️⃣ Create a new DataFrame for PCA-transformed features
# pca_columns = [f"PCA_{i+1}" for i in range(X_pca.shape[1])]
# df_pca = pd.DataFrame(X_pca, columns=pca_columns)

# # 4️⃣ Add back the target column
# df_pca[target_col] = y.values

# # 5️⃣ Save the PCA dataset
# df_pca.to_csv("../data/cleaned_compliance_data_pca.csv", index=False)
# print("✅ PCA dataset saved: cleaned_compliance_data_pca.csv")

# # 6️⃣ Display summary
# print(f"🔹 Original Features: {X.shape[1]}")
# print(f"🔸 PCA Components Retained: {X_pca.shape[1]}")
# display(df_pca.head())
# print(f"📦 Final PCA Dataset Shape: {df_pca.shape[0]} rows × {df_pca.shape[1]} columns")


In [35]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(8,5))
# plt.bar(range(1, len(pca.explained_variance_ratio_)+1), pca.explained_variance_ratio_)
# plt.plot(range(1, len(pca.explained_variance_ratio_)+1), pca.explained_variance_ratio_.cumsum(), color='red', marker='o')
# plt.xlabel("PCA Component")
# plt.ylabel("Variance Explained")
# plt.title("PCA Explained Variance Ratio")
# plt.show()
