Import necessary libraries

In [31]:
import pandas as pd
import numpy as np
import os


 Load the dataset

In [32]:
# Absolute location of the raw CSV on the author's machine — edit before running elsewhere.
data_path = r"D:\Portfolio\waste_management\data\raw\waste_data.csv"

# Read the raw export into the working frame used by every later cell.
df = pd.read_csv(filepath_or_buffer=data_path)
print("Data loaded successfully.")

# Preview the first five rows as the cell's rich output.
df.head(n=5)


Data loaded successfully.


Unnamed: 0,City/District,Waste Type,Waste Generated (Tons/Day),Recycling Rate (%),Population Density (People/km²),Municipal Efficiency Score (1-10),Disposal Method,Cost of Waste Management (₹/Ton),Awareness Campaigns Count,Landfill Name,"Landfill Location (Lat, Long)",Landfill Capacity (Tons),Year
0,Mumbai,Plastic,6610,68,11191,9,Composting,3056,14,Mumbai Landfill,"22.4265, 77.4931",45575,2019
1,Mumbai,Organic,1181,56,11191,5,Composting,2778,12,Mumbai Landfill,"22.4265, 77.4931",45575,2019
2,Mumbai,E-Waste,8162,53,11191,8,Incineration,3390,13,Mumbai Landfill,"22.4265, 77.4931",45575,2019
3,Mumbai,Construction,8929,56,11191,5,Landfill,1498,14,Mumbai Landfill,"22.4265, 77.4931",45575,2019
4,Mumbai,Hazardous,5032,44,11191,7,Recycling,2221,16,Mumbai Landfill,"22.4265, 77.4931",45575,2019


Inspect data info and check for missing values

In [33]:
# Structural overview first: row count, column dtypes and non-null counts.
df.info()

# Then a per-column tally of missing entries, printed under a blank-line-separated heading.
print("\nMissing values per column:", df.isna().sum(), sep="\n")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 13 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   City/District                      850 non-null    object
 1   Waste Type                         850 non-null    object
 2   Waste Generated (Tons/Day)         850 non-null    int64 
 3   Recycling Rate (%)                 850 non-null    int64 
 4   Population Density (People/km²)    850 non-null    int64 
 5   Municipal Efficiency Score (1-10)  850 non-null    int64 
 6   Disposal Method                    850 non-null    object
 7   Cost of Waste Management (₹/Ton)   850 non-null    int64 
 8   Awareness Campaigns Count          850 non-null    int64 
 9   Landfill Name                      850 non-null    object
 10  Landfill Location (Lat, Long)      850 non-null    object
 11  Landfill Capacity (Tons)           850 non-null    int64 
 12  Year    

Extract landfill latitude and longitude as separate columns

In [34]:
# Split the combined "lat, long" text column into two float columns, then drop the original.
location_col = 'Landfill Location (Lat, Long)'

# Guard on the source column so re-running this cell is a no-op instead of a KeyError
# (the column no longer exists once it has been dropped below).
if location_col in df.columns:
    # Split once and reuse the pieces; float conversion tolerates the space after the comma.
    lat_long_parts = df[location_col].str.split(',')
    df['Landfill_Lat'] = lat_long_parts.str[0].astype(float)
    df['Landfill_Long'] = lat_long_parts.str[1].astype(float)
    # Rebind rather than mutate in place, which keeps the cell safe to re-execute.
    df = df.drop(columns=[location_col])

df.head()


Unnamed: 0,City/District,Waste Type,Waste Generated (Tons/Day),Recycling Rate (%),Population Density (People/km²),Municipal Efficiency Score (1-10),Disposal Method,Cost of Waste Management (₹/Ton),Awareness Campaigns Count,Landfill Name,Landfill Capacity (Tons),Year,Landfill_Lat,Landfill_Long
0,Mumbai,Plastic,6610,68,11191,9,Composting,3056,14,Mumbai Landfill,45575,2019,22.4265,77.4931
1,Mumbai,Organic,1181,56,11191,5,Composting,2778,12,Mumbai Landfill,45575,2019,22.4265,77.4931
2,Mumbai,E-Waste,8162,53,11191,8,Incineration,3390,13,Mumbai Landfill,45575,2019,22.4265,77.4931
3,Mumbai,Construction,8929,56,11191,5,Landfill,1498,14,Mumbai Landfill,45575,2019,22.4265,77.4931
4,Mumbai,Hazardous,5032,44,11191,7,Recycling,2221,16,Mumbai Landfill,45575,2019,22.4265,77.4931


Feature engineering - create new numeric features

In [35]:
# Derived ratio features. Every denominator has its zeros replaced with NaN first, so a
# zero divisor yields NaN (missing) rather than inf, consistently across all three features.
waste_tons_per_day = df['Waste Generated (Tons/Day)']

# Daily waste in kg divided by population density.
# NOTE(review): the divisor is a density (people/km²), not a head count, so despite the
# column name this is a density-normalised proxy, not a true kg/person/day figure.
# The column name is kept unchanged so downstream consumers of the saved files still work.
df['Waste_Per_Capita_kg'] = (
    1000 * waste_tons_per_day / df['Population Density (People/km²)'].replace(0, np.nan)
)

# Share of landfill capacity consumed by one day's waste.
df['Landfill_Utilization_Ratio'] = (
    waste_tons_per_day / df['Landfill Capacity (Tons)'].replace(0, np.nan)
)

# Management cost (₹/ton) spread over the number of awareness campaigns.
df['Cost_Per_Campaign'] = (
    df['Cost of Waste Management (₹/Ton)'] / df['Awareness Campaigns Count'].replace(0, np.nan)
)

df.head()


Unnamed: 0,City/District,Waste Type,Waste Generated (Tons/Day),Recycling Rate (%),Population Density (People/km²),Municipal Efficiency Score (1-10),Disposal Method,Cost of Waste Management (₹/Ton),Awareness Campaigns Count,Landfill Name,Landfill Capacity (Tons),Year,Landfill_Lat,Landfill_Long,Waste_Per_Capita_kg,Landfill_Utilization_Ratio,Cost_Per_Campaign
0,Mumbai,Plastic,6610,68,11191,9,Composting,3056,14,Mumbai Landfill,45575,2019,22.4265,77.4931,590.653203,0.145036,218.285714
1,Mumbai,Organic,1181,56,11191,5,Composting,2778,12,Mumbai Landfill,45575,2019,22.4265,77.4931,105.53123,0.025913,231.5
2,Mumbai,E-Waste,8162,53,11191,8,Incineration,3390,13,Mumbai Landfill,45575,2019,22.4265,77.4931,729.336074,0.179089,260.769231
3,Mumbai,Construction,8929,56,11191,5,Landfill,1498,14,Mumbai Landfill,45575,2019,22.4265,77.4931,797.873291,0.195919,107.0
4,Mumbai,Hazardous,5032,44,11191,7,Recycling,2221,16,Mumbai Landfill,45575,2019,22.4265,77.4931,449.647038,0.110411,138.8125


 Save cleaned non-encoded data for exploratory analysis

In [39]:
# Raw string: the Windows path contains backslash sequences (\P, \w, \d, \p) that are
# invalid escapes in a normal literal and trigger a SyntaxWarning. The value is unchanged.
cleaned_data_path = r'D:\Portfolio\waste_management\data\processed\waste_data_cleaned.csv'

# Create the folder that is actually written to (previously a relative 'data/processed'
# was created under the current working directory, which is not the target folder).
cleaned_output_dir = os.path.dirname(cleaned_data_path)
if cleaned_output_dir:  # dirname is '' when the path has no directory component
    os.makedirs(cleaned_output_dir, exist_ok=True)

# Persist the cleaned, non-encoded frame without the positional index.
df.to_csv(cleaned_data_path, index=False)
print(f"Cleaned data saved to {cleaned_data_path}")


Cleaned data saved to D:\Portfolio\waste_management\data\processed\waste_data_cleaned.csv


  cleaned_data_path = 'D:\Portfolio\waste_management\data\processed\waste_data_cleaned.csv'


One-hot encode categorical columns for modeling

In [37]:
# Text-valued columns to expand into indicator (dummy) columns.
categorical_cols = [
    'City/District',
    'Waste Type',
    'Disposal Method',
    'Landfill Name',
]

# drop_first=True omits the first level of each category from the indicator columns.
df_encoded = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

df_encoded.head(n=5)


Unnamed: 0,Waste Generated (Tons/Day),Recycling Rate (%),Population Density (People/km²),Municipal Efficiency Score (1-10),Cost of Waste Management (₹/Ton),Awareness Campaigns Count,Landfill Capacity (Tons),Year,Landfill_Lat,Landfill_Long,...,Landfill Name_Nashik Landfill,Landfill Name_Patna Landfill,Landfill Name_Pune Landfill,Landfill Name_Rajkot Landfill,Landfill Name_Ranchi Landfill,Landfill Name_Surat Landfill,Landfill Name_Thiruvananthapuram Landfill,Landfill Name_Vadodara Landfill,Landfill Name_Varanasi Landfill,Landfill Name_Visakhapatnam Landfill
0,6610,68,11191,9,3056,14,45575,2019,22.4265,77.4931,...,False,False,False,False,False,False,False,False,False,False
1,1181,56,11191,5,2778,12,45575,2019,22.4265,77.4931,...,False,False,False,False,False,False,False,False,False,False
2,8162,53,11191,8,3390,13,45575,2019,22.4265,77.4931,...,False,False,False,False,False,False,False,False,False,False
3,8929,56,11191,5,1498,14,45575,2019,22.4265,77.4931,...,False,False,False,False,False,False,False,False,False,False
4,5032,44,11191,7,2221,16,45575,2019,22.4265,77.4931,...,False,False,False,False,False,False,False,False,False,False


Save encoded processed data

In [40]:
# Raw string: the Windows path contains backslash sequences (\P, \w, \d, \p) that are
# invalid escapes in a normal literal and trigger a SyntaxWarning. The value is unchanged.
processed_data_path = r'D:\Portfolio\waste_management\data\processed\waste_data_processed.csv'

# Make sure the destination folder exists so this cell does not depend on another cell
# having created it.
processed_output_dir = os.path.dirname(processed_data_path)
if processed_output_dir:  # dirname is '' when the path has no directory component
    os.makedirs(processed_output_dir, exist_ok=True)

# Persist the one-hot encoded frame without the positional index.
df_encoded.to_csv(processed_data_path, index=False)
print(f"Encoded data saved to {processed_data_path}")


Encoded data saved to D:\Portfolio\waste_management\data\processed\waste_data_processed.csv


  processed_data_path = 'D:\Portfolio\waste_management\data\processed\waste_data_processed.csv'
