Import Libraries and Load Cleaned Data

In [1]:
import pandas as pd
import numpy as np

# Location of the cleaned CSV inside the project's processed-data folder.
cleaned_csv = r"D:\Portfolio\waste_management\data\processed\waste_data_cleaned.csv"
df = pd.read_csv(cleaned_csv)

# Report (rows, columns) so a truncated or mis-parsed load is visible immediately.
print("Loaded cleaned data:", df.shape)

# Last expression of the cell: preview the first rows as rich output.
df.head()


Loaded cleaned data: (850, 17)


Unnamed: 0,City/District,Waste Type,Waste Generated (Tons/Day),Recycling Rate (%),Population Density (People/km²),Municipal Efficiency Score (1-10),Disposal Method,Cost of Waste Management (₹/Ton),Awareness Campaigns Count,Landfill Name,Landfill Capacity (Tons),Year,Landfill_Lat,Landfill_Long,Waste_Per_Capita_kg,Landfill_Utilization_Ratio,Cost_Per_Campaign
0,Mumbai,Plastic,6610,68,11191,9,Composting,3056,14,Mumbai Landfill,45575,2019,22.4265,77.4931,590.653203,0.145036,218.285714
1,Mumbai,Organic,1181,56,11191,5,Composting,2778,12,Mumbai Landfill,45575,2019,22.4265,77.4931,105.53123,0.025913,231.5
2,Mumbai,E-Waste,8162,53,11191,8,Incineration,3390,13,Mumbai Landfill,45575,2019,22.4265,77.4931,729.336074,0.179089,260.769231
3,Mumbai,Construction,8929,56,11191,5,Landfill,1498,14,Mumbai Landfill,45575,2019,22.4265,77.4931,797.873291,0.195919,107.0
4,Mumbai,Hazardous,5032,44,11191,7,Recycling,2221,16,Mumbai Landfill,45575,2019,22.4265,77.4931,449.647038,0.110411,138.8125


Handle Missing Values

In [2]:
# Show the per-column count of missing values before any imputation.
print(df.isna().sum())

# Fill gaps in Cost_Per_Campaign with the column median. The original note
# attributes these gaps to dividing by a zero campaign count.
# The guard keeps the cell runnable if the column is absent from the input.
if 'Cost_Per_Campaign' in df.columns:
    median_cost = df['Cost_Per_Campaign'].median()
    df['Cost_Per_Campaign'] = df['Cost_Per_Campaign'].fillna(median_cost)


City/District                         0
Waste Type                            0
Waste Generated (Tons/Day)            0
Recycling Rate (%)                    0
Population Density (People/km²)       0
Municipal Efficiency Score (1-10)     0
Disposal Method                       0
Cost of Waste Management (₹/Ton)      0
Awareness Campaigns Count             0
Landfill Name                         0
Landfill Capacity (Tons)              0
Year                                  0
Landfill_Lat                          0
Landfill_Long                         0
Waste_Per_Capita_kg                   0
Landfill_Utilization_Ratio            0
Cost_Per_Campaign                    48
dtype: int64


Temporal Feature: Year Category

In [3]:
# Two-level period feature split at the 2020/2021 boundary.
# The outer bin edges are open-ended so every year at or before 2020 is labelled
# 'Before_2021' and every later year '2021_and_after'. With fixed outer edges of
# 2018 and 2023, any year <= 2018 or > 2023 would silently become NaN.
# pd.cut uses right-closed intervals by default, so 2020 itself lands in the
# first bucket. Results for years 2019-2023 are unchanged.
df['Year_Category'] = pd.cut(
    df['Year'],
    bins=[-np.inf, 2020, np.inf],
    labels=['Before_2021', '2021_and_after'],
)


Interaction Features

In [4]:
# Both interaction terms share the municipal efficiency score, so bind it once.
efficiency_score = df['Municipal Efficiency Score (1-10)']

# Efficiency x population density (element-wise product).
df['Efficiency_PopDensity_Interaction'] = efficiency_score.mul(
    df['Population Density (People/km²)']
)

# Awareness campaign count x efficiency (element-wise product).
df['Campaign_Efficiency_Interaction'] = df['Awareness Campaigns Count'].mul(
    efficiency_score
)


In [5]:
# Numeric columns whose distribution shape is inspected before transforming.
skew_report_cols = (
    'Waste Generated (Tons/Day)',
    'Cost of Waste Management (₹/Ton)',
    'Awareness Campaigns Count',
    'Landfill Capacity (Tons)',
    'Waste_Per_Capita_kg',
    'Landfill_Utilization_Ratio',
    'Cost_Per_Campaign',
)

# Print one line per column with its sample skewness, rounded to two decimals.
for col in skew_report_cols:
    print(f"{col}: skewness = {df[col].skew():.2f}")


Waste Generated (Tons/Day): skewness = -0.02
Cost of Waste Management (₹/Ton): skewness = -0.06
Awareness Campaigns Count: skewness = -0.04
Landfill Capacity (Tons): skewness = 0.05
Waste_Per_Capita_kg: skewness = 2.60
Landfill_Utilization_Ratio: skewness = 1.33
Cost_Per_Campaign: skewness = 3.84


In [6]:
# Candidate numeric columns to screen for strong skew.
skew_candidates = [
    'Waste Generated (Tons/Day)',
    'Cost of Waste Management (₹/Ton)',
    'Awareness Campaigns Count',
    'Landfill Capacity (Tons)',
    'Waste_Per_Capita_kg',
    'Landfill_Utilization_Ratio',
    'Cost_Per_Campaign',
]

# Keep only the columns whose absolute skewness exceeds 1.
skewed_cols = []
for candidate in skew_candidates:
    if abs(df[candidate].skew()) > 1:
        skewed_cols.append(candidate)

# Add a log(1 + x) version of each selected column under a 'Log_' prefix.
# The original columns are left in place.
for col in skewed_cols:
    df[f'Log_{col}'] = np.log1p(df[col])


In [7]:
# Frequency-encode the city: replace each city label with the share of rows
# that carry that label.
city_labels = df['City/District']
city_freq = city_labels.value_counts(normalize=True)
df['City_FreqEnc'] = city_labels.map(city_freq)

# The same approach can be applied to the landfill name or to other
# high-cardinality categorical features.


In [8]:
# Destination for the feature-engineered table inside the processed-data folder.
save_path = r"D:\Portfolio\waste_management\data\processed\waste_data_feature_engineered.csv"

# index=False keeps the DataFrame's row index out of the written CSV.
df.to_csv(path_or_buf=save_path, index=False)

print(f"Feature engineered data saved to {save_path}")


Feature engineered data saved to D:\Portfolio\waste_management\data\processed\waste_data_feature_engineered.csv
