<a href="https://colab.research.google.com/github/VidulaN/Environmap-project/blob/main/data_collection_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Load, validate and clean the three source CSVs, then merge them into `df`.


def _require_columns(frame, columns):
    """Raise KeyError for the first name in `columns` that `frame` lacks."""
    for name in columns:
        if name not in frame.columns:
            raise KeyError(f"'{name}' column is not found!")


def _comma_number_to_float(series):
    """Remove "," from the stringified values of `series` and cast to float."""
    return series.astype(str).str.replace(",", "", regex=False).astype(float)


merge_keys = ["Borough", "Year"]

# Load and clean air quality data
air_quality = pd.read_csv("air_quality.csv")

# Validate every column used below, not only 'Total_Emissions'; it is still
# checked first so a file missing it fails with the same message as before.
_require_columns(
    air_quality, ["Total_Emissions", "NOx", "PM10", "PM2.5"] + merge_keys
)

# Clean the relevant columns
for col in ["NOx", "PM10", "PM2.5", "Total_Emissions"]:
    air_quality[col] = _comma_number_to_float(air_quality[col])

# Load and clean CO2 emissions data
co2 = pd.read_csv("co2_emissions.csv")
_require_columns(co2, ["CO2_emissions"] + merge_keys)
co2["CO2_emissions"] = _comma_number_to_float(co2["CO2_emissions"])

# Load and clean noise data
noise = pd.read_csv("noise_pollution.csv")
_require_columns(noise, ["Sound_level"] + merge_keys)

# Clean the 'Sound_level' column: keep the first number found in each value.
# The pattern is a raw string (a plain "\d" is an invalid escape sequence) and
# accepts an optional fractional part so a value like "55.3" is not truncated
# to 55. expand=False returns a Series; values without digits become NaN and
# are removed by the dropna() below.
noise["Sound_level"] = (
    noise["Sound_level"]
    .astype(str)
    .str.extract(r"(\d+(?:\.\d+)?)", expand=False)
    .astype(float)
)

# Merge all on Borough and Year; the inner joins keep only Borough/Year pairs
# present in all three datasets.
df = air_quality.merge(co2, on=merge_keys, how="inner").merge(
    noise, on=merge_keys, how="inner"
)
df.dropna(inplace=True)

# Build the training features (X) from 2010–2022 rows and the matching
# prediction features (future_X) for 2030.

# Filter to 2010–2022 data for model training and historical output.
# .copy() makes this an independent frame, so the column assignments below
# do not write into a slice of `df` (SettingWithCopyWarning).
historical_df = df[(df["Year"] >= 2010) & (df["Year"] <= 2022)].copy()

# Year normalisation constants, computed once from the full merged data and
# reused for the historical and future frames so all three share one scale.
year_min = df["Year"].min()
year_span = df["Year"].max() - year_min
if year_span == 0:
    # A single distinct year would make every normalised value NaN or inf.
    raise ValueError(
        "Cannot normalise 'Year': the merged data contains a single year."
    )

# Feature engineering: Create a normalized Year feature
df["Year_normalized"] = (df["Year"] - year_min) / year_span
historical_df["Year_normalized"] = (historical_df["Year"] - year_min) / year_span

# Encode Borough. One categorical dtype, built from the full data, is applied
# to every frame so a given borough always gets the same integer code.
# Encoding each frame separately would shift the codes whenever a borough has
# no rows in the 2010–2022 window.
borough_dtype = df["Borough"].astype("category").dtype
df["Borough_Code"] = df["Borough"].astype(borough_dtype).cat.codes
historical_df["Borough_Code"] = historical_df["Borough"].astype(borough_dtype).cat.codes

# Combine features for prediction
X = historical_df[["Year_normalized", "Borough_Code"]]

# Predict for 2030: one input row per (year, borough) pair
future_years = [2030]
boroughs = df["Borough"].unique()
future_input = pd.DataFrame(
    [{"Year": year, "Borough": borough} for year in future_years for borough in boroughs]
)

# Normalize future year on the same scale as the training data (2030 lies
# beyond the observed maximum, so the value exceeds 1).
future_input["Year_normalized"] = (future_input["Year"] - year_min) / year_span

# Encode Borough_Code with the shared dtype so codes match the training set
future_input["Borough_Code"] = future_input["Borough"].astype(borough_dtype).cat.codes
future_X = future_input[["Year_normalized", "Borough_Code"]]

# Models and predictions: for each pollutant group, fit a random forest on the
# historical features, predict the future rows, and write history + forecast
# to one CSV per group.


def _fit_and_forecast(target):
    """Fit a forest on (X, historical_df[target]); return it with its forecast."""
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    forest.fit(X, historical_df[target])
    return forest, forest.predict(future_X)


def _history_plus_forecast(forecast_frame, value_columns):
    """Stack historical and forecast rows, ordered by Borough then Year."""
    wanted = ["Borough", "Year"] + value_columns
    stacked = pd.concat([historical_df[wanted], forecast_frame[wanted]])
    return stacked.sort_values(["Borough", "Year"])


# Air Quality (NOx, PM10, PM2.5, Total_Emissions) — one multi-output model
air_features = ["NOx", "PM10", "PM2.5", "Total_Emissions"]
model_air, air_pred = _fit_and_forecast(air_features)
air_future = pd.concat(
    [
        future_input,
        pd.DataFrame(air_pred, columns=air_features, index=future_input.index),
    ],
    axis=1,
)
air_result = _history_plus_forecast(air_future, air_features)
air_result.to_csv("air_quality_predictions.csv", index=False)

# CO2 Emissions
model_co2, co2_pred = _fit_and_forecast("CO2_emissions")
co2_future = future_input.assign(CO2_emissions=co2_pred)
co2_result = _history_plus_forecast(co2_future, ["CO2_emissions"])
co2_result.to_csv("co2_emissions_predictions.csv", index=False)

# Noise Levels
model_noise, noise_pred = _fit_and_forecast("Sound_level")
noise_future = future_input.assign(Sound_level=noise_pred)
noise_result = _history_plus_forecast(noise_future, ["Sound_level"])
noise_result.to_csv("noise_pollution_predictions.csv", index=False)

# Report which files were written
for message in (
    "Predictions saved with 2010–2022 historical data and 2030 forecast:",
    "- air_quality_predictions.csv",
    "- co2_emissions_predictions.csv",
    "- noise_pollution_predictions.csv",
):
    print(message)


KeyError: "'Total_Emissions' column is not found!"