In [None]:
import pandas as pd

# Concentration breakpoints per pollutant: one (low, high) segment per IAQI band,
# listed in the same order as `iaqi_ranges` below (segment i maps onto band i).
# NOTE(review): the figures appear to follow the 24-hour (8-hour for o3) tables of
# China's HJ 633-2012 AQI standard -- confirm the intended standard and units.
breakpoints = {
    "pm25": [(0, 35), (35.1, 75), (75.1, 115), (115.1, 150), (150.1, 250), (250.1, 350), (350.1, 500)],
    "pm10": [(0, 50), (50.1, 150), (150.1, 250), (250.1, 350), (350.1, 420), (420.1, 500), (500.1, 600)],
    # The last two o3 segments are open-ended placeholders that keep the list at seven
    # entries; (inf, inf) is never selected by a first-match lookup because the
    # (800.1, inf) segment before it already accepts every larger value.
    "o3": [(0, 100), (100.1, 160), (160.1, 215), (215.1, 265), (265.1, 800), (800.1, float("inf")), (float("inf"), float("inf"))],
    "no2": [(0, 40), (40.1, 80), (80.1, 180), (180.1, 280), (280.1, 565), (565.1, 750), (750.1, 940)],
    # The upper bound of the last so2 segment used to be 620, i.e. below its own lower
    # bound of 2100.1, so the segment could never match; corrected to 2620.
    "so2": [(0, 50), (50.1, 150), (150.1, 475), (475.1, 800), (800.1, 1600), (1600.1, 2100), (2100.1, 2620)],
    "co": [(0, 2), (2.1, 4), (4.1, 14), (14.1, 24), (24.1, 36), (36.1, 48), (48.1, 60)]
}

# IAQI band (low, high) for each breakpoint segment, matched by position.
iaqi_ranges = [(0, 50), (51, 100), (101, 150), (151, 200), (201, 300), (301, 400), (401, 500)]

In [None]:
# Function to calculate IAQI for a given pollutant
def calculate_iaqi(value, pollutant, breakpoint_table=None, iaqi_table=None):
    """Calculate the IAQI for one pollutant reading by linear interpolation.

    The reading is located in the pollutant's list of (low, high) concentration
    segments and mapped linearly onto the IAQI band at the same position.

    Parameters
    ----------
    value : float or None
        Measured concentration. ``None``, NaN, negative and infinite values
        are treated as missing data.
    pollutant : str
        Key into the breakpoint table (e.g. ``"pm25"``). An unknown key raises
        ``KeyError``.
    breakpoint_table : dict, optional
        Mapping of pollutant name to a list of (low, high) segments. Defaults
        to the module-level ``breakpoints``.
    iaqi_table : list, optional
        List of (iaqi_low, iaqi_high) bands, matched to the segments by
        position. Defaults to the module-level ``iaqi_ranges``.

    Returns
    -------
    int or None
        The rounded IAQI, or ``None`` when the reading is missing/invalid or
        lies above the last usable segment.
    """
    if breakpoint_table is None:
        breakpoint_table = breakpoints
    if iaqi_table is None:
        iaqi_table = iaqi_ranges

    # NaN compares False against everything, and an infinite reading would turn
    # the interpolation into inf/inf (NaN) and make round() raise; reject both
    # explicitly together with None and negative values.
    if pd.isna(value) or value < 0 or value == float("inf"):
        return None  # Invalid or missing data

    for (low, high), (iaqi_low, iaqi_high) in zip(breakpoint_table[pollutant], iaqi_table):
        # Skip segments that end below the reading, and malformed segments
        # (high <= low) that would divide by zero or interpolate backwards.
        if value > high or not low < high:
            continue
        # Segments are listed with a small gap between them (e.g. 35 -> 35.1).
        # A reading inside such a gap belongs to the upper segment, so clamp it
        # to that segment's lower bound instead of discarding it.
        position = max(value, low)
        iaqi = iaqi_low + ((position - low) / (high - low)) * (iaqi_high - iaqi_low)
        return round(iaqi)
    return None  # Value is out of range

In [None]:
# Function to calculate AQI for each row and determine the pollutant
def calculate_aqi_for_row(row):
    """Calculate the AQI of one row and name the pollutant that determines it.

    Parameters
    ----------
    row : mapping
        Anything indexable by pollutant name (for example a DataFrame row)
        holding the readings for "pm25", "pm10", "o3", "no2", "so2" and "co".
        A missing key raises ``KeyError``.

    Returns
    -------
    tuple
        ``(aqi, determining_pollutant)`` where ``aqi`` is the largest IAQI
        among the pollutants with a valid reading and
        ``determining_pollutant`` is the name of the pollutant that produced
        it. ``(None, None)`` when no pollutant yields a valid IAQI.
    """
    pollutants = ["pm25", "pm10", "o3", "no2", "so2", "co"]

    # One IAQI per pollutant, in the order listed above.
    iaqi_values = {name: calculate_iaqi(row[name], name) for name in pollutants}

    # Remove pollutants with None (invalid IAQI)
    valid_iaqi = {name: iaqi for name, iaqi in iaqi_values.items() if iaqi is not None}

    if not valid_iaqi:
        return None, None  # No valid IAQI

    # The AQI is the highest IAQI. max() returns the first maximal entry, so a
    # tie is resolved in favour of the pollutant listed first.
    determining_pollutant, aqi = max(valid_iaqi.items(), key=lambda item: item[1])

    return aqi, determining_pollutant

In [None]:
# Load the combined measurements; the same path is overwritten further down.
file_path = "~/AQIprediction/combined_data.csv"  # Update with your file path
df = pd.read_csv(file_path)

# Force every pollutant column to a numeric dtype; entries that cannot be parsed become NaN.
numeric_columns = ["pm25", "pm10", "o3", "no2", "so2", "co"]  # Add other columns if necessary
for column_name in numeric_columns:
    df[column_name] = pd.to_numeric(df[column_name], errors="coerce")

# Row-wise AQI computation: each returned (aqi, pollutant) pair is expanded into two columns.
df[["AQI", "Determining_Pollutant"]] = df.apply(
    calculate_aqi_for_row,
    axis=1,
    result_type="expand",
)

# Persist the enriched table back to the input CSV (switch to another path to keep the original).
df.to_csv(file_path, index=False)

# Show the resulting frame (optional)
print(df)