## Urbanization vs. PM2.5 (e.g., 2016)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Step 1: Data Loading
# Urban.csv is a plain comma-separated file, so the pandas defaults apply.
urban = pd.read_csv("Urban.csv", encoding="utf-8")

# "Air quality.csv" is read with sep=None, which makes the Python engine
# detect the delimiter instead of assuming commas.
# Malformed rows are still skipped, but on_bad_lines="warn" reports each one
# so that dropped records do not silently disappear from the analysis.
air_quality = pd.read_csv(
    "Air quality.csv",
    sep=None,
    engine="python",
    encoding="utf-8",
    on_bad_lines="warn",
)


SyntaxError: invalid syntax (381863025.py, line 6)

In [None]:
# Choose the analysis year (e.g., 2016).
# This value is used to filter the air quality rows by "Measurement Year"
# and, converted to a string, to pick the matching year column in the urban data.
selected_year = 2016

In [None]:
# Preview the first five rows of the urban data as a quick check that it loaded as expected.
urban.head()

In [None]:
# Preview the first rows of the air quality data as a quick check that it loaded as expected.
air_quality.head()

In [None]:
# List the urban data's column names; the preprocessing step looks up
# "Country Name" and the column labelled with the selected year.
print(urban.columns)

In [None]:
# List the air quality column names; later steps rely on "Measurement Year",
# "WHO Country Name" and the pollutant columns (PM2.5, NO2, PM10).
print(air_quality.columns)

In [None]:
# Step 2: Data Preprocessing
# Fail early if the year column needed for filtering is absent.
if "Measurement Year" not in air_quality.columns:
    raise KeyError("The 'Measurement Year' column is missing in the air quality data.")

# Coerce the year to numeric; unparseable entries become NaN and therefore
# never match the selected year below.
air_quality["Measurement Year"] = pd.to_numeric(air_quality["Measurement Year"], errors="coerce")

# Keep only the rows measured in the selected year.
# .copy() makes aq_year an independent DataFrame, so the country-name
# assignment further down modifies it directly instead of writing into a
# slice of air_quality (which triggers pandas' SettingWithCopyWarning).
aq_year = air_quality[air_quality["Measurement Year"] == selected_year].copy()

# The urban data is looked up by a column labelled with the year as a string.
year_str = str(selected_year)

# Extract that year's column from the urban data and rename it to "Urbanization".
if year_str not in urban.columns:
    raise KeyError(f"The '{year_str}' column is missing in the Urban data.")
urban_subset = urban[["Country Name", year_str]].rename(columns={year_str: "Urbanization"})

# Normalise country names (trim whitespace, lower-case) on both sides so the
# merge key is not defeated by formatting differences.
aq_year["WHO Country Name"] = aq_year["WHO Country Name"].str.strip().str.lower()
urban_subset["Country Name"] = urban_subset["Country Name"].str.strip().str.lower()

In [None]:
# Step 3: Data Merging
# Ensure the PM2.5 column exists in the air quality data, so a missing
# column fails with a clear message (same check as for NO2 and PM10).
if "PM2.5 (μg/m3)" not in aq_year.columns:
    raise KeyError("The 'PM2.5 (μg/m3)' column is missing in the air quality data.")

# Inner-join on the country name so only countries present in both datasets
# remain; the duplicate key column from the air quality side is then dropped.
merged_df = pd.merge(
    urban_subset,
    aq_year[["WHO Country Name", "PM2.5 (μg/m3)"]],
    left_on="Country Name",
    right_on="WHO Country Name",
    how="inner",
).drop(columns=["WHO Country Name"])

# Preliminary check of the merged data.
print("\n--- Before Data Cleaning ---")
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
merged_df.info()
print(merged_df.head())


In [None]:
# Step 4: Data Cleaning
# Convert "Urbanization" and "PM2.5 (μg/m3)" to numeric (non-numeric values become NaN).
merged_df["Urbanization"] = pd.to_numeric(merged_df["Urbanization"], errors="coerce")
merged_df["PM2.5 (μg/m3)"] = pd.to_numeric(merged_df["PM2.5 (μg/m3)"], errors="coerce")

# Remove rows with missing (NaN) values in either analysis column.
merged_df = merged_df.dropna(subset=["Urbanization", "PM2.5 (μg/m3)"])

# Remove rows that contain an infinite value in any column.
merged_df = merged_df[~merged_df.isin([np.inf, -np.inf]).any(axis=1)]

# Filter out unrealistic values:
# - Urbanization should be within 0 to 100 (as percentages).
# - PM2.5 values should be non-negative.
merged_df = merged_df[(merged_df["Urbanization"] >= 0) & (merged_df["Urbanization"] <= 100)]
merged_df = merged_df[merged_df["PM2.5 (μg/m3)"] >= 0]

# Check the cleaned data.
print("\n--- After Data Cleaning ---")
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
merged_df.info()
print(merged_df.describe())

In [None]:
# Step 5: Data Analysis
if not merged_df.empty:
    # Response variable shared by the plot, the correlation and the regression.
    y = merged_df["PM2.5 (μg/m3)"]

    # Scatter plot: urbanization on the x-axis, PM2.5 on the y-axis.
    plt.figure(figsize=(8, 6))
    plt.scatter(merged_df["Urbanization"], y)
    plt.xlabel(f"Urbanization ({selected_year})")
    plt.ylabel(f"PM2.5 (μg/m3) ({selected_year})")
    plt.title(f"Urbanization vs. PM2.5 ({selected_year})")
    plt.grid(True)
    plt.show()

    # Pearson correlation coefficient between the two series.
    correlation = merged_df["Urbanization"].corr(y)
    print(f"Correlation between Urbanization and PM2.5 ({selected_year}): {correlation:.3f}")

    # OLS regression of PM2.5 on Urbanization; add_constant adds the
    # intercept column to the design matrix.
    X = sm.add_constant(merged_df["Urbanization"])
    model = sm.OLS(y, X).fit()

    # Report the fitted model.
    print("\nRegression Model Summary:")
    print(model.summary())
else:
    # Nothing survived cleaning, so there is nothing to plot or fit.
    print("The merged dataset is empty after cleaning. Please review the cleaning rules or input data.")

## Urbanization vs. NO2 (e.g., 2016)

In [None]:
# Step 3: Data Merging
# Ensure the NO2 column exists in the air quality data.
if "NO2 (μg/m3)" not in aq_year.columns:
    raise KeyError("The 'NO2 (μg/m3)' column is missing in the air quality data.")

# Inner-join on the country name so only countries present in both datasets
# remain; the duplicate key column from the air quality side is then dropped.
merged_df = pd.merge(
    urban_subset,
    aq_year[["WHO Country Name", "NO2 (μg/m3)"]],
    left_on="Country Name",
    right_on="WHO Country Name",
    how="inner",
).drop(columns=["WHO Country Name"])

print("\n--- Before Data Cleaning ---")
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
merged_df.info()
print(merged_df.head())

In [None]:
# Step 4: Data Cleaning
# Convert "Urbanization" and "NO2 (μg/m3)" to numeric (non-numeric values become NaN).
merged_df["Urbanization"] = pd.to_numeric(merged_df["Urbanization"], errors="coerce")
merged_df["NO2 (μg/m3)"] = pd.to_numeric(merged_df["NO2 (μg/m3)"], errors="coerce")

# Drop rows with missing values in either analysis column.
merged_df = merged_df.dropna(subset=["Urbanization", "NO2 (μg/m3)"])

# Remove rows that contain an infinite value in any column.
merged_df = merged_df[~merged_df.isin([np.inf, -np.inf]).any(axis=1)]

# Filter out unrealistic values:
# - Urbanization should be between 0 and 100
# - NO2 values should be non-negative
merged_df = merged_df[(merged_df["Urbanization"] >= 0) & (merged_df["Urbanization"] <= 100)]
merged_df = merged_df[merged_df["NO2 (μg/m3)"] >= 0]

print("\n--- After Data Cleaning ---")
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
merged_df.info()
print(merged_df.describe())


In [None]:
# Step 5: Data Analysis
if not merged_df.empty:
    # Response variable shared by the plot, the correlation and the regression.
    y = merged_df["NO2 (μg/m3)"]

    # Scatter plot: urbanization on the x-axis, NO2 on the y-axis.
    plt.figure(figsize=(8, 6))
    plt.scatter(merged_df["Urbanization"], y)
    plt.xlabel(f"Urbanization ({selected_year})")
    plt.ylabel(f"NO2 (μg/m3) ({selected_year})")
    plt.title(f"Urbanization vs. NO2 ({selected_year})")
    plt.grid(True)
    plt.show()

    # Pearson correlation coefficient between the two series.
    correlation = merged_df["Urbanization"].corr(y)
    print(f"Correlation between Urbanization and NO2 ({selected_year}): {correlation:.3f}")

    # OLS regression of NO2 on Urbanization; add_constant adds the
    # intercept column to the design matrix.
    X = sm.add_constant(merged_df["Urbanization"])
    model = sm.OLS(y, X).fit()

    # Report the fitted model.
    print("\nRegression Model Summary:")
    print(model.summary())
else:
    # Nothing survived cleaning, so there is nothing to plot or fit.
    print("The cleaned merged dataset is empty. Please review the cleaning rules or input data.")

## Urbanization vs. PM10 (e.g., 2016)

In [None]:
# Step 3: Data Merging
# Ensure the PM10 column exists in the air quality data.
if "PM10 (μg/m3)" not in aq_year.columns:
    raise KeyError("The 'PM10 (μg/m3)' column is missing in the air quality data.")

# Inner-join on the country name so only countries present in both datasets
# remain; the duplicate key column from the air quality side is then dropped.
merged_df = pd.merge(
    urban_subset,
    aq_year[["WHO Country Name", "PM10 (μg/m3)"]],
    left_on="Country Name",
    right_on="WHO Country Name",
    how="inner",
).drop(columns=["WHO Country Name"])

print("\n--- Before Data Cleaning ---")
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
merged_df.info()
print(merged_df.head())


In [None]:
# Step 4: Data Cleaning
# Convert "Urbanization" and "PM10 (μg/m3)" to numeric (non-numeric values become NaN).
merged_df["Urbanization"] = pd.to_numeric(merged_df["Urbanization"], errors="coerce")
merged_df["PM10 (μg/m3)"] = pd.to_numeric(merged_df["PM10 (μg/m3)"], errors="coerce")

# Remove rows with missing (NaN) values in either analysis column.
merged_df = merged_df.dropna(subset=["Urbanization", "PM10 (μg/m3)"])

# Remove rows that contain an infinite value in any column.
merged_df = merged_df[~merged_df.isin([np.inf, -np.inf]).any(axis=1)]

# Filter out unrealistic values:
# - Urbanization should be within 0 to 100.
# - PM10 should be non-negative.
merged_df = merged_df[(merged_df["Urbanization"] >= 0) & (merged_df["Urbanization"] <= 100)]
merged_df = merged_df[merged_df["PM10 (μg/m3)"] >= 0]

print("\n--- After Data Cleaning ---")
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
merged_df.info()
print(merged_df.describe())

In [None]:
# Step 5: Data Analysis
if not merged_df.empty:
    # Response variable shared by the plot, the correlation and the regression.
    y = merged_df["PM10 (μg/m3)"]

    # Scatter plot: urbanization on the x-axis, PM10 on the y-axis.
    plt.figure(figsize=(8, 6))
    plt.scatter(merged_df["Urbanization"], y)
    plt.xlabel(f"Urbanization ({selected_year})")
    plt.ylabel(f"PM10 (μg/m³) ({selected_year})")
    plt.title(f"Urbanization vs. PM10 ({selected_year})")
    plt.grid(True)
    plt.show()

    # Pearson correlation coefficient between the two series.
    correlation = merged_df["Urbanization"].corr(y)
    print(f"Correlation between Urbanization and PM10 ({selected_year}): {correlation:.3f}")

    # OLS regression of PM10 on Urbanization; add_constant adds the
    # intercept column to the design matrix.
    X = sm.add_constant(merged_df["Urbanization"])
    model = sm.OLS(y, X).fit()

    # Report the fitted model.
    print("\nRegression Model Summary:")
    print(model.summary())
else:
    # Nothing survived cleaning, so there is nothing to plot or fit.
    print("The cleaned merged dataset is empty. Please review the cleaning rules or input data.")