## Data Loading and Preparation


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor


# Read the combined air-quality / geolocation / weather table into `df`
df = pd.read_csv(filepath_or_buffer="Final_AirQuality_Geolocation_weather.csv")

# Preview the first five rows (rendered as this cell's output)
df.head(n=5)

## Exploratory Data Analysis


In [None]:
# Basic statistics.
# A notebook cell only renders its LAST bare expression, so the summary table
# has to be shown explicitly with display() (a builtin inside Jupyter/IPython
# kernels); otherwise it is computed and silently discarded.
display(df.describe())

# Checking for missing values: count of nulls per column (cell output)
df.isnull().sum()

In [None]:
# Create the pairplot of the four pollutant averages and the renewable column.
# sns.pairplot always builds its own figure, so a preceding plt.figure(...) call
# would only leave an empty extra figure behind. The overall size is set through
# `height` instead: 5 variables x 3 inches per facet = a 15 x 15 inch grid.
pairplot = sns.pairplot(
    df[
        [
            "Air Pollution Average [ug/m3] NO2",
            "Air Pollution Average [ug/m3] PM10",
            "Air Pollution Average [ug/m3] O3",
            "Air Pollution Average [ug/m3] PM2.5",
            "Average Renewable Data",
        ]
    ],
    height=3,
    plot_kws={"color": "skyblue"},
)

# Rotate x-axis and y-axis tick labels to prevent overlap
for ax in pairplot.axes.flatten():
    # Rotate x-axis labels
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    # Rotate y-axis labels
    plt.setp(ax.get_yticklabels(), rotation=45)

# Adjust the spacing of the plots to prevent title overlap and improve layout
pairplot.fig.subplots_adjust(top=0.92, wspace=0.2, hspace=0.2)

# Display the plot
plt.show()

## Correlation Analysis


In [None]:
# Variables entering the correlation study: pollutant averages, renewable
# data, economic/demographic indicators and premature-death counts
columns_of_interest = [
    "Air Pollution Average [ug/m3] NO2",
    "Air Pollution Average [ug/m3] O3",
    "Air Pollution Average [ug/m3] PM10",
    "Air Pollution Average [ug/m3] PM2.5",
    "Average Renewable Data",
    "Mean GDP",
    "Population",
    "Populated Area [km2]",
    "Premature Deaths NO2",
    "Premature Deaths O3",
    "Premature Deaths PM10",
    "Premature Deaths PM2.5",
]

# Pairwise correlation between the selected columns
correlation_matrix = df.loc[:, columns_of_interest].corr()

# Annotated heatmap of the matrix, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

## Impact of Economic and Demographic Factors


### Analysis of population density and air pollution


In [None]:
# Calculate the population density (people per km² of populated area)
df["Population Density"] = df["Population"] / df["Populated Area [km2]"]

# List of pollutants to plot against population density
pollutants = [
    "Air Pollution Average [ug/m3] NO2",
    "Air Pollution Average [ug/m3] PM10",
    "Air Pollution Average [ug/m3] O3",
    "Air Pollution Average [ug/m3] PM2.5",
]

# Create a scatter plot for each pollutant
for pollutant in pollutants:
    # The figure must be created INSIDE the loop: plt.show() finishes the
    # current figure, so a single plt.figure() before the loop would size only
    # the first plot and leave the rest at matplotlib's default size.
    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=df, x="Population Density", y=pollutant)
    plt.title(f"Population Density vs. {pollutant}")
    plt.xlabel("Population Density (people per km²)")
    plt.ylabel(f"{pollutant} (µg/m³)")
    plt.show()

## Health Outcomes Analysis


### Relationship between all air pollutants and their corresponding health outcomes.


In [None]:
# Map each pollutant column to its corresponding premature-deaths column
pollutants_health_outcomes = {
    "Air Pollution Average [ug/m3] NO2": "Premature Deaths NO2",
    "Air Pollution Average [ug/m3] PM10": "Premature Deaths PM10",
    "Air Pollution Average [ug/m3] O3": "Premature Deaths O3",
    "Air Pollution Average [ug/m3] PM2.5": "Premature Deaths PM2.5",
}

# Create a scatter plot for each pollutant and its health outcome
for pollutant, health_outcome in pollutants_health_outcomes.items():
    # Create the figure per iteration so that every plot (not only the first
    # one) gets the 12 x 8 size; plt.show() finishes the current figure.
    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=df, x=pollutant, y=health_outcome)
    plt.title(f"{health_outcome} vs. {pollutant}")
    plt.xlabel(f"{pollutant} (µg/m³)")
    plt.ylabel(f"{health_outcome}")
    plt.show()

## Predictive Modeling 


In [None]:
# Predict the PM10 average from the renewable-data and population columns
predictor_columns = ["Average Renewable Data", "Population"]
X = df[predictor_columns]
y = df["Air Pollution Average [ug/m3] PM10"]

# Hold out 30% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# fit() returns the estimator itself, so construction and fitting can be chained
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Model evaluation on the held-out rows
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)
print("Mean squared error:", test_mse)
print("Coefficient of determination (R^2):", test_r2)

### Linear regression to assess the impact of economic and demographic factors on air pollution levels


In [None]:
# Function to perform linear regression and display results
def perform_regression(df, independent_vars, dependent_var):
    """
    Fit a LinearRegression model on the whole frame and print its parameters.

    Parameters:
    df (DataFrame): The pandas DataFrame holding predictors and response.
    independent_vars (list): Column names used as predictors.
    dependent_var (str): Column name of the response variable.

    Nothing is returned; the intercept and one coefficient per predictor
    are printed.
    """
    # No train/test split here: the model is fitted on every row of `df`
    fitted = LinearRegression().fit(df[independent_vars], df[dependent_var])

    print(f"Regression Model for predicting {dependent_var}:")
    print(f"Intercept: {fitted.intercept_}")
    # Pair each predictor name with its fitted coefficient
    for name, weight in zip(independent_vars, fitted.coef_):
        print(f"Coefficient for {name}: {weight}")


# NO2 serves as the example pollutant for all three regressions below
no2_column = "Air Pollution Average [ug/m3] NO2"

# Economic factors -> NO2 level
economic_vars = ["Mean GDP", "Population"]
perform_regression(df, economic_vars, no2_column)

# Demographic factor (populated area) -> NO2 level
demographic_vars = ["Populated Area [km2]"]
perform_regression(df, demographic_vars, no2_column)

# NO2 level -> associated premature deaths
health_outcome = "Premature Deaths NO2"
perform_regression(df, [no2_column], health_outcome)

### Random Forest model 


In [None]:
# Response variable and the predictors fed to the Random Forest
target = "Air Pollution Average [ug/m3] PM10"
features = [
    "Mean GDP",
    "Temperature",
    "Wind Speed",
    "Population",
    "Average Renewable Data",
    "Air Pollution Average [ug/m3] NO2",
    "Air Pollution Average [ug/m3] O3",
]

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[target], test_size=0.3, random_state=42
)

# Base estimator whose hyperparameters are tuned below
rf_model = RandomForestRegressor(random_state=42)

# Exhaustive search over the grid, scored by negative MSE with 3-fold CV
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)
grid_search = GridSearchCV(
    rf_model,
    param_grid,
    scoring="neg_mean_squared_error",
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print(f"Best hyperparameters: {grid_search.best_params_}")

# Estimator with the best grid-search configuration
best_rf_model = grid_search.best_estimator_

# 5-fold cross-validated R^2 of that configuration on the training split
cv_scores = cross_val_score(best_rf_model, X_train, y_train, scoring="r2", cv=5)
print(f"Cross-Validation R^2 scores: {cv_scores}")
print(f"Mean CV R^2: {np.mean(cv_scores)}")

# Held-out evaluation
y_pred = best_rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
print(f"Coefficient of Determination (R^2): {r2}")

# Feature importances, listed from most to least important
importances = best_rf_model.feature_importances_
sorted_indices = np.argsort(importances)[::-1]
ranked_features = [(features[i], importances[i]) for i in sorted_indices]

print("Feature Importances:")
for feature_name, importance in ranked_features:
    print(f"{feature_name}: {importance}")

## Comparative Analysis of Renewable Energy Consumption in Europe


In [None]:
# One bar per country showing its "Average Renewable Data" value,
# drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x="Country", y="Average Renewable Data", ax=ax)
# Tilt the country names so they do not overlap
plt.xticks(rotation=45)
plt.show()

## Trend Analysis

### Analyzing the Shift Towards Renewables


In [None]:
# For every pollutant: average its level per 'Year' and plot the yearly trend.

pollutants = [
    "Air Pollution Average [ug/m3] NO2",
    "Air Pollution Average [ug/m3] O3",
    "Air Pollution Average [ug/m3] PM10",
    "Air Pollution Average [ug/m3] PM2.5",
]

# One line chart per pollutant
for pollutant in pollutants:
    # as_index=False keeps 'Year' as a regular column next to the yearly mean
    trend_data = df.groupby("Year", as_index=False)[pollutant].mean()

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(trend_data["Year"], trend_data[pollutant], marker="o")
    ax.set_title(f"Average {pollutant} Levels Over Years")
    ax.set_xlabel("Year")
    ax.set_ylabel(f"{pollutant} Level (µg/m^3)")
    ax.grid(True)
    plt.show()

## Geolocation and Average AQI Analysis


### Visualizing the AQI of European Countries


In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import ast

In [None]:
# Define the required functions



def correct_coordinates(data, country, new_coords):
    """
    Overwrite the stored coordinates of one country, modifying `data` in place.

    Parameters:
    data (DataFrame): The pandas DataFrame containing the data.
    country (str): The name of the country to correct coordinates for.
    new_coords (tuple): The new coordinates in the format (latitude, longitude).

    The value is written as its string form (str(new_coords)) into the
    "coordinates" column of every row whose "Country" equals `country`.
    Nothing is returned.
    """
    is_country = data["Country"] == country
    data.loc[is_country, "coordinates"] = str(new_coords)



def convert_to_tuple(coord_str):
    """
    Convert a string representation of coordinates to a tuple,
    rounding to two decimal places.

    Parameters:
    coord_str (str): The string representation of the coordinates,
        e.g. "(46.2276, 2.2137)".

    Returns:
    tuple: The first two elements of the parsed value, each rounded to two
        decimals, or (None, None) when the input cannot be parsed into at
        least two roundable numbers (including non-string input such as NaN).
    """
    try:
        # literal_eval only evaluates Python literals, never arbitrary code
        coord = ast.literal_eval(coord_str)
        return round(coord[0], 2), round(coord[1], 2)
    except (ValueError, SyntaxError, TypeError, IndexError, KeyError):
        # Only the expected parse/shape failures are treated as "no coordinates".
        # A bare `except:` would also swallow KeyboardInterrupt and SystemExit,
        # making a long .apply() impossible to interrupt.
        return (None, None)



def create_geodataframe(data):
    """
    Build a point GeoDataFrame from the "coordinates" column of `data`.

    Parameters:
    data (DataFrame): The pandas DataFrame containing the data. Each entry of
        its "coordinates" column must be indexable; element 1 is used as the
        x value and element 0 as the y value of the point.

    Returns:
    GeoDataFrame: `data` with a point geometry column, CRS set to EPSG:4326.
    """
    coordinate_pairs = data["coordinates"]
    x_values = [pair[1] for pair in coordinate_pairs]
    y_values = [pair[0] for pair in coordinate_pairs]

    point_geometry = gpd.points_from_xy(x_values, y_values)
    gdf = gpd.GeoDataFrame(data, geometry=point_geometry)

    # Tag the points with EPSG:4326 in place
    gdf.set_crs(epsg=4326, inplace=True)
    return gdf



def _load_world_boundaries():
    """Return country polygons used as the map background."""
    try:
        # Bundled Natural Earth sample; only shipped with GeoPandas < 1.0
        return gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
    except AttributeError:
        # GeoPandas >= 1.0 removed `geopandas.datasets` and raises
        # AttributeError here. Fall back to reading the Natural Earth 1:110m
        # countries layer directly (requires network access).
        return gpd.read_file(
            "https://naciscdn.org/naturalearth/110m/cultural/"
            "ne_110m_admin_0_countries.zip"
        )


def plot_map(gdf):
    """
    Plot the map using the provided GeoDataFrame.

    Country boundaries are drawn as background, the points of `gdf` are
    coloured by their "Average AQI" value, and every point is labelled with
    its "Country" value.

    Parameters:
    gdf (GeoDataFrame): The GeoDataFrame containing the data to plot. It must
        provide point geometries plus "Average AQI" and "Country" columns.
    """
    fig, ax = plt.subplots(1, 1, figsize=(15, 10))

    world = _load_world_boundaries()
    world.boundary.plot(ax=ax, linewidth=1)

    gdf.plot(
        ax=ax,
        column="Average AQI",
        legend=True,
        cmap="coolwarm",
        markersize=150,
        alpha=0.6,
        edgecolor="black",
        linewidth=0.5,
    )

    # Restrict the view to the window containing the plotted countries
    ax.set_xlim(-20, 50)  # Longitude limits
    ax.set_ylim(30, 75)  # Latitude limits

    # Label each point; the label is shifted 1 unit to the right of the
    # marker so the text does not sit on top of it
    for idx, row in gdf.iterrows():
        ax.annotate(
            text=row["Country"],
            xy=(row["geometry"].x + 1, row["geometry"].y),
            horizontalalignment="left",
            fontsize=8,
            color="black",
            weight="light",
        )

    ax.set_title(
        "Average Air Quality Index (AQI) of European Countries (Based on Corrected Coordinates)"
    )

    plt.show()

In [None]:
# Fresh copy of the dataset for the geolocation analysis
data = pd.read_csv("Final_AirQuality_Geolocation_weather.csv")

# Manual coordinate overrides per country, given as (latitude, longitude)
coordinate_fixes = {
    "France": (46.23, 2.21),
    "Denmark": (56.26, 9.50),
    "Finland": (64.95, 25.75),
}
for country_name, lat_lon in coordinate_fixes.items():
    correct_coordinates(data, country_name, lat_lon)

# Parse the coordinate strings into (rounded) tuples
data["coordinates"] = data["coordinates"].apply(convert_to_tuple)

# Row-wise mean of the four annual AQI columns
aqi_columns = [
    "Annual AQI NO2",
    "Annual AQI O3",
    "Annual AQI PM10",
    "Annual AQI PM2.5",
]
data["Average AQI"] = data[aqi_columns].mean(axis=1)

# Build the point GeoDataFrame and draw the map
gdf = create_geodataframe(data)
plot_map(gdf)

* Northern Europe: Displays low AQI, indicating clean air.
* Central Europe: Shows mixed AQI levels; certain areas, including Czech Republic and Poland, exhibit higher AQI, suggesting poorer air quality.
* Southern Europe: Generally moderate AQI levels, with variations among countries.
* Western Europe: Ranges from good to moderate AQI, with smaller countries exhibiting varying air quality.
* Eastern Europe: Predominantly low AQI, reflecting cleaner air, with some exceptions like Bulgaria showing moderate levels.

**Conclusion**: The AQI varies across Europe, with Northern and Eastern Europe enjoying cleaner air overall. Central Europe has areas of concern with higher AQI, whereas Southern and Western Europe have moderate air quality, necessitating region-specific air quality management strategies.


### Impact of Elevation on Average AQI


In [None]:
# Side-by-side histograms (with KDE overlay) of elevation and average AQI
fig, (ax_elevation, ax_aqi) = plt.subplots(1, 2, figsize=(15, 6))

sns.histplot(data["elevation"], kde=True, ax=ax_elevation)
ax_elevation.set_title("Distribution of Elevation")

sns.histplot(data["Average AQI"], kde=True, ax=ax_aqi)
ax_aqi.set_title("Distribution of Average AQI")

plt.show()

In [None]:
# Keep only the rows where BOTH variables are present. Calling dropna() on
# each column separately can produce series of different lengths (which makes
# scipy raise) or, worse, silently pair values that come from different rows.
elevation_aqi = data[["elevation", "Average AQI"]].dropna()

# Performing Pearson correlation analysis
pearson_corr, p_value_pearson = stats.pearsonr(
    elevation_aqi["elevation"], elevation_aqi["Average AQI"]
)

# Performing Linear Regression analysis (x = elevation, y = average AQI)
slope, intercept, r_value, p_value_reg, std_err = stats.linregress(
    elevation_aqi["elevation"], elevation_aqi["Average AQI"]
)

# Cell output: correlation and regression statistics side by side
pearson_corr, p_value_pearson, r_value, p_value_reg

**Data Distribution Analysis**

Elevation data is skewed, with a histogram showing a tail, indicating a non-normal distribution.
Average AQI data is more normally distributed but exhibits some skewness.

**Correlation Analysis**

A Pearson Correlation Coefficient of 0.278 indicates a weak positive correlation between elevation and average AQI.
A p-value of 0.0217 suggests the correlation is statistically significant but should be cautiously interpreted due to the skewness in the elevation data.

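**Linear Regression Analysis**

An R-value of 0.278 matches the Pearson coefficient, as it must: for a regression with a single predictor, the R-value and its p-value (0.0217) are mathematically identical to the Pearson correlation and its p-value. The regression therefore restates the weak positive relationship between elevation and average AQI rather than independently confirming its statistical significance.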

**Conclusion**

The statistical analysis indicates a weak but significant positive correlation between elevation and average AQI. The non-normal distribution of the elevation data calls for caution in interpreting these results and suggests that further analysis with alternative statistical models could be beneficial.
