In [247]:
import pandas as pd
import ssl
import numpy as np
import plotly.express as px
from scipy.stats import linregress
import plotly.graph_objects as go

In [248]:
# Disable SSL certificate verification
ssl._create_default_https_context = ssl._create_unverified_context

def clean_dataset(df):
    """Cleans the dataset: removes duplicates, handles missing values, sorts, and drops unnecessary columns."""
    df_cleaned = df.drop_duplicates()

    # Handle missing values (NaN)
    if df_cleaned.isnull().any().any():
        # Fill NaN values with the mean of numeric columns
        df_cleaned = df_cleaned.fillna(df_cleaned.mean(numeric_only=True))

    # Sort data by a specific column (e.g., Year)
    if "Year" in df_cleaned.columns:
        df_cleaned = df_cleaned.sort_values(by="Year")

    # Drop irrelevant columns (e.g., ID)
    if "ID" in df_cleaned.columns:
        df_cleaned = df_cleaned.drop(columns=["ID"])

    return df_cleaned

In [249]:
def fetch_data():
    """Fetches datasets from the specified URLs."""
    emissions_data = pd.read_csv(
        "https://ourworldindata.org/grapher/annual-co-emissions-by-region.csv?v=1&csvType=full&useColumnShortNames=true",
        storage_options={'User-Agent': 'Our World In Data data fetch/1.0'}
    )

    temperature_data = pd.read_csv(
        "https://ourworldindata.org/grapher/monthly-average-surface-temperatures-by-year.csv?v=1&csvType=full&useColumnShortNames=false",
        storage_options={'User-Agent': 'Our World In Data data fetch/1.0'}
    )

    return emissions_data, temperature_data

In [250]:
def transform_temperature_data(temperature_data):
    """Transforms the temperature dataset into an analyzable format."""
    temperature_data = clean_dataset(temperature_data)

    # Ensure column names for years are numeric
    temperature_data.columns = [
        int(col) if col.isdigit() else col
        for col in temperature_data.columns
    ]

    # Unpivot temperature data
    temperature_melted = temperature_data.melt(
        id_vars=["Entity", "Code", "Year"],
        var_name="Year_Column",
        value_name="Temperature"
    )

    # Convert 'Year_Column' to numeric values
    temperature_melted["Year_Column"] = pd.to_numeric(temperature_melted["Year_Column"], errors="coerce")

    # Rename columns
    temperature_melted.rename(columns={"Year_Column": "Year", "Year": "Month"}, inplace=True)

    return temperature_melted


In [251]:
def filter_data(temperature_data, emissions_data, regions):
    """Filters the temperature and emissions data for the specified regions and shared years."""

    temperature_filtered = temperature_data[temperature_data["Entity"].isin(regions)]
    emissions_filtered = emissions_data[emissions_data["Entity"].isin(regions)]

    # Find common years
    common_years = set(temperature_filtered["Year"]).intersection(set(emissions_filtered["Year"]))

    temperature_filtered = temperature_filtered[temperature_filtered["Year"].isin(common_years)]
    emissions_filtered = emissions_filtered[emissions_filtered["Year"].isin(common_years)]

    return temperature_filtered, emissions_filtered



In [252]:
def merge_datasets(temperature_data, emissions_data):
    """Merges the temperature and emissions datasets into a single dataset."""
    combined_data = pd.merge(
        temperature_data,
        emissions_data,
        on=["Entity", "Year"],
        how="inner"
    )
    return combined_data

In [253]:
def plot_temp_by_country(emissions_data, region_countries, region_name):
    """Erstellt ein interaktives Liniendiagramm der CO2-Emissionen für alle Länder einer Region."""
    # Daten filtern
    region_data = emissions_data[emissions_data["Entity"].isin(region_countries)]

    # Interaktives Plotly-Diagramm erstellen
    fig = px.line(
        region_data,
        x="Year",
        y="Temperature",
        color="Entity",
        title=f"Temperature in {region_name} nach Ländern",
        labels={"Temperature": "Temperature", "Year": "Jahr", "Entity": "Entity"}
    )

    # Layout anpassen
    fig.update_layout(
        legend_title_text="Länder",
        title_font_size=16,
        xaxis_title_font_size=12,
        yaxis_title_font_size=12
    )

    # Grafik anzeigen
    fig.show()

In [254]:
def calculate_p_values(df_combined):
    """Berechnet die lineare Regression und die p-Werte für jede Region."""
    results = []
    
    for region in df_combined["Region"].unique():
        region_data = df_combined[df_combined["Region"] == region]
        x = region_data["Year"].values
        y = region_data["emissions_total"].values

        # Lineare Regression
        slope, intercept, r_value, p_value, std_err = linregress(x, y)

        # Ergebnisse speichern
        results.append({
            "Region": region,
            "Slope": slope,
            "Intercept": intercept,
            "R-squared": r_value**2,
            "P-value": p_value
        })

    return pd.DataFrame(results)

In [255]:
emissions_data, temperature_data = fetch_data()

# Transform temperature data
temperature_melted = transform_temperature_data(temperature_data)

temperature_melted.head()

Unnamed: 0,Entity,Code,Month,Year,Temperature
0,Afghanistan,AFG,1,2024,3.300064
1,Rwanda,RWA,1,2024,19.979807
2,Cook Islands,COK,1,2024,25.643768
3,Saint Helena,SHN,1,2024,23.700409
4,Congo,COG,1,2024,26.152718


In [256]:
emissions_data.head()

Unnamed: 0,Entity,Code,Year,emissions_total
0,Afghanistan,AFG,1949,14656.0
1,Afghanistan,AFG,1950,84272.0
2,Afghanistan,AFG,1951,91600.0
3,Afghanistan,AFG,1952,91600.0
4,Afghanistan,AFG,1953,106256.0


In [257]:
    # Define regions
north_america_countries = [
        "Antigua and Barbuda", "Bahamas", "Belize", "Costa Rica",
        "Dominican Republic", "El Salvador", "Haiti", "Honduras", 
        "Jamaica", "Canada", "Cuba", "Mexico", "Nicaragua", 
        "Panama", "Trinidad and Tobago", "United States"
]

south_america_countries = [
        "Argentina", "Bolivia", "Brazil", "Chile", "Ecuador",
        "Guyana", "Colombia", "Paraguay", "Peru", "Suriname",
        "Uruguay", "Venezuela", "Guatemala"
]

all_americas = north_america_countries + south_america_countries

In [258]:
# Gruppieren nach Jahr und Entity, Berechnung der jährlichen Werte
yearly_summary_n = temperature_melted.groupby(["Entity", "Year"]).agg({
    "Temperature": "mean",
}).reset_index()

# Umbenennen der Spalten für besseres Format
yearly_summary_n.rename(columns={
    "Entity": "Entity",
    "Year": "Year",
    "Temperature": "Temperature"
}, inplace=True)

yearly_summary_n.head()


Unnamed: 0,Entity,Year,Temperature
0,Afghanistan,1950,10.231125
1,Afghanistan,1951,11.242811
2,Afghanistan,1952,11.292934
3,Afghanistan,1953,11.445144
4,Afghanistan,1954,11.093394


In [259]:
# Filter and merge datasets for North America
yearly_summary_na_filtered, emissions_filtered_na = filter_data(yearly_summary_n, emissions_data, north_america_countries)


In [260]:
combined_filtered_data_na = merge_datasets(yearly_summary_na_filtered, emissions_filtered_na)
combined_filtered_data_na.head()

Unnamed: 0,Entity,Year,Temperature,Code,emissions_total
0,Antigua and Barbuda,1950,25.377907,ATG,12213.0
1,Antigua and Barbuda,1951,25.850764,ATG,19541.0
2,Antigua and Barbuda,1952,25.956604,ATG,14656.0
3,Antigua and Barbuda,1953,25.972994,ATG,17099.0
4,Antigua and Barbuda,1954,25.597811,ATG,12213.0


In [261]:
# Gruppieren nach Jahr und Entity, Berechnung der jährlichen Werte
yearly_summarynorden = combined_filtered_data_na.groupby( "Year").agg({
    "Temperature": "mean",  # Durchschnittstemperatur über alle Monate
    "emissions_total": "sum"  # Gesamtemissionen über alle Monate
}).reset_index()

yearly_summarynorden.head()

Unnamed: 0,Year,Temperature,emissions_total
0,1950,20.861395,2736695000.0
1,1951,21.306945,2828319000.0
2,1952,21.487919,2760658000.0
3,1953,21.64327,2823216000.0
4,1954,21.200929,2701530000.0


In [262]:
# Filter and merge datasets for North America
yearly_summary_sa_filtered, emissions_filtered_na = filter_data(yearly_summary_n, emissions_data, south_america_countries)

In [263]:
sueden = merge_datasets(yearly_summary_sa_filtered, emissions_filtered_na)
sueden.head()

Unnamed: 0,Entity,Year,Temperature,Code,emissions_total
0,Argentina,1950,14.214175,ARG,29921192.0
1,Argentina,1951,14.407666,ARG,34962916.0
2,Argentina,1952,14.316195,ARG,36095124.0
3,Argentina,1953,14.520419,ARG,35134740.0
4,Argentina,1954,13.677547,ARG,36750356.0


In [264]:
# Grouping data by 'Year' and calculating annual statistics
yearly_summarysouth = sueden.groupby("Year").agg({
    "Temperature": "mean",  # Calculate the average temperature across all months for each year
    "emissions_total": "sum"  # Sum the total emissions across all months for each year
}).reset_index()

# Display the first few rows of the resulting summary
yearly_summarysouth.head()

Unnamed: 0,Year,Temperature,emissions_total
0,1950,19.933421,114035625.0
1,1951,20.239919,131914233.0
2,1952,20.264052,142109627.0
3,1953,20.44468,141313611.0
4,1954,19.86674,151785007.0


In [265]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_emissions_by_country_large_graph(emissions_data_na, countries_na, region_na, 
                                          emissions_data_sa, countries_sa, region_sa, output_file):
    """
    Creates a large side-by-side graph for CO2 emissions by countries in two regions 
    and saves it as an image.

    Parameters:
        emissions_data_na (DataFrame): CO2 emissions data for the first region (e.g., North America).
        countries_na (list): List of country names for the first region.
        region_na (str): Name of the first region.
        emissions_data_sa (DataFrame): CO2 emissions data for the second region (e.g., South America).
        countries_sa (list): List of country names for the second region.
        region_sa (str): Name of the second region.
        output_file (str): File path to save the output image.
    """
    # Filter data for the specified countries in each region
    region_data_na = emissions_data_na[emissions_data_na["Entity"].isin(countries_na)]
    region_data_sa = emissions_data_sa[emissions_data_sa["Entity"].isin(countries_sa)]

    # Create subplots for side-by-side visualization
    fig = make_subplots(
        rows=1, cols=2, 
        subplot_titles=(f"CO2 Emissions in {region_na}", f"CO2 Emissions in {region_sa}")
    )

    # Add North America data to the first subplot
    for country in countries_na:
        country_data = region_data_na[region_data_na["Entity"] == country]
        fig.add_trace(
            go.Scatter(
                x=country_data["Year"], 
                y=country_data["emissions_total"], 
                mode='lines', 
                name=f"{country} ({region_na})"
            ),
            row=1, col=1
        )

    # Add South America data to the second subplot
    for country in countries_sa:
        country_data = region_data_sa[region_data_sa["Entity"] == country]
        fig.add_trace(
            go.Scatter(
                x=country_data["Year"], 
                y=country_data["emissions_total"], 
                mode='lines', 
                name=f"{country} ({region_sa})"
            ),
            row=1, col=2
        )

    # Customize the layout
    fig.update_layout(
        title_text="CO2 Emissions by Country: North America vs. South America",
        title_font_size=20,
        showlegend=True,
        legend_title_text="Countries",
        xaxis_title="Year",
        yaxis_title="CO2 Emissions (Million Tons)",
        height=800,  # Height of the graph in pixels
        width=1600   # Width of the graph in pixels
    )

    # Save the graph as an image
    fig.write_image(output_file)
    print(f"The graph has been successfully saved as '{output_file}'.")

plot_emissions_by_country_large_graph(
    combined_filtered_data_na, north_america_countries, "Nordamerika",
    sueden, south_america_countries, "Südamerika",
    "co2_emissions_large_graph.png"
)


The graph has been successfully saved as 'co2_emissions_large_graph.png'.


In [266]:
def plot_temperature_by_region_large_graph(temp_data_na, countries_na, region_na, 
                                           temp_data_sa, countries_sa, region_sa, output_file):
    """
    Creates a large side-by-side graph comparing temperatures for two regions and saves it as an image file.

    Parameters:
        temp_data_na (DataFrame): Temperature data for North America.
        countries_na (list): List of countries in North America.
        region_na (str): Name of the North American region.
        temp_data_sa (DataFrame): Temperature data for South America.
        countries_sa (list): List of countries in South America.
        region_sa (str): Name of the South American region.
        output_file (str): File path to save the output image.
    """
    # Filter data for the specified countries
    region_data_na = temp_data_na[temp_data_na["Entity"].isin(countries_na)]
    region_data_sa = temp_data_sa[temp_data_sa["Entity"].isin(countries_sa)]

    # Create subplots
    fig = make_subplots(
        rows=1, cols=2, 
        subplot_titles=(f"Temperatures in {region_na}", f"Temperatures in {region_sa}")
    )

    # Add data for North America
    for country in countries_na:
        country_data = region_data_na[region_data_na["Entity"] == country]
        fig.add_trace(
            go.Scatter(
                x=country_data["Year"], 
                y=country_data["Temperature"], 
                mode='lines', 
                name=f"{country} ({region_na})"
            ),
            row=1, col=1
        )

    # Add data for South America
    for country in countries_sa:
        country_data = region_data_sa[region_data_sa["Entity"] == country]
        fig.add_trace(
            go.Scatter(
                x=country_data["Year"], 
                y=country_data["Temperature"], 
                mode='lines', 
                name=f"{country} ({region_sa})"
            ),
            row=1, col=2
        )

    # Customize layout
    fig.update_layout(
        title_text="Temperature Trends: North America vs. South America",
        title_font_size=20,
        showlegend=True,
        legend_title_text="Countries",
        xaxis=dict(title="Year"),
        yaxis=dict(title="Temperature (°C)"),
        height=800,  # Set the height of the graph
        width=1600   # Set the width of the graph
    )

    # Save the graph as an image
    fig.write_image(output_file)
    print(f"The graph has been successfully saved as '{output_file}'.")

plot_temperature_by_region_large_graph(
    combined_filtered_data_na, north_america_countries, "Nordamerika",
    sueden, south_america_countries, "Südamerika",
    "temperature_large_graph.png"
)


The graph has been successfully saved as 'temperature_large_graph.png'.


In [267]:
def plot_temperature_vs_emissions(df_combined):
    """
    Creates a scatter plot showing temperature as a function of total emissions, 
    including a regression line.

    Parameters:
        df_combined (DataFrame): A DataFrame containing columns "emissions_total", 
                                 "Temperature", "Region", and "Year".
    """
    # Extract x (emissions) and y (temperature) data
    x = df_combined["emissions_total"]
    y = df_combined["Temperature"]

    # Compute the linear regression (trendline)
    coeffs = np.polyfit(x, y, deg=1)  # Linear regression coefficients
    trendline = np.polyval(coeffs, x)

    # Create scatter plot
    fig = px.scatter(
        df_combined,
        x="emissions_total",
        y="Temperature",
        color="Region",  # Group data points by region
        title="Temperature vs. CO2 Emissions",
        labels={
            "emissions_total": "CO2 Emissions (Million Tons)",
            "Temperature": "Temperature (°C)"
        },
        hover_data=["Year"]  # Display additional information on hover
    )

    # Display the plot
    fig.show()

In [268]:
    # Region hinzufügen, um die Daten zu unterscheiden
yearly_summarynorden["Region"] = "Nordamerika"
yearly_summarysouth["Region"] = "Südamerika"

df_combined = pd.concat([yearly_summarynorden, yearly_summarysouth], ignore_index=True)

plot_temperature_vs_emissions(df_combined)

In [269]:
def plot_temperature_with_trendlines(df_combined, p_values_df):
    """
    Creates an interactive Plotly chart displaying temperature data with trendlines and p-values.

    Parameters:
        df_combined (DataFrame): A DataFrame containing columns "Region", "Year", and "Temperature".
        p_values_df (DataFrame): A DataFrame with p-values and regression details for each region.
    """
    # Initialize a Plotly figure
    fig = go.Figure()

    # Iterate through each unique region to plot temperature data and trendlines
    for region in df_combined["Region"].unique():
        # Filter data for the current region
        region_data = df_combined[df_combined["Region"] == region]
        x = region_data["Year"]
        y = region_data["Temperature"]

        # Plot temperature data
        fig.add_trace(go.Scatter(
            x=x,
            y=y,
            mode="lines",
            name=f"{region} Temperature",
            line=dict(width=2),
            hovertemplate="Year: %{x}<br>Temperature: %{y:.2f}°C<extra></extra>"
        ))

        # Fit a polynomial trendline (degree 4 for better fit)
        coeffs = np.polyfit(x, y, deg=4)
        trendline = np.polyval(coeffs, x)

        # Add trendline to the chart
        fig.add_trace(go.Scatter(
            x=x,
            y=trendline,
            mode="lines",
            name=f"{region} Trendline",
            line=dict(dash="dash"),
            hovertemplate="Year: %{x}<br>Trendline: %{y:.2f}°C<extra></extra>"
        ))

        # Retrieve the p-value for the region
        p_value = p_values_df[p_values_df["Region"] == region]["P-value"].values[0]

        # Add a label for the p-value to the legend
        fig.add_trace(go.Scatter(
            x=[x.iloc[-1]],  # Position at the last year
            y=[y.iloc[-1]],  # Position at the last temperature
            mode="markers+text",
            text=[f"P-value: {p_value:.4f}"],
            textposition="top center",
            marker=dict(size=1, color="rgba(0,0,0,0)"),  # Invisible marker
            showlegend=False
        ))

    # Customize the layout
    fig.update_layout(
        title="Temperature Changes in North and South America with Trendlines",
        xaxis_title="Year",
        yaxis_title="Temperature (°C)",
        legend_title="Regions",
        template="plotly_white",
        hovermode="x unified"
    )

    # Show the chart
    fig.show()

In [270]:
def calculate_p_values(df_combined):
    """
    Calculates linear regression and p-values for each region in the dataset.

    Parameters:
        df_combined (DataFrame): A DataFrame containing columns "Region", "Year", and "emissions_total".
        
    Returns:
        DataFrame: A new DataFrame with linear regression results, including slope, intercept, R-squared, and p-value for each region.
    """
    import pandas as pd
    from scipy.stats import linregress

    # Initialize a list to store the regression results for each region
    results = []

    # Iterate over each unique region in the DataFrame
    for region in df_combined["Region"].unique():
        # Filter the data for the current region
        region_data = df_combined[df_combined["Region"] == region]
        x = region_data["Year"].values  # Independent variable
        y = region_data["emissions_total"].values  # Dependent variable

        # Perform linear regression
        slope, intercept, r_value, p_value, std_err = linregress(x, y)

        # Append the regression results for the region to the results list
        results.append({
            "Region": region,
            "Slope": slope,  # The rate of change in emissions over time
            "Intercept": intercept,  # The estimated emissions at Year=0
            "R-squared": r_value**2,  # Coefficient of determination, indicates goodness of fit
            "P-value": p_value  # Statistical significance of the slope
        })

    # Convert the results list into a DataFrame for easier analysis and visualization
    return pd.DataFrame(results)

In [271]:
# Calculate p-values for statistical significance testing
p_values_df = calculate_p_values(df_combined)

# Display the resulting p-values and regression results
print("\nP-values and regression results:")
print(p_values_df)


P-values and regression results:
        Region         Slope     Intercept  R-squared       P-value
0  Nordamerika  5.990640e+07 -1.136031e+11    0.80692  2.011357e-27
1   Südamerika  1.580106e+07 -3.077868e+10    0.96686  5.146013e-55


In [272]:
# Aufruf der Funktion
plot_temperature_with_trendlines(df_combined, p_values_df)