In [45]:
import pandas as pd
import plotly.express as px
import numpy as np
from scipy.stats import pearsonr, spearmanr
import plotly.graph_objects as go

import seaborn as sns
from ipywidgets import interact, Dropdown
from IPython.display import display

### Pre-processing

In [2]:
# Import data
df_c40 = pd.read_excel('./C40_GPC_Database.xlsx', sheet_name='GHG Dashboard Data - Inventory')

In [3]:
# select the interested columns
df_c40 = df_c40[['City','Country','Region','Boundary','Year_calendar','Population','Area','GDP','I.1.1','I.1.2','I.1.3','I.2.1',
                 'I.2.2','I.2.3','I.3.1','I.3.2','I.3.3','I.4.1','I.4.2','I.4.3','I.4.4','I.5.1','I.5.2','I.5.3','I.6.1',
                 'I.6.2','I.6.3','I.7.1','I.8.1','II.1.1','II.1.2','II.1.3','II.2.1','II.2.2','II.2.3','II.3.1','II.3.2',
                 'II.3.3','II.4.1','II.4.2','II.4.3','II.5.1','II.5.2','II.5.3','III.1.1','III.1.2','III.1.3','III.2.1',
                 'III.2.2','III.2.3','III.3.1','III.3.2','III.3.3','III.4.1','III.4.2','III.4.3','IV.1','IV.2','V.1','V.2',
                 'V.3','VI.1']]

In [4]:
# re-format the data
df_c40 = pd.melt(df_c40, id_vars=['City','Country','Region','Boundary','Year_calendar','Population','Area','GDP'], var_name='gpc_refno', value_name='emissions')

In [5]:
# standarize the notation keys
df_c40 = df_c40.replace({'IE (I.1.1)': 'IE', 'NO; IE': 'NO', 'NE; NE': 'NE', 'IE (II.1.1)': 'IE', 'NO ': 'NO'})

In [6]:
def is_non_numeric(value):
    try:
        float(value)
        return False  # Numeric values
    except ValueError:
        return True   # Non-numeric values

In [7]:
df = df_c40.copy()
# keeping only numeric values
df['emissions'] = pd.to_numeric(df['emissions'], errors='coerce')

In [8]:
# Filter for negative emissions
negative_values_mask = df['emissions'] < 0

negative_emissions_df = df[negative_values_mask]

negative_emissions_df['gpc_refno'].unique()

array(['V.2', 'V.3', 'VI.1'], dtype=object)

In [9]:
negative_emissions_df[negative_emissions_df['gpc_refno'] == 'VI.1']

Unnamed: 0,City,Country,Region,Boundary,Year_calendar,Population,Area,GDP,gpc_refno,emissions
17580,Los Angeles,USA,North America,Administrative boundary of a local government,2020,3898474.0,1214.0,710893266000,VI.1,-80311.522782
17581,Los Angeles,USA,North America,Administrative boundary of a local government,2019,3979576.0,1214.0,659328437000,VI.1,-117026.349625
17582,Los Angeles,USA,North America,Administrative boundary of a local government,2018,3990456.0,1214.0,703781312000,VI.1,-34941.157948


In [10]:
# drop negative values and NAN values
clean_df = df[~negative_values_mask]
clean_df = clean_df.dropna(subset=['emissions'])

# replace a non-numeric value in the GDP column
clean_df['GDP'] = clean_df['GDP'].replace({'$763,955,224,976': 763955224976})

  clean_df['GDP'] = clean_df['GDP'].replace({'$763,955,224,976': 763955224976})


In [11]:
# deleting values = 0 in the emissions column
clean_df = clean_df[clean_df['emissions'] != 0]

### Notation Keys

In [12]:
len(df_c40)

17766

In [13]:
# Filter for non-numeric emissions and calculate value counts
non_numeric_counts = df_c40[df_c40['emissions'].apply(is_non_numeric)]['emissions'].value_counts()

non_numeric_counts_df = non_numeric_counts.reset_index()
non_numeric_counts_df.columns = ['Notation Key', 'Count'] 

fig = px.bar(non_numeric_counts_df, 
             x='Notation Key', 
             y='Count', 
             labels={'Notation Key': 'Notation Key', 'Count': 'Count'},
             template="plotly_white")

fig.show()

### Test: Stationary Energy

In [27]:
se_df = clean_df[clean_df['gpc_refno'].str.startswith('I.')]

In [28]:
# Analize the distribution of the emissions
fig = px.histogram(se_df, 
                   x='emissions', 
                   nbins=200, 
                   labels={'emissions': 'Emissions'}, 
                   template="plotly_white")

# Customize the layout
fig.update_layout(
    xaxis_title="Emissions",
    yaxis_title="Frequency",
    bargap=0.1  # Adjust the gap between bars
)

# Show the figure
fig.show()

Since emissions are log-normally distributed, apply a log transformation to stabilize variance and normalize the data

In [29]:
se_df['Log_Emissions'] = np.log(se_df['emissions'] + 1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [30]:
fig = px.histogram(se_df, 
                   x='Log_Emissions', 
                   nbins=200, 
                   labels={'emissions': 'Emissions'}, 
                   template="plotly_white")

# Customize the layout
fig.update_layout(
    xaxis_title="Emissions",
    yaxis_title="Frequency",
    bargap=0.1  # Adjust the gap between bars
)

# Show the figure
fig.show()

**Correlation Coefficients and Distribution of Emissions**

Use of correlation coefficients to quantify the relationships between `emissions` and `other variables` (such as Population, GDP, and Area)

1. **Pearson Correlation Coefficient**: Measures the linear relationship between two continuous variables.
- Log Transformation is needed to stabilize variance and approximate normality
- Benefits: highly effective if the relationship is genuinely linear after transformation
2. **Spearman Rank Correlation Coefficient**: non-parametric measure that assesses the monotonic relationship between two variables by converting them to ranks
- No specific distribution required
- The relationship between variables is monotonic (either consistently increasing or decreasing), but not necessarily linear
- Less sensitive to outliers and non-linear relationships
- Captures any monotonic relationship, whether linear or not

In [21]:
# Pearson Correlation on Log-Transformed Data
pearson_corr = se_df[['Log_Emissions', 'Population', 'GDP', 'Area']].corr(method='pearson')
spearman_corr = se_df[['emissions', 'Population', 'GDP', 'Area']].corr(method='spearman')

In [22]:
pearson_corr

Unnamed: 0,Log_Emissions,Population,GDP,Area
Log_Emissions,1.0,0.139898,0.111957,0.064024
Population,0.139898,1.0,0.344252,0.294073
GDP,0.111957,0.344252,1.0,0.042203
Area,0.064024,0.294073,0.042203,1.0


In [23]:
spearman_corr

Unnamed: 0,emissions,Population,GDP,Area
emissions,1.0,0.234263,0.157223,0.154105
Population,0.234263,1.0,0.325103,0.680165
GDP,0.157223,0.325103,1.0,0.198743
Area,0.154105,0.680165,0.198743,1.0


## Correlations

### Preparing the data

In [51]:
variables = ['Population', 'GDP', 'Area']

In [52]:
clean_df['Log_Emissions'] = np.log(clean_df['emissions'] + 1)

In [57]:
correlation_results = []

grouped = clean_df.groupby('gpc_refno')

for subsector, group in grouped:
    # check if the group has enough data points
    if len(group) < 10:
        # skip subsectors with insufficient data
        continue
    
    for var in variables:
        # Pearson Correlation on Log-Transformed Emissions
        pearson_corr, pearson_p = pearsonr(group['Log_Emissions'], group[var])
        
        # Spearman Correlation on Raw Emissions
        spearman_corr, spearman_p = spearmanr(group['emissions'], group[var])
        
        correlation_results.append({
            'Subsector': subsector,
            'Variable': var,
            'Pearson_Correlation': pearson_corr,
            'Pearson_p-value': pearson_p,
            'Spearman_Correlation': spearman_corr,
            'Spearman_p-value': spearman_p
        })

corr_df = pd.DataFrame(correlation_results)

In [58]:
def categorize_correlation(r):
    """
    Categorize the correlation coefficient based on absolute value.
    """
    abs_r = abs(r)
    if 0.00 <= abs_r < 0.10:
        return "Negligible or No Correlation"
    elif 0.10 <= abs_r < 0.30:
        return "Weak Correlation"
    elif 0.30 <= abs_r < 0.50:
        return "Moderate Correlation"
    elif 0.50 <= abs_r < 0.70:
        return "Strong Correlation"
    elif 0.70 <= abs_r < 0.90:
        return "Very Strong Correlation"
    elif 0.90 <= abs_r <= 1.00:
        return "Almost Perfect Correlation"
    else:
        return "Undefined"

# Apply the function to Pearson and Spearman correlations
corr_df['Pearson_Label'] = corr_df['Pearson_Correlation'].apply(categorize_correlation)
corr_df['Spearman_Label'] = corr_df['Spearman_Correlation'].apply(categorize_correlation)

In [60]:
df_long = corr_df.melt(
    id_vars=['Subsector', 'Variable'],
    value_vars=['Pearson_Label', 'Spearman_Label'],
    var_name='Correlation_Type',
    value_name='Correlation_Strength'
)

# Clean up the Correlation_Type column
df_long['Correlation_Type'] = df_long['Correlation_Type'].str.replace('_Label', '')

### Analysis and Plots

In [63]:
grouped = clean_df.groupby('gpc_refno')

unique_subsectors = sorted(clean_df['gpc_refno'].unique())

# Create the subsector dropdown widget
subsector_dropdown = Dropdown(
    options=unique_subsectors,
    value=unique_subsectors[0],
    description='Subsector:',
    disabled=False,
)

# Create the variable dropdown widget
variable_dropdown = Dropdown(
    options=variables,
    value=variables[0],
    description='Variable:',
    disabled=False,
)

def plot_subsector_variable(subsector, variable):
    # Retrieve the data for the selected subsector
    group = grouped.get_group(subsector)
    
    plt.figure(figsize=(10, 6))
    
    # Scatter plot of the selected variable vs Log_Emissions
    sns.scatterplot(data=group, x=variable, y='Log_Emissions', alpha=0.5)
    
    # Regression line
    sns.regplot(data=group, x=variable, y='Log_Emissions', scatter=False, color='red')
    
    plt.title(f'Log_Emissions vs {variable} for Subsector {subsector}')
    plt.xlabel(variable)
    plt.ylabel('Log(Emissions)')
    plt.show()

# Use interact to link both dropdowns to the plotting function
interact(plot_subsector_variable, subsector=subsector_dropdown, variable=variable_dropdown)

interactive(children=(Dropdown(description='Subsector:', options=('I.1.1', 'I.1.2', 'I.1.3', 'I.2.1', 'I.2.2',…

<function __main__.plot_subsector_variable(subsector, variable)>

**Purpose of Correlation Analysis**

To determine how emissions in each subsector relate to key variables (Population, GDP, Area), which informs how to normalize emissions for benchmarking.

**In the Context of Benchmarking Objectives**
- Tailored Benchmarks: By aligning benchmarks with variables that strongly influence emissions, you ensure that comparisons are meaningful and account for key determinants
- Avoiding Misleading Comparisons: Using weakly correlated variables could result in benchmarks that `don't accurately reflect` the underlying dynamics, leading to ineffective assessments

**High Correlation**: 
- When a subsector shows a high correlation with a specific variable (e.g., Population), it indicates a strong relationship between that variable and emissions in that subsector.
- *Implication for Benchmarking*: A high correlation suggests that the variable can be a reliable factor to normalize emissions, facilitating meaningful comparisons across different cities or regions.

**Low Correlation**:
- When a subsector exhibits low correlation with the variables considered (e.g., GDP, Area), it implies that these factors do not strongly influence emissions in that subsector.
- *Implication for Benchmarking*: 
    - Low correlation indicates that normalizing emissions based on these variables may not yield accurate or meaningful benchmarks. It also suggests that other unexamined factors may be driving emissions in these subsectors.
    - Recognizing that certain subsectors do not correlate strongly with the primary variables (Population, GDP, Area) highlights the need for alternative factors or more nuanced models for accurate benchmarking.

In [54]:
fig = go.Figure()

for var in corr_df['Variable'].unique():
    # Filter data by variable
    df_var = corr_df[corr_df['Variable'] == var]
    
    # Add trace for Pearson Correlation 
    fig.add_trace(go.Scatter(
        x=df_var['Subsector'],
        y=df_var['Pearson_Correlation'],
        mode='markers',
        name=f'{var} (Pearson)',
        marker=dict(symbol='circle'),
        visible=True  
    ))
    
    # Add trace for Spearman Correlation
    fig.add_trace(go.Scatter(
        x=df_var['Subsector'],
        y=df_var['Spearman_Correlation'],
        mode='markers',
        name=f'{var} (Spearman)',
        marker=dict(symbol='circle'),
        visible=False  
    ))

fig.update_layout(
    updatemenus=[
        dict(
            buttons=list([
                dict(
                    args=[{'visible': [True if 'Pearson' in trace.name else False for trace in fig.data]}],
                    label="Pearson Correlation",
                    method="restyle"
                ),
                dict(
                    args=[{'visible': [True if 'Spearman' in trace.name else False for trace in fig.data]}],
                    label="Spearman Correlation",
                    method="restyle"
                )
            ]),
            direction="down",
            showactive=True,
            x=0.17,
            y=1.15,
            xanchor="left",
            yanchor="top"
        )
    ]
)

fig.update_layout(
    title="Correlation Plot by Subsector and Variable",
    xaxis_title="Subsector (GPC Reference)",
    yaxis_title="Correlation Value",
    legend_title="Variable",
    template="plotly_white"
)

fig.show()

In [62]:
color_map = {
    "Negligible or No Correlation": "#d3d3d3",    # Light Grey
    "Weak Correlation": "#ffcc00",                # Yellow
    "Moderate Correlation": "#ff9900",            # Orange
    "Strong Correlation": "#ff6600",              # Dark Orange
    "Very Strong Correlation": "#cc0000",         # Red
    "Almost Perfect Correlation": "#660000"       # Dark Red
}

# Assign colors
df_long['Color'] = df_long['Correlation_Strength'].map(color_map)

fig = px.bar(
    df_long,
    x='Subsector',
    y='Correlation_Strength',
    color='Correlation_Strength',
    facet_row='Correlation_Type',
    category_orders={'Correlation_Strength': [
        "Negligible or No Correlation",
        "Weak Correlation",
        "Moderate Correlation",
        "Strong Correlation",
        "Very Strong Correlation",
        "Almost Perfect Correlation"
    ]},
    labels={
        'Correlation_Strength': 'Correlation Strength',
        'Subsector': 'Subsector',
        'Correlation_Type': 'Correlation Type'
    },
    title='Correlation Strengths by Subsector and Variable',
    hover_data=['Variable']
)

fig.update_traces(marker=dict(color=df_long['Color']))

fig.update_layout(
    showlegend=False,
    height=800,
    bargap=0.2,
    bargroupgap=0.1
)

fig.show()