In [21]:
# Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from path import Path
import numpy as np
from sqlalchemy import create_engine
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering

In [22]:
# Load the two input tables from CSV files in the current working directory:
# methane emissions and GDP.
emissions_total = pd.read_csv('methane_total_emissions.csv')
gdp_data = pd.read_csv('gdp_cleaned.csv')

In [23]:
# Preview the first rows of the emissions table.
emissions_total.head()

Unnamed: 0,year,country,sector,emissions
0,2018,Afghanistan,Total including LUCF,81.51
1,2018,Albania,Total including LUCF,3.16
2,2018,Algeria,Total including LUCF,49.55
3,2018,Andorra,Total including LUCF,0.05
4,2018,Angola,Total including LUCF,39.96


In [24]:
# Preview the first rows of the GDP table.
gdp_data.head()

Unnamed: 0,year,country,GDP
0,2018,Afghanistan,18053230000.0
1,2018,Angola,101000000000.0
2,2018,Albania,15156430000.0
3,2018,Andorra,3218316000.0
4,2018,United Arab Emirates,422000000000.0


In [25]:
# Left-join GDP onto every emissions row, matching on the (year, country) pair.
join_keys = ['year', 'country']
merged_data = emissions_total.merge(gdp_data, on=join_keys, how='left')
merged_data.head()

Unnamed: 0,year,country,sector,emissions,GDP
0,2018,Afghanistan,Total including LUCF,81.51,18053230000.0
1,2018,Albania,Total including LUCF,3.16,15156430000.0
2,2018,Algeria,Total including LUCF,49.55,175000000000.0
3,2018,Andorra,Total including LUCF,0.05,3218316000.0
4,2018,Angola,Total including LUCF,39.96,101000000000.0


In [26]:
# Persist the merged table to 'merged_data.csv' without writing the row index.
merged_data.to_csv('merged_data.csv', index=False)

In [27]:
# Inspect column dtypes to see which columns are non-numeric.
merged_data.dtypes

year           int64
country       object
sector        object
emissions    float64
GDP          float64
dtype: object

The `country` and `sector` columns have `object` (string) dtype and need to be converted to numerical values before they can be used for clustering.

In [28]:
# Report the number of missing values in each column
null_counts = merged_data.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"column {column} has {n_missing} null values")

column year has 0 null values
column country has 0 null values
column sector has 0 null values
column emissions has 0 null values
column GDP has 0 null values


Do `country` and `sector` offer any insight? `sector` probably does, but does `country`?
Could we do the analysis with only the total rows, or should the total rows be dropped?

In [29]:
# Build the clustering feature frame by removing the non-numeric columns.
# Later cells add a 'class' column to merged_data; drop it here too (when it
# exists) so that re-running this cell never feeds old cluster labels back in
# as a feature.
new_df = (
    merged_data
    .drop(['country', 'sector'], axis=1)
    .drop(columns='class', errors='ignore')
)

In [30]:
# initialize model with K=5 clusters (trial and error)
def test_cluster_amount(df, df_total, clusters):
    model = KMeans(n_clusters=clusters, random_state=5)
    model
    
    #fit the model
    model.fit(df)
    
    #add a new class column to df
    df_total['class']=model.labels_

In [31]:
# Cluster with K-means (6 clusters) and plot GDP against emissions, split by cluster.
# new_df gains a 'class' column in a later cell; strip it (when present) so that
# re-running this cell never uses previous cluster labels as a feature.
kmeans_features = new_df.drop(columns='class', errors='ignore')
test_cluster_amount(kmeans_features, merged_data, 6)
merged_data.hvplot.scatter(x='emissions', y="GDP", by='class')

In [32]:
# 3D scatter of year / GDP / emissions, coloured by the current 'class' labels.
# The labels are cast to strings for plotting only: Plotly Express maps a
# numeric colour column to a continuous colour bar, whereas cluster ids are
# categories and should get one discrete colour each. merged_data is not modified.
fig = px.scatter_3d(
    merged_data.assign(**{"class": merged_data["class"].astype(str)}),
    x="year",
    y="GDP",
    z="emissions",
    color="class",
    symbol="class",
    title="K-means clusters: year vs GDP vs emissions",
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [33]:
# Agglomerative (hierarchical) clustering with 5 clusters on the feature frame.
# A following cell writes the labels back into new_df['class']; drop that column
# (when present) before fitting so that re-running this cell does not cluster
# on its own previous output.
agg = AgglomerativeClustering(n_clusters=5)
model = agg.fit(new_df.drop(columns='class', errors='ignore'))

In [37]:
# Store the labels of the most recently fitted `model` in the 'class' column
# of both the feature frame and the full merged frame.
cluster_labels = model.labels_
for frame in (new_df, merged_data):
    frame['class'] = cluster_labels
new_df.head()

Unnamed: 0,year,emissions,GDP,class
0,2018,81.51,18053230000.0,2
1,2018,3.16,15156430000.0,2
2,2018,49.55,175000000000.0,2
3,2018,0.05,3218316000.0,2
4,2018,39.96,101000000000.0,2


In [36]:
# Scatter GDP against emissions, one group per cluster label,
# with the label also shown on hover.
scatter_options = dict(
    x='emissions',
    y='GDP',
    hover_cols=['class'],
    by='class',
)
new_df.hvplot.scatter(**scatter_options)

In [39]:
# 3D scatter of year / GDP / emissions, coloured by the current 'class' labels.
# The labels are cast to strings for plotting only: Plotly Express maps a
# numeric colour column to a continuous colour bar, whereas cluster ids are
# categories and should get one discrete colour each. merged_data is not modified.
fig = px.scatter_3d(
    merged_data.assign(**{"class": merged_data["class"].astype(str)}),
    x="year",
    y="GDP",
    z="emissions",
    color="class",
    symbol="class",
    title="Agglomerative clusters: year vs GDP vs emissions",
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()