In [None]:
# Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from path import Path
import numpy as np
from sqlalchemy import create_engine
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Load the emissions and GDP tables from their CSV files (same order as before)
emissions_total, gdp_data = (
    pd.read_csv(csv_name)
    for csv_name in ('methane_total_emissions.csv', 'gdp_cleaned.csv')
)

In [None]:
# Preview the first rows of the emissions table
emissions_total.head()

In [None]:
# Preview the first rows of the GDP table
gdp_data.head()

In [None]:
# Left-join GDP onto the emissions rows: every emissions row is kept and
# matched on the (year, country) pair.
join_keys = ['year', 'country']
merged_data = emissions_total.merge(gdp_data, how='left', on=join_keys)
merged_data.head()

In [None]:
# Write the merged table to merged_data.csv, leaving the index out of the file
merged_data.to_csv('merged_data.csv', index=False)

In [None]:
# Show the dtype of each column in the merged table
merged_data.dtypes

In [None]:
# Report how many missing values each merged column contains
for column, null_count in merged_data.isnull().sum().items():
    print(f"column {column} has {null_count} null values")

In [None]:
# Clustering features: the merged table without the 'country' and 'sector' columns
new_df = merged_data.drop(columns=['country', 'sector'])

In [None]:
# initialize model with K=5 clusters (trial and error)
def test_cluster_amount(df, df_total, clusters):
    model = KMeans(n_clusters=clusters, random_state=5)
    model
    
    #fit the model
    model.fit(df)
    
    #add a new class column to df
    df_total['class']=model.labels_

In [None]:
# Cluster with K-Means (3 clusters) and scatter emissions against GDP per cluster.
# A later cell adds a 'class' column to new_df; drop it here (if present) so that
# re-running this cell never feeds old cluster labels into the model as a feature.
test_cluster_amount(new_df.drop(columns=['class'], errors='ignore'), merged_data, 3)
merged_data.hvplot.scatter(x='emissions', y="GDP", by='class')

In [None]:
# 3-D scatter of year, GDP and emissions, one colour/symbol per cluster.
# The cluster ids are integers, and Plotly Express puts numeric colour columns on
# a continuous colour scale; cast them to strings to get discrete colours.
plot_data = merged_data.assign(**{"class": merged_data["class"].astype(str)})
fig = px.scatter_3d(
    plot_data,
    x="year",
    y="GDP",
    z="emissions",
    color="class",
    symbol="class",
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [None]:

# Agglomerative clustering into 3 groups.
# new_df gains a 'class' column in the next cell; exclude it here (if present) so
# that re-running this cell does not cluster on previously assigned labels.
agg = AgglomerativeClustering(n_clusters=3)
model = agg.fit(new_df.drop(columns=['class'], errors='ignore'))

In [None]:
# Store the fitted model's labels in a 'class' column on both tables
# (this overwrites any 'class' column merged_data already has)
agg_labels = model.labels_
merged_data['class'] = agg_labels
new_df['class'] = agg_labels
new_df.head()

In [None]:
# Scatter emissions against GDP, grouped by cluster label
new_df.hvplot.scatter(x='emissions', y='GDP', by='class', hover_cols=['class'])

In [None]:
# 3-D scatter of year, GDP and emissions, one colour/symbol per cluster.
# The cluster ids are integers, and Plotly Express puts numeric colour columns on
# a continuous colour scale; cast them to strings to get discrete colours.
plot_data = merged_data.assign(**{"class": merged_data["class"].astype(str)})
fig = px.scatter_3d(
    plot_data,
    x="year",
    y="GDP",
    z="emissions",
    color="class",
    symbol="class",
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()