In [122]:
# Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sqlalchemy.ext.automap import automap_base
from sqlalchemy import create_engine, func
import plotly.express as px
from sklearn.cluster import KMeans
import hvplot.pandas


In [123]:
# Build the PostgreSQL connection URL from credentials kept outside the notebook.
from urllib.parse import quote

from config import db_password
from config import user_name

rds_endpoint = 'methanedb.cri33yx04get.us-east-1.rds.amazonaws.com'
port = '5432'
DB_name = 'methaneAnalysis'

# Percent-encode the credentials: characters such as '@', ':' or '/' inside a
# username or password would otherwise be read as URL delimiters and corrupt
# the connection string.
# NOTE(review): assumes config stores the raw (not already URL-encoded) values.
url = (
    f"postgresql://{quote(user_name, safe='')}:{quote(db_password, safe='')}"
    f"@{rds_endpoint}:{port}/{DB_name}"
)

In [124]:
# Create the SQLAlchemy engine from the connection URL, which has the form
# postgresql://{user}:{password}@{rds endpoint}:{port}/{DB name}
engine = create_engine(url)

In [125]:
# Reflect the existing database tables into automapped classes.
# NOTE(review): `reflect=True` is deprecated in newer SQLAlchemy releases in favour of
# `Base.prepare(autoload_with=engine)` — confirm against the installed version.
# NOTE(review): `Base` is not referenced again in the visible cells, which query
# the database with raw SQL through pandas instead.
Base = automap_base()
Base.prepare(engine, reflect=True)

In [126]:
# Load both tables from the database into DataFrames with pandas.
merged_query = 'SELECT * FROM merged_data'
sector_total_query = 'SELECT * FROM sector_total'

# merged table
merged_df = pd.read_sql(merged_query, con=engine)

# sector specific table
sector_total_df = pd.read_sql(sector_total_query, con=engine)

In [127]:
# Preview the first rows of the merged table.
merged_df.head()

Unnamed: 0,year,country,sector,emissions,gdp
0,2018,Afghanistan,Agriculture,10.45,18.053229
1,2018,Afghanistan,Energy,67.91,18.053229
2,2018,Afghanistan,Fugitive Emissions,2.22,18.053229
3,2018,Afghanistan,Industrial Processes,0.0,18.053229
4,2018,Afghanistan,Land-Use Change and Forestry,0.0,18.053229


In [128]:
# Preview the first rows of the sector totals table.
sector_total_df.head()

Unnamed: 0,year,country,sector,emissions,gdp
0,2018,Afghanistan,Total including LUCF,81.51,18.053229
1,2018,Albania,Total including LUCF,3.16,15.156432
2,2018,Algeria,Total including LUCF,49.55,174.910879
3,2018,Andorra,Total including LUCF,0.05,3.218316
4,2018,Angola,Total including LUCF,39.96,101.353231


In [129]:
# Remove the rows whose country is 'World'.
# `.copy()` makes each result an independent DataFrame: later cells add columns
# to these frames, and assigning into a filtered slice triggers pandas'
# SettingWithCopyWarning.
merged_df = merged_df[merged_df['country'] != 'World'].copy()
sector_total_df = sector_total_df[sector_total_df['country'] != 'World'].copy()

In [130]:
# One-hot encode the 'country' and 'sector' columns of the sector totals table.
categorical_cols = ['country', 'sector']
sector_dummies = pd.get_dummies(data=sector_total_df, columns=categorical_cols)
sector_dummies.head()

Unnamed: 0,year,emissions,gdp,country_Afghanistan,country_Albania,country_Algeria,country_Andorra,country_Angola,country_Antigua and Barbuda,country_Argentina,...,country_United States,country_Uruguay,country_Uzbekistan,country_Vanuatu,country_Venezuela,country_Vietnam,country_Yemen,country_Zambia,country_Zimbabwe,sector_Total including LUCF
0,2018,81.51,18.053229,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2018,3.16,15.156432,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2018,49.55,174.910879,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2018,0.05,3.218316,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2018,39.96,101.353231,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


# ML Model for Merged Dataframe

In [131]:
# Remove the 'country' and 'sector' columns; the remaining columns are the
# features passed to KMeans.
merged_new = merged_df.drop(columns=['country', 'sector'])
merged_new.head()

Unnamed: 0,year,emissions,gdp
0,2018,10.45,18.053229
1,2018,67.91,18.053229
2,2018,2.22,18.053229
3,2018,0.0,18.053229
4,2018,0.0,18.053229


Elbow Curve

In [132]:
# Holds the KMeans inertia recorded for each candidate number of clusters.
inertia = []
# Candidate cluster counts to try: 1 through 10.
k = list(range(1,11))

In [133]:
# Looking for the best K: fit KMeans once per candidate cluster count and
# record its inertia.
# The list is rebuilt from scratch here (rather than appended to) so that
# re-running this cell cannot leave `inertia` longer than `k`, which would make
# the DataFrame construction for the elbow plot fail on mismatched lengths.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(merged_new).inertia_
    for n_clusters in k
]

In [134]:
# Tabulate inertia against k and plot the Elbow Curve using hvPlot.
# Only the last expression of a cell is displayed, so the plot is the sole
# trailing expression here.
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [135]:
# Initialize the model with K = 2 clusters and a fixed random_state
model = KMeans(n_clusters=2, random_state=5)
model

KMeans(n_clusters=2, random_state=5)

In [136]:
# Fit the model on merged_new (the merged table without 'country' and 'sector')
model.fit(merged_new)

KMeans(n_clusters=2, random_state=5)

In [137]:
# Get the predicted cluster index for each row of merged_new
predictions = model.predict(merged_new)
print(predictions)

[0 0 0 ... 0 0 0]


In [138]:
# Add the fitted cluster label of each row as a new "class" column.
# The labels line up with merged_df because merged_new was derived from it by
# dropping columns only, so both frames have the same rows in the same order.
merged_df["class"] = model.labels_
merged_df.head()

Unnamed: 0,year,country,sector,emissions,gdp,class
0,2018,Afghanistan,Agriculture,10.45,18.053229,0
1,2018,Afghanistan,Energy,67.91,18.053229,0
2,2018,Afghanistan,Fugitive Emissions,2.22,18.053229,0
3,2018,Afghanistan,Industrial Processes,0.0,18.053229,0
4,2018,Afghanistan,Land-Use Change and Forestry,0.0,18.053229,0


In [139]:
# Plot the clusters with gdp on the x-axis and emissions on the y-axis, the same
# orientation used by the other cluster scatter plots in this notebook, so the
# figures can be compared directly.
merged_df.hvplot.scatter(x='gdp', y='emissions', by='class')


In [140]:
# 3D view of the clusters: country, gdp and emissions on the axes, colored by
# cluster, one marker symbol per country, marker size taken from gdp.
scatter_options = {
    'x': 'country',
    'y': 'gdp',
    'z': 'emissions',
    'color': 'class',
    'symbol': 'country',
    'size': 'gdp',
    'width': 800,
}
fig = px.scatter_3d(merged_df, **scatter_options)
# Position the legend at x=0, y=1.
fig.update_layout(legend={'x': 0, 'y': 1})
fig.show()

Separate United States, China, and European Union due to being extreme outliers

In [141]:
# Drop the class column to start fresh before re-clustering.
# errors='ignore' keeps the cell re-runnable: without it, a second run raises
# KeyError because 'class' has already been removed.
merged_df = merged_df.drop(columns='class', errors='ignore')
merged_df.head()

Unnamed: 0,year,country,sector,emissions,gdp
0,2018,Afghanistan,Agriculture,10.45,18.053229
1,2018,Afghanistan,Energy,67.91,18.053229
2,2018,Afghanistan,Fugitive Emissions,2.22,18.053229
3,2018,Afghanistan,Industrial Processes,0.0,18.053229
4,2018,Afghanistan,Land-Use Change and Forestry,0.0,18.053229


In [142]:
# Exclude the three outlier entries in a single pass. `.copy()` yields an
# independent DataFrame, so adding the "class" column to data_new later does not
# trigger SettingWithCopyWarning on a filtered slice.
outlier_countries = ['United States', 'China', 'European Union']
data_new = merged_df[~merged_df['country'].isin(outlier_countries)].copy()

# Clustering features: the same rows without the 'country' and 'sector' columns.
data_small = data_new.drop(['country', 'sector'], axis=1)

data_small.head()

Unnamed: 0,year,emissions,gdp
0,2018,10.45,18.053229
1,2018,67.91,18.053229
2,2018,2.22,18.053229
3,2018,0.0,18.053229
4,2018,0.0,18.053229


In [143]:
# Elbow analysis on data_small: KMeans inertia for each k from 1 to 10.
k = list(range(1, 11))
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(data_small).inertia_
    for n_clusters in k
]

# Tabulate k against inertia and draw the Elbow Curve with hvPlot.
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [144]:
# Initialize the model with K = 2 clusters and a fixed random_state
model = KMeans(n_clusters=2, random_state=5)
model

KMeans(n_clusters=2, random_state=5)

In [145]:
# Fit the model on data_small (the outlier-filtered features)
model.fit(data_small)

KMeans(n_clusters=2, random_state=5)

In [146]:
# Add the fitted cluster label of each row as a new "class" column.
# data_small was derived from data_new by dropping columns only, so the labels
# are in the same row order as data_new.
data_new["class"] = model.labels_
data_new.head()

Unnamed: 0,year,country,sector,emissions,gdp,class
0,2018,Afghanistan,Agriculture,10.45,18.053229,0
1,2018,Afghanistan,Energy,67.91,18.053229,0
2,2018,Afghanistan,Fugitive Emissions,2.22,18.053229,0
3,2018,Afghanistan,Industrial Processes,0.0,18.053229,0
4,2018,Afghanistan,Land-Use Change and Forestry,0.0,18.053229,0


In [147]:
# Plot the clusters: gdp on the x-axis, emissions on the y-axis, one series per class
data_new.hvplot.scatter(x='gdp', y='emissions', by='class')

In [148]:
# 3D view of the clusters: country, gdp and emissions on the axes, colored by
# cluster, one marker symbol per year, marker size taken from gdp.
scatter_options = {
    'x': 'country',
    'y': 'gdp',
    'z': 'emissions',
    'color': 'class',
    'symbol': 'year',
    'size': 'gdp',
    'width': 800,
}
fig = px.scatter_3d(data_new, **scatter_options)
# Position the legend at x=0, y=1.
fig.update_layout(legend={'x': 0, 'y': 1})
fig.show()

Next steps: consider separating out the observations with emissions over 200, and add filters to the graphs (by year, sector, etc.) to make them easier to explore.

# ML Model for Sector Dataframe

In [149]:
# Remove the 'country' and 'sector' columns; the remaining columns are the
# features passed to KMeans.
sector_new = sector_total_df.drop(columns=['country', 'sector'])
sector_new.head()

Unnamed: 0,year,emissions,gdp
0,2018,81.51,18.053229
1,2018,3.16,15.156432
2,2018,49.55,174.910879
3,2018,0.05,3.218316
4,2018,39.96,101.353231


In [150]:
# Elbow analysis on sector_new: KMeans inertia for each k from 1 to 10.
k = list(range(1, 11))
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(sector_new).inertia_
    for n_clusters in k
]

# Tabulate k against inertia and draw the Elbow Curve with hvPlot.
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [151]:
# Initialize the model with K = 2 clusters and a fixed random_state
model = KMeans(n_clusters=2, random_state=5)
model

KMeans(n_clusters=2, random_state=5)

In [152]:
# Fit the model on sector_new (the sector totals without 'country' and 'sector')
model.fit(sector_new)

KMeans(n_clusters=2, random_state=5)

In [153]:
# Add the fitted cluster label of each row as a new "class" column.
# sector_new was derived from sector_total_df by dropping columns only, so the
# labels are in the same row order as sector_total_df.
sector_total_df["class"] = model.labels_
sector_total_df.head()

Unnamed: 0,year,country,sector,emissions,gdp,class
0,2018,Afghanistan,Total including LUCF,81.51,18.053229,0
1,2018,Albania,Total including LUCF,3.16,15.156432,0
2,2018,Algeria,Total including LUCF,49.55,174.910879,0
3,2018,Andorra,Total including LUCF,0.05,3.218316,0
4,2018,Angola,Total including LUCF,39.96,101.353231,0


In [154]:
# Rescale 'year' in place: subtract 1999 and multiply by 1.5 (e.g. 2018 -> 28.5).
# The scatter plot cell that follows passes this column as the marker size (s='year').
# NOTE(review): this overwrites the real year and is not idempotent — re-running the
# cell rescales the already-rescaled values, so run it exactly once per kernel session.
# NOTE(review): the constants 1999 and 1.5 are unexplained; presumably chosen to give
# usable marker sizes — confirm.
sector_total_df['year'] = (sector_total_df['year'] - 1999) * 1.5
sector_total_df.head()

Unnamed: 0,year,country,sector,emissions,gdp,class
0,28.5,Afghanistan,Total including LUCF,81.51,18.053229,0
1,28.5,Albania,Total including LUCF,3.16,15.156432,0
2,28.5,Algeria,Total including LUCF,49.55,174.910879,0
3,28.5,Andorra,Total including LUCF,0.05,3.218316,0
4,28.5,Angola,Total including LUCF,39.96,101.353231,0


In [155]:
# Plot the clusters: gdp vs emissions, one series per class, marker size taken from
# the 'year' column, with year/country/gdp/emissions shown on hover
sector_total_df.hvplot.scatter(x='gdp', y='emissions', by='class', s='year', hover_cols= ['year', 'country', 'gdp', 'emissions'])

In [157]:
# 3D view of the clusters: year, gdp and emissions on the axes, with both color
# and marker symbol set by cluster and marker size taken from gdp.
scatter_options = {
    'x': 'year',
    'y': 'gdp',
    'z': 'emissions',
    'color': 'class',
    'symbol': 'class',
    'size': 'gdp',
    'width': 800,
}
fig = px.scatter_3d(sector_total_df, **scatter_options)
# Position the legend at x=0, y=1.
fig.update_layout(legend={'x': 0, 'y': 1})
fig.show()

In [268]:
# Derive the emissions-to-gdp ratio ("combo") and discard the earlier cluster labels.
# `assign` returns a new DataFrame, so sector_total_df is left untouched. A plain
# `merge_cols = sector_total_df` only aliases the same object, and adding 'combo'
# through that alias would silently add the column to sector_total_df as well.
merge_cols = (
    sector_total_df
    .assign(combo=sector_total_df['emissions'] / sector_total_df['gdp'])
    .drop(columns='class')
)
merge_cols.head()

Unnamed: 0,year,country,sector,emissions,gdp,combo
0,28.5,Afghanistan,Total including LUCF,81.51,18.053229,4.514982
1,28.5,Albania,Total including LUCF,3.16,15.156432,0.208492
2,28.5,Algeria,Total including LUCF,49.55,174.910879,0.283287
3,28.5,Andorra,Total including LUCF,0.05,3.218316,0.015536
4,28.5,Angola,Total including LUCF,39.96,101.353231,0.394265


In [269]:
# Keep only 'country' and 'combo' as inputs for the next clustering run.
dropped_cols = ['sector', 'emissions', 'gdp', 'year']
merge = merge_cols.drop(columns=dropped_cols)
merge.head()

Unnamed: 0,country,combo
0,Afghanistan,4.514982
1,Albania,0.208492
2,Algeria,0.283287
3,Andorra,0.015536
4,Angola,0.394265


In [270]:
# Cast 'country' to a categorical dtype, then one-hot encode it so every column
# passed to KMeans is numeric.
merge = pd.get_dummies(
    merge.astype({'country': 'category'}),
    columns=['country'],
)

merge.head()

Unnamed: 0,combo,country_Afghanistan,country_Albania,country_Algeria,country_Andorra,country_Angola,country_Antigua and Barbuda,country_Argentina,country_Armenia,country_Australia,...,country_United Kingdom,country_United States,country_Uruguay,country_Uzbekistan,country_Vanuatu,country_Venezuela,country_Vietnam,country_Yemen,country_Zambia,country_Zimbabwe
0,4.514982,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.208492,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.283287,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.015536,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.394265,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [271]:
# Elbow analysis on the one-hot encoded frame: KMeans inertia for each k from 1 to 10.
k = list(range(1, 11))
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(merge).inertia_
    for n_clusters in k
]

# Tabulate k against inertia and draw the Elbow Curve with hvPlot.
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [272]:
# Initialize the model with K = 3 clusters and a fixed random_state
model = KMeans(n_clusters=3, random_state=5)
model

KMeans(n_clusters=3, random_state=5)

In [273]:
# Fit the model on merge ('combo' plus the one-hot encoded country columns)
model.fit(merge)

KMeans(n_clusters=3, random_state=5)

In [274]:
# Add the fitted cluster label of each row as a new "class" column.
# merge was derived from merge_cols without removing or reordering rows, so the
# labels are in the same row order as merge_cols.
merge_cols["class"] = model.labels_
merge_cols.head()

Unnamed: 0,year,country,sector,emissions,gdp,combo,class
0,28.5,Afghanistan,Total including LUCF,81.51,18.053229,4.514982,2
1,28.5,Albania,Total including LUCF,3.16,15.156432,0.208492,0
2,28.5,Algeria,Total including LUCF,49.55,174.910879,0.283287,0
3,28.5,Andorra,Total including LUCF,0.05,3.218316,0.015536,0
4,28.5,Angola,Total including LUCF,39.96,101.353231,0.394265,0


In [275]:
# Plot the clusters: gdp vs emissions, one series per class, marker size taken from
# the 'year' column, with year/country/gdp/emissions shown on hover
merge_cols.hvplot.scatter(x='gdp', y='emissions', by='class', s='year', hover_cols= ['year', 'country', 'gdp', 'emissions'])
# Disabled HTML export; NOTE(review): `plot` is not defined in the visible cells.
# hvplot.save(plot, 'plot.html')

In [276]:
# Pairwise correlations between the numeric columns only. Selecting them
# explicitly keeps the string columns ('country', 'sector') out of corr(), which
# raises on non-numeric data in pandas >= 2.0 instead of silently skipping it.
merge_cols.select_dtypes(include='number').corr()

Unnamed: 0,year,emissions,gdp,combo,class
year,1.0,0.016893,0.06043,-0.178598,-0.233329
emissions,0.016893,1.0,0.615639,-0.026095,-0.031268
gdp,0.06043,0.615639,1.0,-0.094221,-0.078648
combo,-0.178598,-0.026095,-0.094221,1.0,0.576662
class,-0.233329,-0.031268,-0.078648,0.576662,1.0


In [228]:
# 3D view of the clusters: year, gdp and emissions on the axes, with both color
# and marker symbol set by cluster and marker size taken from gdp.
scatter_options = {
    'x': 'year',
    'y': 'gdp',
    'z': 'emissions',
    'color': 'class',
    'symbol': 'class',
    'size': 'gdp',
    'width': 800,
}
fig = px.scatter_3d(merge_cols, **scatter_options)
# Position the legend at x=0, y=1.
fig.update_layout(legend={'x': 0, 'y': 1})
fig.show()

In [236]:
# Show the rows assigned to cluster 1
merge_cols.loc[merge_cols['class'] == 1]

Unnamed: 0,year,country,sector,emissions,gdp,combo,class
0,28.5,Afghanistan,Total including LUCF,81.51,18.053229,4.514982,1
1,28.5,Albania,Total including LUCF,3.16,15.156432,0.208492,1
2,28.5,Algeria,Total including LUCF,49.55,174.910879,0.283287,1
3,28.5,Andorra,Total including LUCF,0.05,3.218316,0.015536,1
4,28.5,Angola,Total including LUCF,39.96,101.353231,0.394265,1
...,...,...,...,...,...,...,...
3406,1.5,Finland,Total including LUCF,6.76,125.706652,0.053776,1
3407,1.5,France,Total including LUCF,75.38,1362.248940,0.055335,1
3408,1.5,Gabon,Total including LUCF,0.89,5.080483,0.175180,1
3409,1.5,Gambia,Total including LUCF,1.12,0.782915,1.430550,1


In [None]:
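# TODO: repeat the clustering on all individual sectors (excluding the total)
# TODO: standardize the features with a StandardScaler before running KMeans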