# Clustering Data

### Loading in data:

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from typing import Set, Tuple, List

In [2]:
def obtain_country_and_policies(df : pd.DataFrame) -> Tuple[Set, Set]:
    """Collect the distinct country codes and policy names found in ``df``.

    Args:
        df: Frame that has an ``ISO2`` column and a ``CTS_Name`` column.

    Returns:
        A ``(countries, policies)`` tuple of sets holding the unique values of
        the ``ISO2`` and ``CTS_Name`` columns respectively. Missing values are
        not filtered out here.
    """
    countries, policies = (
        set(df[column].unique()) for column in ("ISO2", "CTS_Name")
    )
    return countries, policies

def read_policy_csv(path: str) -> pd.DataFrame:
    """Read one policy CSV, using its first column as the index.

    Only empty cells are parsed as missing (``keep_default_na=False`` together
    with ``na_values=""``), so literal strings such as "NA" stay as text.
    """
    return pd.read_csv(path, index_col=0, keep_default_na=False, na_values="")


# Load in Subsidy data
subsidy = read_policy_csv('input/policy/Fossil_Fuel_Subsidies.csv')
sub_country_nodes, sub_policy_nodes = obtain_country_and_policies(subsidy)

# Load in green bonds
green_bonds = read_policy_csv("input/policy/Green_Bonds.csv")
# Filtered down to countries: only rows that carry an ISO2 code are kept
green_bonds = green_bonds[~green_bonds["ISO2"].isna()]
bond_country_nodes, bond_policy_nodes = obtain_country_and_policies(green_bonds)

# Load in Taxes
taxes = read_policy_csv("input/policy/Environmental_Taxes.csv")
tax_country_nodes, tax_policy_nodes = obtain_country_and_policies(taxes)

# Load in Expenditures:
expenditures = read_policy_csv("input/policy/Environmental_Protection_Expenditures.csv")
exp_country_nodes, exp_policy_nodes = obtain_country_and_policies(expenditures)

### Creating Multigraph

We will be constructing a dataset that includes all policy types and all countries.

We will create a multigraph that has attributes for each edge and node, allowing us to build the entire graph at once. Each node will have an attribute called `policies`, which lists the policy categories that node belongs to. For country nodes, `policies` can hold multiple values, because a country can participate in several different policy types; for policy nodes, it holds exactly one value.

Each edge will carry two attributes: its weight and its year. Filtering the edges on a given year then gives the connections between policies and countries in that year.

Now we need to add the nodes to the graph. We will first add all the country nodes, each with an attribute called `policies` that lists which policy types the country has implemented. With this list we can filter the nodes down to the countries that have invested, or are currently investing, in a specific policy type.

In [3]:
G = nx.MultiGraph()

# One row per policy category: (label, countries using it, policy names in it).
# The row order matters: it fixes the order of labels on a country node and
# decides which single label a policy node receives (first match wins).
policy_categories = [
    ("Environmental Taxes", tax_country_nodes, tax_policy_nodes),
    ("Environmental Protection Expenditures", exp_country_nodes, exp_policy_nodes),
    ("Green Bonds", bond_country_nodes, bond_policy_nodes),
    ("Environmental Subsidies", sub_country_nodes, sub_policy_nodes),
]

# Territories are included in this.
all_countries = tax_country_nodes | exp_country_nodes | bond_country_nodes | sub_country_nodes

# Country nodes sit on bipartite side 0 and list every category they appear in.
country_node_attributes = {
    country: {
        "bipartite" : 0,
        "policies" : [label for label, members, _ in policy_categories if country in members],
    }
    for country in all_countries
}

G.add_nodes_from(all_countries)
nx.set_node_attributes(G, country_node_attributes)

# Now we will add the policy nodes using a very similar approach:
all_policies = tax_policy_nodes | exp_policy_nodes  | bond_policy_nodes  | sub_policy_nodes

# Policy nodes sit on bipartite side 1 and carry only the first matching category.
policy_node_attributes = {
    policy: {
        "bipartite" : 1,
        "policies" : [label for label, _, members in policy_categories if policy in members][:1],
    }
    for policy in all_policies
}

G.add_nodes_from(all_policies)
nx.set_node_attributes(G, policy_node_attributes)

Now we need to add the edges between the countries and the policies they take part in.

In [6]:
def add_edges(G : nx.MultiGraph, df : pd.DataFrame, countries : Set[str], policies : Set[str]) -> List[int]:
    """Add one edge to ``G`` for every (row, year) cell of ``df`` with a positive value.

    Year columns are the columns whose name is exactly ``F`` followed by four
    digits (e.g. ``F2010``). For each such column, every row whose value is
    greater than zero yields an edge from the row's ``ISO2`` value to its
    ``CTS_Name`` value, with edge data ``{"weight": value, "year": <int year>}``.
    Missing values never compare greater than zero, so they add no edge.

    Args:
        G: Multigraph that receives the edges (modified in place).
        df: Frame with ``ISO2`` and ``CTS_Name`` columns plus the year columns.
        countries: Currently unused; kept so existing callers keep working.
        policies: Currently unused; kept so existing callers keep working.

    Returns:
        The years (as ints, in column order) for which at least one edge was added.
    """
    # Obtain the columns that correspond to a year. The pattern is a raw
    # string (no invalid-escape warning) and anchored, because the year is
    # later parsed with int(column[1:]).
    year_columns = df.filter(regex=r"^F\d{4}$", axis=1).columns

    # Read the endpoints once, row by row. Iterating rows directly avoids a
    # label lookup per cell and also works when the same (ISO2, CTS_Name)
    # pair occurs on more than one row.
    endpoints = list(zip(df["ISO2"], df["CTS_Name"]))
    return_years = []

    # Running counter used as the multigraph edge key; it starts at 0 on every call.
    edge_key = 0
    for year_column in year_columns:
        year = int(year_column[1:])
        year_edge_list = []
        for (country, policy), weight in zip(endpoints, df[year_column].tolist()):
            if weight > 0:
                edge_data = {"weight" : weight, "year" : year}
                year_edge_list.append((country, policy, edge_key, edge_data))
                edge_key += 1

        # Take out years with no policies
        if year_edge_list:
            G.add_edges_from(year_edge_list)
            return_years.append(year)
    return return_years

def _percent_of_gdp(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the rows whose ``Unit`` column equals "Percent of GDP"."""
    return df[df["Unit"] == "Percent of GDP"]


# Taxes: rows reported as a share of GDP, with missing values replaced by 0
taxes_sub = _percent_of_gdp(taxes).fillna(0)
tax_years = add_edges(G, taxes_sub, tax_country_nodes, tax_policy_nodes)

# Green bonds: every row is used, with missing values replaced by 0
green_bonds_sub = green_bonds.fillna(0)
bond_years = add_edges(G, green_bonds_sub, bond_country_nodes, bond_policy_nodes)

# Subsidies: rows reported as a share of GDP
subsidy_sub = _percent_of_gdp(subsidy)
subsidy_years = add_edges(G, subsidy_sub, sub_country_nodes, sub_policy_nodes)

# Expenditures: rows reported as a share of GDP
exp_sub = _percent_of_gdp(expenditures)
exp_years = add_edges(G, exp_sub, exp_country_nodes, exp_policy_nodes)



Now that we have created a graph containing all the data, let's create helper functions that extract subgraphs of this multigraph so we can analyze specific policies or specific years.

In [7]:
def return_subgraph_year(G : nx.MultiGraph, year : int) -> nx.MultiGraph:
    """Build a new multigraph with every node of ``G`` but only one year's edges.

    Args:
        G: Source multigraph whose edges all carry a ``"year"`` entry.
        year: Year to keep.

    Returns:
        A new ``nx.MultiGraph`` containing all nodes of ``G`` (with their node
        data) and the edges of ``G`` whose ``"year"`` equals ``year``, with the
        same keys and edge data.
    """
    year_graph = nx.MultiGraph()

    # Nodes are added explicitly so that nodes without any edge in this year
    # are still present in the result.
    year_graph.add_nodes_from(G.nodes(data=True))

    # Only the edges recorded for the requested year are carried over.
    year_graph.add_edges_from(
        (u, v, key, attrs)
        for u, v, key, attrs in G.edges(keys=True, data=True)
        if attrs["year"] == year
    )

    return year_graph

def return_subgraph_policy(G : nx.MultiGraph, filter_policy : str) -> nx.MultiGraph:
    """Return the part of ``G`` that belongs to one policy category.

    Args:
        G: Source multigraph whose nodes carry a ``"policies"`` list.
        filter_policy: Category label to keep, e.g. ``"Green Bonds"``.

    Returns:
        A new ``nx.MultiGraph`` holding the nodes of ``G`` whose ``"policies"``
        list contains ``filter_policy`` (with their node data) and the edges
        of ``G`` that connect two of those nodes, with the same keys and data.
    """
    sub_graph = nx.MultiGraph()

    # Keep the nodes affiliated with the policy category (countries that take
    # part in it and the policies that belong to it). Nodes that have no
    # "policies" attribute are skipped instead of raising KeyError.
    sub_graph.add_nodes_from(
        (node, data)
        for node, data in G.nodes(data=True)
        if filter_policy in data.get("policies", ())
    )

    # Keep an edge only when BOTH endpoints were selected. Checking a single
    # endpoint would depend on the orientation in which the edge is reported
    # and could silently add an attribute-less node to the result.
    edges = [
        (u, v, key, data)
        for u, v, key, data in G.edges(keys=True, data=True)
        if u in sub_graph and v in sub_graph
    ]
    sub_graph.add_edges_from(edges)

    return sub_graph

### Create Bidirectional Lookup table

In [8]:
# Creates a Two-way look up table
class TwoWayDict(dict):
    """Dictionary that stores every connection in both directions.

    Setting ``d[a] = b`` makes both ``d[a] == b`` and ``d[b] == a`` true, and
    first removes any connection that ``a`` or ``b`` already took part in.
    A self-connection (``a == b``) is stored as a single entry.

    Only item assignment, ``del`` and ``from_dict`` maintain this pairing;
    other ``dict`` methods (for example ``update``) are not overridden.
    """

    def __setitem__(self, key, value):
        # Remove any previous connections with these values
        if key in self:
            del self[key]
        if value in self:
            del self[value]
        dict.__setitem__(self, key, value)
        dict.__setitem__(self, value, key)

    def __delitem__(self, key):
        """Remove the connection that ``key`` takes part in (both directions)."""
        partner = self[key]  # raises KeyError for an unknown key, like dict
        dict.__delitem__(self, key)
        # For a self-connection the partner is the entry just removed, so a
        # tolerant pop is used instead of a second delete that would raise.
        dict.pop(self, partner, None)

    def __len__(self):
        """Returns the number of connections"""
        # A self-connection occupies one entry instead of two, so it is added
        # back before halving. This scans the entries once per call.
        self_connections = sum(1 for key, value in dict.items(self) if key == value)
        return (dict.__len__(self) + self_connections) // 2

    def from_dict(self, user_dict):
        """Add every ``key -> value`` pair of ``user_dict`` as a two-way connection.

        Pairs are inserted through item assignment, so a pair that reuses an
        existing key or value replaces the older connection instead of leaving
        a stale one-directional entry behind.
        """
        for key, value in user_dict.items():
            self[key] = value
    

In [54]:
# Peek at the column names of the country reference table before renaming them.
pd.read_csv("input/continents2.csv").columns

Index(['name', 'alpha-2', 'alpha-3', 'country-code', 'iso_3166-2', 'region',
       'sub-region', 'intermediate-region', 'region-code', 'sub-region-code',
       'intermediate-region-code'],
      dtype='object')

In [55]:
# Create Country Name Lookup Table
# Only empty cells are parsed as missing, matching how the policy CSVs are
# loaded. With pandas' default NA parsing the literal text "NA" is read as
# NaN, which would lose that ISO2 code (presumably Namibia's; verify against
# the CSV).
country_name_conversion = pd.read_csv(
    "input/continents2.csv", keep_default_na=False, na_values=""
).rename(columns={"name" : "Country Name", "alpha-2" : "ISO2", "alpha-3" : "ISO3"})

# Country name <-> ISO2 code, searchable in both directions
two_way_country_lookup = TwoWayDict()
oneway_lookup = country_name_conversion.set_index("Country Name")["ISO2"].to_dict()
two_way_country_lookup.from_dict(oneway_lookup)

In [10]:
# Narrow the full multigraph to the Green Bonds category, then to the edges of 2010.
policy_G = return_subgraph_policy(G, filter_policy="Green Bonds")
year_G = return_subgraph_year(policy_G, year=2010)

### Clusterings

In [80]:
# TODO: Create the BiNE embeddings for a specific graph and all the nodes within it.
# TODO: Add functionality to create embeddings from a graph
# TODO: Add functionality to check if these embeddings already exist
 
# Lazily yields the nodes of year_G whose "bipartite" attribute is 0, the value
# given to country nodes. This is a single-use generator, not a list.
countries = (country for country, data in year_G.nodes(data = True) if data["bipartite"] == 0)


In [14]:
# Mapping for countries: ids u0, u1, ... in order of first appearance in the data
green_bond_node_mapping = {
    country: f"u{index}"
    for index, country in enumerate(green_bonds["ISO2"].unique())
}

# Mapping for policies: ids i0, i1, ... in order of first appearance in the data
green_bond_node_mapping.update(
    (policy, f"i{index}")
    for index, policy in enumerate(green_bonds["CTS_Name"].unique())
)

# Reverses the mapping to be id to name
inv_green_bond_mapping = dict(
    (node_id, name) for name, node_id in green_bond_node_mapping.items()
)

In [None]:
# Create graphs BINE can read: one tab-separated weighted edge list per year.
# The previous version of this cell used names (`green_bond_years`, `B`,
# `green_bond_dict`) that are not defined anywhere in this notebook, so it
# failed on a fresh kernel. The edge lists are now derived from the multigraph G.
green_bond_G = return_subgraph_policy(G, "Green Bonds")

for year in bond_years:
    # Keep only the Green Bonds edges recorded for this year
    bond_year_G = return_subgraph_year(green_bond_G, year)

    # Relabels our graph from name to id to work on BINE code
    B_integer = nx.relabel_nodes(bond_year_G, green_bond_node_mapping)

    # NOTE(review): the target directory is assumed to exist already.
    fileName = f"BiNE-master/data/greenbonds/greenbonds_{year}_edgelist.dat"
    nx.write_weighted_edgelist(B_integer, fileName, delimiter = "\t")

In [16]:
# Year -> DataFrame of embeddings (one column per node, one row per dimension)
bine_embeddings_bonds_dict = {}

for year in bond_years:
    node_ids = []
    vectors = []

    # Each line of the vectors file holds a node id followed by the values of
    # that node's embedding, separated by whitespace.
    with open(f"BiNE-master/data/greenbonds/vectors_u_{year}.dat", "r") as vector_file:
        for line in vector_file:
            fields = line.split()

            # First field: which node this embedding belongs to
            node_ids.append(fields[0])

            # Remaining fields: the embedding itself
            vectors.append(np.array([float(token) for token in fields[1:]]))

    # Turn every embedding into a column and place the columns side by side
    embedding_matrix = np.concatenate(
        [vector.reshape((len(vector), 1)) for vector in vectors], axis=1
    )

    # Changes ids of countries to their actual country names
    column_names = [inv_green_bond_mapping[node_id] for node_id in node_ids]

    bine_embeddings_bonds_dict[year] = pd.DataFrame(embedding_matrix, columns=column_names)

In [17]:
# Display the 2021 embeddings: one column per node name, one row per embedding dimension.
pandas_bine_embeddings_bonds = bine_embeddings_bonds_dict[2021]
pandas_bine_embeddings_bonds

Unnamed: 0,TW,TH,UA,GB,SG,SK,ZA,ES,SE,CH,...,GG,HU,FI,FR,GE,NZ,LI,IS,IN,PH
0,0.360465,0.191613,0.37499,0.366984,0.326026,0.019438,0.300085,0.437258,0.177176,0.275612,...,0.435629,0.174318,0.399621,0.825742,0.219181,0.6114,0.07463,0.589186,0.217322,0.411551
1,0.444382,0.359464,0.514755,0.412164,0.370195,0.099489,0.357959,0.582137,0.595769,0.1027,...,0.095939,0.136924,0.089223,0.473211,0.300738,0.110892,0.141813,0.132007,0.531118,0.006947
2,0.291791,0.462747,0.215559,0.288005,0.097702,0.224796,0.267987,0.13899,0.152244,0.186718,...,0.515764,0.472511,0.494365,0.488093,0.21687,0.022645,0.310646,0.056521,0.129541,0.341459
3,0.18916,0.089318,0.445759,0.600941,0.465359,0.465045,0.389689,0.474116,0.54386,0.312516,...,0.286948,0.119323,0.113977,0.212222,0.467627,0.501111,0.408074,0.236087,0.150022,0.477939
4,0.202341,0.26524,0.179494,0.730669,0.125221,0.205035,0.33307,0.624475,0.24482,0.195991,...,0.083416,0.247145,0.507063,0.302105,0.163908,0.388923,0.173432,0.147939,0.114294,0.468409
5,0.297113,0.009597,0.318254,0.596196,0.079796,0.483082,0.098642,0.412613,0.248311,0.400416,...,0.258407,0.056514,0.177707,0.391584,0.132818,0.110197,0.199281,0.088239,0.154224,0.477313
6,0.248897,0.310248,0.44367,0.543106,0.305086,0.258153,0.37934,0.368601,0.623421,0.230195,...,0.291302,0.342805,0.436529,0.63533,0.569585,0.234954,0.377983,0.355192,0.393546,0.17408
7,0.421303,0.441547,0.237482,0.344165,0.560335,0.330991,0.323784,0.249185,0.372659,0.531087,...,0.511372,0.451836,0.435732,0.437856,0.027417,0.2408,0.423283,0.310697,0.439654,0.098117
8,0.406833,0.429686,0.178031,0.42918,0.073603,0.373213,0.424688,0.21343,0.231858,0.396977,...,0.223349,0.445486,0.22432,0.56623,0.461801,0.062713,0.322293,0.161961,0.286096,0.017653
9,0.277664,0.351197,0.117629,0.47323,0.547377,0.424705,0.185385,0.348375,0.333194,0.414905,...,0.114262,0.458137,0.10956,0.326086,0.259462,0.379224,0.517734,0.59149,0.596646,0.138657


In [94]:
from sklearn.manifold import TSNE

pandas_bine_embeddings_bonds = bine_embeddings_bonds_dict[2021]
# Transpose so that every row is one country and every column one embedding dimension
X = pandas_bine_embeddings_bonds.transpose()

# t-SNE is stochastic: without a fixed random_state every run gives a different
# layout, so the plot built from it could not be reproduced.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`;
# confirm against the installed version before upgrading.
tsne = TSNE(perplexity=15, n_iter=10000, random_state=42)

tsne.fit(X)



In [95]:
import plotly.express as px
import nbformat


# 2-D t-SNE coordinates, one row per country in X
x = tsne.embedding_[:,0]
y = tsne.embedding_[:,1]
labels = X.index

# Coordinates indexed by the labels of X, with readable column names
df = pd.DataFrame(tsne.embedding_, index=labels).rename(columns={0: "x", 1: "y"})

# Country name -> region, taken from the country reference table
country_to_region = country_name_conversion.set_index("Country Name")["region"].to_dict()

# Resolve every label of X to a country name, then every name to its region
country_names = [two_way_country_lookup[iso2] for iso2 in X.index]
regions = [country_to_region[name] for name in country_names]

df["country_name"] = country_names
df["region"] = regions

fig = px.scatter(
    df,
    x = "x",
    y = "y",
    color = "region",
    hover_data = ["country_name"],
    width=600,
    height=500,
)
fig.update_layout(title = "T-SNE Distribution")

# Lock the aspect ratio so distances look the same along both axes
fig.update_yaxes(scaleanchor="x", scaleratio=1)

fig.show()
