# Quality testing design
I will test the quality of an embedding both in terms of the structural properties of the nodes and in terms of the development variables.

In [2]:
import os
os.chdir("../..") # Move two directory levels up so the `struc2vec` package imported below resolves
# NOTE(review): `nx` and `get_meta_data` are used later in this notebook without an
# explicit import; presumably both come from this star import — confirm and import by name.
from struc2vec.src.Helpers import *
os.chdir("structuralNetworkMigration") # Enter the project folder; the relative "data/..." paths below are resolved from here

In [3]:
import statsmodels.api as sm
import numpy as np
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA, FactorAnalysis
import re


In [5]:
# df_emb_4 = pd.read_excel("data/embeddings/all_nw500_1000_wl80_vs75_pca4.xlsx")[["Node","Embedding"]]
# df_emb_3 = pd.read_excel("data/embeddings/all_nw500_1000_wl80_vs75_pca3.xlsx")[["Node","Embedding"]]

In [6]:
# Load the embedding table (one row per node; the columns are renamed in the next cell).
df_emb_3 = pd.read_csv("data/embeddings/v2all_nw500_1000_wl80_vs75_pca3.csv")

In [8]:
# Normalise the column names; later cells access the frame through "Node" and "Embedding".
df_emb_3.columns = ["Node","Embedding"]

In [7]:
# Read the graph from its GEXF file. `nx` is not imported by name in this notebook
# (presumably it is provided by the Helpers star import — TODO confirm).
G = nx.read_gexf("data/Graphs/0_directed.gexf")

## Predicting structural properties
To test the quality of the classification, I fit a linear regression for each structural property, using dummy variables for the classifications as predictors. This gives an estimate of how each property differs between the classifications and of the variance within each classification.

In [9]:
# df_meta_4, df_agg_4 = get_meta_data(G, df_emb_4["Node"].values, df_emb_4["Embedding"].values)
# get_meta_data is not defined in this notebook (presumably it comes from Helpers). The first
# returned frame is used below as a table with an "Embedding" column whose columns from the
# third onwards are treated as the structural properties — TODO confirm the column layout.
df_meta_3, df_agg_3 = get_meta_data(G, df_emb_3["Node"].values, df_emb_3["Embedding"].values)

In [10]:
# X_4 = pd.get_dummies(df_meta_4["Embedding"], dtype=int, drop_first=True)
# Dummy-code the embedding label. drop_first=True omits the first level, which therefore
# becomes the reference category captured by the regression intercept.
X_3 = pd.get_dummies(df_meta_3["Embedding"],  dtype=int, drop_first=True)

In [42]:
# Regress every structural property on the 4-cluster embedding dummies.
# `df_meta_4` and `X_4` are only created by lines that are commented out in the
# cells above, so on a fresh kernel this cell used to abort with a NameError.
# The fit is skipped when those inputs are absent; `struc_reg_4` is still
# defined (as an empty dict) so cells that iterate over it keep running.
struc_reg_4 = {}
if "df_meta_4" in globals() and "X_4" in globals():
    # Adding the intercept column does not depend on the loop variable: do it once.
    X_4 = sm.add_constant(X_4)
    # Columns from the third onwards are treated as the structural properties.
    for struc_prop in list(df_meta_4.columns[2:]):
        y = df_meta_4[struc_prop]
        model = sm.OLS(y,X_4)
        results = model.fit()
        struc_reg_4[struc_prop] = results

In [11]:
# Regress every structural property on the embedding dummies (one OLS fit per property).
# The intercept column is added once up front: it is loop-invariant, and
# re-assigning X_3 on every iteration only repeated the same work.
X_3 = sm.add_constant(X_3)

struc_reg_3 = {}
# Columns from the third onwards are treated as the structural properties
# (the table printed below lists Clustering, Betweenness_centrality, ...).
for struc_prop in list(df_meta_3.columns[2:]):
    y = df_meta_3[struc_prop]
    model = sm.OLS(y,X_3)
    results = model.fit()
    struc_reg_3[struc_prop] = results

In [44]:
# One two-column frame per structural property: the fitted coefficients and their
# p-values, labelled with a (property, statistic) column MultiIndex.
list_of_df_4 = [
    pd.DataFrame(
        {
            (struc_prop, "params"): result.params,
            (struc_prop, "pvalues"): result.pvalues,
        }
    )
    for struc_prop, result in struc_reg_4.items()
]



In [12]:
# One two-column frame per structural property: the fitted coefficients and their
# p-values, labelled with a (property, statistic) column MultiIndex.
list_of_df_3 = [
    pd.DataFrame(
        {
            (struc_prop, "params"): result.params,
            (struc_prop, "pvalues"): result.pvalues,
        }
    )
    for struc_prop, result in struc_reg_3.items()
]

In [13]:
# Put the per-property frames side by side and show the estimates to three decimals.
pd.concat(list_of_df_3, axis=1).round(3)

Unnamed: 0_level_0,Clustering,Clustering,Betweenness_centrality,Betweenness_centrality,Closeness_centrality,Closeness_centrality,InDegree,InDegree,OutDegree,OutDegree,Average_weight,Average_weight
Unnamed: 0_level_1,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues
const,0.484,0.0,0.009,0.0,0.623,0.0,84.173,0.0,52.704,0.0,4.055,0.426
1,0.232,0.0,-0.009,0.0,-0.392,0.0,-80.44,0.0,-28.21,0.0,25.135,0.001
2,0.167,0.0,-0.008,0.0,-0.143,0.0,-67.044,0.0,-21.404,0.0,1.455,0.845


In [47]:
# pd.concat(list_of_df_3, axis=1).apply(lambda x:round(x, 3)).to_excel("testing/structural_indicators.xlsx")

## Predicting development variables

In [20]:
# Load the country-level indicators and discard the embedding labels stored in the file;
# the labels used from here on come from df_emb_3 (merged in the next cell).
df_hdi = pd.read_excel("data/metadata_country/metadata_country_3.xlsx").drop(["Embedding_3","Embedding_4"], axis=1)

In [22]:
# Left join: every embedded node is kept, nodes without indicator data get NaN.
# Note that this overwrites df_hdi, so re-running the cell without the one above changes the result.
df_hdi = df_emb_3.merge(df_hdi, on="Node", how="left")

In [24]:
# Keep only the country code and the 2015 column (the column label is the integer 2015, not a string).
df_GDP = pd.read_excel("data/metadata_country/GDP_2015.xlsx")[["Country Code", 2015]]

In [25]:
# Rename so the key matches the "Node" column used for merging with the embedding table.
df_GDP.columns = ["Node", "GDP"]

In [26]:
# The source marks missing values with the string ".."; turn those into NaN and
# cast everything else to float.
df_GDP["GDP"] = df_GDP["GDP"].map(lambda raw: np.nan if raw == ".." else float(raw))

In [27]:
# Count the missing values per column after the ".." -> NaN conversion.
df_GDP.isnull().sum()

Node    0
GDP     8
dtype: int64

In [28]:
# Count the missing values per indicator; the regressions later drop missing rows per indicator.
df_hdi.isnull().sum()

Node                 0
Embedding            0
child_mortality    216
eys                 35
gnipc               35
hdi                 36
le                  33
mmr                 44
mys                 36
dtype: int64

In [29]:
# Attach GDP with a left join so that nodes missing from the GDP file are kept
# (with NaN GDP) instead of being silently dropped by the default inner join.
# This matches the left joins used for the other tables in this notebook; rows
# with a missing value are removed per indicator when the regressions are fitted.
# NOTE: the cell overwrites df_hdi, so running it twice produces GDP_x / GDP_y columns.
df_hdi = df_hdi.merge(df_GDP, on="Node", how="left")

In [31]:
# Mean HDI per embedding cluster, and the label of the cluster with the lowest mean.
hdi_mean = df_hdi.groupby("Embedding")["hdi"].mean()
# idxmin skips NaN group means and returns the first minimal label; the previous
# `min(...)` + equality-mask lookup could select nothing when a NaN was involved.
index_min = hdi_mean.idxmin()

In [35]:
def getHDITable(df_hdi,  hdi_list =  ['child_mortality', 'eys', 'gnipc', 'hdi', 'le', 'mmr', 'mys','GDP']):
    """Regress each indicator on the embedding dummies and tabulate the estimates.

    For every column name in ``hdi_list`` an OLS model with an intercept and
    dummy variables for the ``"Embedding"`` label is fitted on the rows where
    that indicator is not missing. The first embedding level is dropped from
    the dummies, so ``const`` refers to that reference level and the remaining
    coefficients are differences from it.

    Parameters
    ----------
    df_hdi : pandas.DataFrame
        Must contain an ``"Embedding"`` column and one column per entry of
        ``hdi_list``. No other column (such as ``"hdi"``) is required.
    hdi_list : list of str
        Indicator columns to model. The default list is only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        One row per regression term; columns form a two-level index
        ``(indicator, "params" | "pvalues")``; values are rounded to 3 decimals.

    Raises
    ------
    KeyError
        If ``"Embedding"`` or a requested indicator column is missing.
    ValueError
        Raised by ``pd.concat`` when ``hdi_list`` is empty.
    """
    hdi_reg = {}
    for hdi_stat in list(hdi_list):
        # A list-valued `subset` is accepted by every pandas version; older
        # releases reject a bare string here.
        df = df_hdi.dropna(subset=[hdi_stat])
        X_hdi = pd.get_dummies(df["Embedding"], dtype=int, drop_first=True)
        X_hdi = sm.add_constant(X_hdi)
        hdi_reg[hdi_stat] = sm.OLS(df[hdi_stat], X_hdi).fit()

    # Tuple keys give each frame a (indicator, statistic) column MultiIndex.
    list_of_df = [
        pd.DataFrame(
            {
                (hdi_stat, "params"): result.params,
                (hdi_stat, "pvalues"): result.pvalues,
            }
        )
        for hdi_stat, result in hdi_reg.items()
    ]

    return pd.concat(list_of_df, axis=1).round(3)

In [36]:
# Indicator columns to analyse in this section (a subset of the columns available in df_hdi).
list_of_hdi = ['hdi','GDP', 'gnipc', 'eys', 'mys']

In [38]:
# Fit one regression per selected indicator and collect coefficients and p-values in one table.
hdi_stat_3 = getHDITable(df_hdi, hdi_list=list_of_hdi)

In [39]:
# hdi_stat_4 = getHDITable(df_hdi, n_emb="4")

In [40]:
# Human-readable labels for the indicator column codes.
indicators = {
    "eys": "Expected Years of Schooling (years)",
    "hdi": "Human Development Index (value)",
    "le": "Life Expectancy at Birth (years)",
    "mys": "Mean Years of Schooling (years)",
    "mmr": "Maternal Mortality Ratio (deaths per 100,000 live births)",
    "child_mortality": "Child mortality (%)",
    "gnipc": "Gross National Income Per Capita (2017 PPP$)",
    "GDP": "GDP",
}

In [41]:
# Swap the first column level (indicator code) for its readable label; keep the second level.
index_tubles = [
    (indicators[layer0], layer1)
    for layer0, layer1 in hdi_stat_3.columns
]

In [42]:
# Rebuild the two-level column index from the relabelled tuples.
new_index = pd.MultiIndex.from_tuples(index_tubles)

In [43]:
# Relabel in place; after this the first column level holds labels, not the short codes.
hdi_stat_3.columns = new_index

In [44]:
# Display the regression table for the development indicators.
hdi_stat_3

Unnamed: 0_level_0,Human Development Index (value),Human Development Index (value),GDP,GDP,Gross National Income Per Capita (2017 PPP$),Gross National Income Per Capita (2017 PPP$),Expected Years of Schooling (years),Expected Years of Schooling (years),Mean Years of Schooling (years),Mean Years of Schooling (years)
Unnamed: 0_level_1,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues
const,0.802,0.0,840766600000.0,0.0,30306.561,0.0,14.897,0.0,10.043,0.0
1,-0.121,0.0,-745737700000.0,0.005,-18278.596,0.0,-2.144,0.0,-1.447,0.007
2,-0.17,0.0,-817983600000.0,0.002,-18774.041,0.0,-3.054,0.0,-3.182,0.0


In [65]:
# Human-readable labels for the indicator column codes.
# "GDP" is included because list_of_hdi contains 'GDP' and the relabelling below
# looks up every averaged column in this dict; without the entry it raises KeyError.
indicators = {
"eys":"Expected Years of Schooling (years)",
"hdi":"Human Development Index (value)",
"le":"Life Expectancy at Birth (years)",
"mys":"Mean Years of Schooling (years)",
"mmr":"Maternal Mortality Ratio (deaths per 100,000 live births)",
"child_mortality":"Child mortality (%)",
"gnipc":"Gross National Income Per Capita (2017 PPP$)",
"GDP":"GDP"
}

In [93]:
# Mean of each selected indicator per embedding cluster, rounded to 4 decimals.
# df_hdi carries the cluster label in "Embedding" (the "Embedding_3" column of the
# metadata file is dropped when it is loaded), so grouping by "Embedding_3" raised
# KeyError on a fresh run. The axis is renamed to keep the row header of the
# exported LaTeX table unchanged.
df_hdi_mean = (
    df_hdi.groupby("Embedding")[list_of_hdi]
    .mean()
    .round(4)
    .rename_axis("Embedding_3")
)

In [94]:
# Translate the indicator codes into readable labels; raises KeyError for any
# column that has no entry in `indicators`.
col_hdi = [indicators[col_] for col_ in  df_hdi_mean.columns]

In [95]:
# Relabel in place; re-running the relabelling cell above after this one fails,
# because the columns are then labels rather than codes.
df_hdi_mean.columns = col_hdi

In [111]:
# Render the cluster means as a LaTeX tabular string for post-processing below.
table = df_hdi_mean.to_latex()

In [113]:
# Replace the booktabs rules with \hline and add outer vertical bars to the
# column specification, applying the substitutions in this fixed order.
_latex = table
for _old, _new in (
    ('\\begin{tabular}{', '\\begin{tabular}{|'),
    ('}\n\\toprule', '|}\n\\hline'),
    ('\\midrule', '\\hline'),
    ('\\bottomrule', '\\hline'),
):
    _latex = _latex.replace(_old, _new)
print(_latex)

\begin{tabular}{|lrrrrr|}
\hline
 & Human Development Index (value) & GDP & Gross National Income Per Capita (2017 PPP$) & Expected Years of Schooling (years) & Mean Years of Schooling (years) \\
Embedding_3 &  &  &  &  &  \\
\hline
0 & 0.821700 & 948772100252.480347 & 30861.525600 & 15.232000 & 10.749400 \\
1 & 0.672800 & 13796300182.583300 & 13264.792100 & 12.754400 & 8.030700 \\
2 & 0.640400 & 91001244784.275803 & 12346.194600 & 11.959200 & 7.048700 \\
\hline
\end{tabular}



In [106]:
# Styler wrapper around the means table, used for the alternative LaTeX export below.
s = df_hdi_mean.style

In [110]:
# hrules=True emits the \toprule / \midrule / \bottomrule lines seen in the output.
print(s.to_latex(hrules=True))

\begin{tabular}{lrrrrr}
\toprule
 & Human Development Index (value) & GDP & Gross National Income Per Capita (2017 PPP$) & Expected Years of Schooling (years) & Mean Years of Schooling (years) \\
Embedding_3 &  &  &  &  &  \\
\midrule
0 & 0.821700 & 948772100252.480347 & 30861.525600 & 15.232000 & 10.749400 \\
1 & 0.672800 & 13796300182.583300 & 13264.792100 & 12.754400 & 8.030700 \\
2 & 0.640400 & 91001244784.275803 & 12346.194600 & 11.959200 & 7.048700 \\
\bottomrule
\end{tabular}



## Predicting fertility and mortality

In [31]:
# Quick look at the embedding table before merging it with the fertility/mortality data.
df_emb_3.head(2)

Unnamed: 0,Node,Embedding
0,AFG,0
1,AUS,0


In [45]:
# Load the fertility and mortality indicators; the file is keyed by "Country Code" (see the merge below).
df_fert = pd.read_excel("data/metadata_country/MortalityFertilityData.xlsx")

In [46]:
# Left join on the country code: every embedded node is kept, unmatched nodes get NaN indicators.
df_fert_emb = df_emb_3.merge(df_fert, left_on="Node", right_on="Country Code", how="left")

In [47]:
# Drop the redundant key column and index the rows by node. The cell overwrites its
# own input, so running it a second time fails ("Country Code" is already gone).
df_fert_emb = df_fert_emb.drop("Country Code", axis=1).set_index("Node", drop=True)

In [48]:
# Every column after the first one ("Embedding") is a regression target.
target_list = df_fert_emb.columns[1:].tolist()

In [49]:
# Show the target columns that will be regressed on the embedding.
target_list

['Fertility rate, total (births per woman)',
 'Mortality rate, adult, female (per 1,000 female adults)',
 'Mortality rate, adult, male (per 1,000 male adults)',
 'Mortality rate, infant (per 1,000 live births)']

In [52]:
def getHDITable(df_hdi,  hdi_list =  ['child_mortality', 'eys', 'gnipc', 'hdi', 'le', 'mmr', 'mys','GDP']):
    """Regress each target column on the embedding dummies and tabulate the estimates.

    NOTE(review): this notebook defines ``getHDITable`` twice; this later
    definition replaces the one from the development-variables section once
    the cell has run. Consider keeping a single definition.

    For every column name in ``hdi_list`` an OLS model with an intercept and
    dummy variables for the ``"Embedding"`` label is fitted on the rows where
    that column is not missing. The first embedding level is dropped from the
    dummies, so ``const`` refers to that reference level and the remaining
    coefficients are differences from it.

    Parameters
    ----------
    df_hdi : pandas.DataFrame
        Must contain an ``"Embedding"`` column and one column per entry of
        ``hdi_list``.
    hdi_list : list of str
        Target columns to model. The default list is only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        One row per regression term; columns form a two-level index
        ``(target, "params" | "pvalues")``; values are rounded to 3 decimals.

    Raises
    ------
    KeyError
        If ``"Embedding"`` or a requested target column is missing.
    ValueError
        Raised by ``pd.concat`` when ``hdi_list`` is empty.
    """
    hdi_reg = {}
    for hdi_stat in list(hdi_list):
        # A list-valued `subset` is accepted by every pandas version; older
        # releases reject a bare string here.
        df = df_hdi.dropna(subset=[hdi_stat])
        y = df[hdi_stat]
        X_hdi = sm.add_constant(pd.get_dummies(df["Embedding"], dtype=int, drop_first=True))
        hdi_reg[hdi_stat] = sm.OLS(y, X_hdi).fit()

    # Tuple keys give each frame a (target, statistic) column MultiIndex.
    list_of_df = [
        pd.DataFrame(
            {
                (hdi_stat, "params"): result.params,
                (hdi_stat, "pvalues"): result.pvalues,
            }
        )
        for hdi_stat, result in hdi_reg.items()
    ]

    return pd.concat(list_of_df, axis=1).round(3)

In [53]:
# Reuse the regression helper for the fertility and mortality targets.
getHDITable(df_fert_emb, hdi_list = target_list)

Unnamed: 0_level_0,"Fertility rate, total (births per woman)","Fertility rate, total (births per woman)","Mortality rate, adult, female (per 1,000 female adults)","Mortality rate, adult, female (per 1,000 female adults)","Mortality rate, adult, male (per 1,000 male adults)","Mortality rate, adult, male (per 1,000 male adults)","Mortality rate, infant (per 1,000 live births)","Mortality rate, infant (per 1,000 live births)"
Unnamed: 0_level_1,params,pvalues,params,pvalues,params,pvalues,params,pvalues
const,2.234,0.0,93.274,0.0,154.517,0.0,14.183,0.0
1,0.558,0.013,44.458,0.003,61.455,0.0,10.876,0.002
2,1.024,0.0,65.275,0.0,80.767,0.0,17.378,0.0


In [54]:
# Raw per-cluster means of the same targets, for comparison with the regression table above.
df_fert_emb.groupby("Embedding").mean()

Unnamed: 0_level_0,"Fertility rate, total (births per woman)","Mortality rate, adult, female (per 1,000 female adults)","Mortality rate, adult, male (per 1,000 male adults)","Mortality rate, infant (per 1,000 live births)"
Embedding,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2.233867,93.274089,154.516544,14.183117
1,2.791966,137.732089,215.971661,25.059615
2,3.257375,158.549,235.283866,31.560938


### For reference

In [40]:
# Minimal statsmodels OLS example kept as a syntax reference; it does not use the migration data.
# get_rdataset loads the "Duncan" data set of the "carData" package
# (presumably downloaded over the network on first use — TODO confirm).
# NOTE: this cell rebinds the names X, model and results used earlier in the notebook.
duncan_prestige = sm.datasets.get_rdataset("Duncan", "carData")
Y = duncan_prestige.data['income']  # response variable
X = duncan_prestige.data['education']  # single regressor
X = sm.add_constant(X)  # sm.OLS does not add an intercept by itself
model = sm.OLS(Y,X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 income   R-squared:                       0.525
Model:                            OLS   Adj. R-squared:                  0.514
Method:                 Least Squares   F-statistic:                     47.51
Date:                Mon, 25 Nov 2024   Prob (F-statistic):           1.84e-08
Time:                        13:49:42   Log-Likelihood:                -190.42
No. Observations:                  45   AIC:                             384.8
Df Residuals:                      43   BIC:                             388.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         10.6035      5.198      2.040      0.0