# Quality testing design
I will test the quality of an embedding both in terms of the structural properties of the nodes and in terms of the development variables.

In [56]:
import statsmodels.api as sm
import numpy as np
import requests
import json
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA, FactorAnalysis
import re

os.chdir("..") # Change to parent directory
from struc2vec.src.Helpers import *


In [None]:
# os.chdir("struc2vec")

In [60]:
df_emb_4 = pd.read_excel("embeddings\\all_nw500_1000_wl80_vs75_pca4.xlsx")[["Node","Embedding"]]
df_emb_3 = pd.read_excel("embeddings\\all_nw500_1000_wl80_vs75_pca3.xlsx")[["Node","Embedding"]]

In [61]:
G = nx.read_gexf("Graphs/0_directed.gexf")

## Predicting structural properties
To test the quality of the classification, I make a linear regression for dummy variables of the classifications as predictions of each structural property. This will give an estimate of the variance within each classification.

In [107]:
df_meta_4, df_agg_4 = get_meta_data(G, df_emb_4["Node"].values, df_emb_4["Embedding"].values)
df_meta_3, df_agg_3 = get_meta_data(G, df_emb_3["Node"].values, df_emb_3["Embedding"].values)

In [108]:
X_4 = pd.get_dummies(df_meta_4["Embedding"], dtype=int).drop(3, axis=1)
X_3 = pd.get_dummies(df_meta_3["Embedding"],  dtype=int).drop(2, axis=1)

In [109]:
struc_reg_4 = {}
for struc_prop in list(df_meta_4.columns[2:]):
    y = df_meta_4[struc_prop]
    X_4 = sm.add_constant(X_4)
    model = sm.OLS(y,X_4)
    results = model.fit()
    struc_reg_4[struc_prop] = results

In [110]:
struc_reg_3 = {}
for struc_prop in list(df_meta_3.columns[2:]):
    y = df_meta_3[struc_prop]
    X_3 = sm.add_constant(X_3)
    model = sm.OLS(y,X_3)
    results = model.fit()
    struc_reg_3[struc_prop] = results

In [111]:
# print(struc_reg["Clustering"].summary())

In [112]:
# print(struc_reg["Betweenness_centrality"].summary())

In [113]:
# print(struc_reg["Closeness_centrality"].summary())

In [114]:
# print(struc_reg["InDegree"].summary())

In [115]:
# print(struc_reg["OutDegree"].summary())

In [116]:
# print(struc_reg["Average_weight"].summary())

In [117]:
list_of_df_4 = []

for struc_prop, result in struc_reg_4.items():
    params = result.params
    pvalues = result.pvalues
    index_col = pd.MultiIndex.from_tuples([(struc_prop,"params"), (struc_prop,"pvalues")])
    df = pd.DataFrame(columns=index_col)
    df[struc_prop,"params"] = params
    df[struc_prop,"pvalues"] = pvalues

    list_of_df_4.append(df)



In [118]:
list_of_df_3 = []

for struc_prop, result in struc_reg_3.items():
    params = result.params
    pvalues = result.pvalues
    index_col = pd.MultiIndex.from_tuples([(struc_prop,"params"), (struc_prop,"pvalues")])
    df = pd.DataFrame(columns=index_col)
    df[struc_prop,"params"] = params
    df[struc_prop,"pvalues"] = pvalues

    list_of_df_3.append(df)

In [119]:
pd.concat(list_of_df_4, axis=1).apply(lambda x:round(x, 3))

Unnamed: 0_level_0,Clustering,Clustering,Betweenness_centrality,Betweenness_centrality,Closeness_centrality,Closeness_centrality,InDegree,InDegree,OutDegree,OutDegree,Average_weight,Average_weight
Unnamed: 0_level_1,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues
const,0.609,0.0,0.002,0.161,0.51,0.0,32.953,0.0,45.907,0.0,1.127,0.873
0,0.08,0.009,-0.001,0.495,-0.189,0.0,-17.726,0.036,-9.346,0.023,1.997,0.825
1,-0.025,0.413,-0.001,0.664,-0.033,0.419,-10.297,0.222,-24.385,0.0,23.307,0.01
2,-0.056,0.08,0.01,0.0,0.016,0.723,54.187,0.0,3.453,0.426,19.107,0.048


In [120]:
pd.concat(list_of_df_3, axis=1).apply(lambda x:round(x, 3))

Unnamed: 0_level_0,Clustering,Clustering,Betweenness_centrality,Betweenness_centrality,Closeness_centrality,Closeness_centrality,InDegree,InDegree,OutDegree,OutDegree,Average_weight,Average_weight
Unnamed: 0_level_1,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues
const,0.642,0.0,0.001,0.202,0.436,0.0,21.506,0.0,38.337,0.0,2.641,0.589
0,-0.125,0.0,0.008,0.0,0.09,0.012,61.637,0.0,18.148,0.0,-1.003,0.89
1,0.028,0.245,-0.001,0.491,-0.048,0.17,-12.027,0.062,-22.433,0.0,32.556,0.0


## Predicting development variables

In [71]:
df_hdi = pd.read_excel("metadata_country/metadata_country_3.xlsx")

In [72]:
df_GDP = pd.read_excel("metadata_country/GDP_2015.xlsx")[["Country Code", 2015]]

In [73]:
df_GDP.columns = ["Node", "GDP"]

In [76]:
df_GDP["GDP"] = [float(x) if x != ".." else np.nan for x in df_GDP["GDP"]]

In [77]:
df_GDP.isnull().sum()

Node    0
GDP     8
dtype: int64

In [78]:
df_hdi.isnull().sum()

Node                 0
Embedding_3          0
child_mortality    216
eys                 35
gnipc               35
hdi                 36
le                  33
mmr                 44
mys                 36
Embedding_4          0
dtype: int64

In [79]:
df_hdi = df_hdi.merge(df_GDP, on = "Node")

In [102]:
hdi_mean = df_hdi.groupby("Embedding_3")["hdi"].mean()
index_min = hdi_mean[hdi_mean == min(hdi_mean)].index.values[0]

In [85]:
for column in df_hdi.columns:
    if "Embedding" in column:
        print(column[-1] == "3")

True
False


In [103]:
def getHDITable(df_hdi,  hdi_list =  ['child_mortality', 'eys', 'gnipc', 'hdi', 'le', 'mmr', 'mys','GDP'] ,n_emb="3"):
    for column in df_hdi.columns:
        if "Embedding" in column:
            if column[-1] != str(n_emb):
                df_hdi.drop(column, axis=1)

    hdi_mean = df_hdi.groupby("Embedding_" + n_emb)["hdi"].mean()
    index_min = hdi_mean[hdi_mean == min(hdi_mean)].index.values[0]

    hdi_reg = {}
    for hdi_stat in list(hdi_list):
        df = df_hdi.dropna(subset=hdi_stat)
        y = df[hdi_stat]
        X_hdi = pd.get_dummies(df["Embedding_" + n_emb],dtype=int).drop(index_min, axis=1)
        X_hdi = sm.add_constant(X_hdi)
        model = sm.OLS(y,X_hdi)
        results = model.fit()
        hdi_reg[hdi_stat] = results

    list_of_df = []

    for hdi_stat, result in hdi_reg.items():
        params = result.params
        pvalues = result.pvalues
        index_col = pd.MultiIndex.from_tuples([(hdi_stat,"params"), (hdi_stat,"pvalues")])
        df = pd.DataFrame(columns=index_col)
        df[hdi_stat,"params"] = params
        df[hdi_stat,"pvalues"] = pvalues

        list_of_df.append(df)
    
    return pd.concat(list_of_df, axis=1).apply(lambda x:round(x, 3))

In [104]:
getHDITable(df_hdi, n_emb="3")

Unnamed: 0_level_0,child_mortality,child_mortality,eys,eys,gnipc,gnipc,hdi,hdi,le,le,mmr,mmr,mys,mys,GDP,GDP
Unnamed: 0_level_1,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues
const,5.941,0.444,11.959,0.0,12346.195,0.0,0.64,0.0,68.169,0.0,252.89,0.0,7.049,0.0,91001240000.0,0.593
0,19.942,0.113,3.273,0.0,18515.331,0.0,0.181,0.0,8.662,0.0,-215.307,0.0,3.701,0.0,857770900000.0,0.001
1,-4.04,0.828,0.795,0.122,918.597,0.792,0.032,0.195,1.45,0.277,-69.633,0.108,0.982,0.067,-77204940000.0,0.771


In [105]:
getHDITable(df_hdi, n_emb="4")

Unnamed: 0_level_0,child_mortality,child_mortality,eys,eys,gnipc,gnipc,hdi,hdi,le,le,mmr,mmr,mys,mys,GDP,GDP
Unnamed: 0_level_1,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues
const,5.909,0.602,12.451,0.0,15461.261,0.0,0.668,0.0,69.65,0.0,226.191,0.0,7.416,0.0,432239300000.0,0.068
0,6.004,0.654,0.227,0.685,513.925,0.889,0.021,0.453,1.055,0.464,-57.265,0.193,0.733,0.218,-319530600000.0,0.299
1,1.088,0.945,0.434,0.483,-1561.675,0.702,0.008,0.797,-0.985,0.538,-19.893,0.693,0.797,0.225,-323062800000.0,0.306
2,43.852,0.057,3.095,0.0,17904.546,0.0,0.164,0.0,8.028,0.0,-197.047,0.0,3.486,0.0,500104300000.0,0.129


In [None]:
hdi_reg = {}
for hdi_stat in list(hdi_list):
    df = df_hdi.dropna(subset=hdi_stat)
    y = df[hdi_stat]
    X_hdi = pd.get_dummies(df["Embedding"], drop_first=True ,dtype=int)
    X_hdi = sm.add_constant(X_hdi)
    model = sm.OLS(y,X_hdi)
    results = model.fit()
    hdi_reg[hdi_stat] = results

In [54]:
list_of_df = []

for hdi_stat, result in hdi_reg.items():
    params = result.params
    pvalues = result.pvalues
    index_col = pd.MultiIndex.from_tuples([(hdi_stat,"params"), (hdi_stat,"pvalues")])
    df = pd.DataFrame(columns=index_col)
    df[hdi_stat,"params"] = params
    df[hdi_stat,"pvalues"] = pvalues

    list_of_df.append(df)

In [55]:
pd.concat(list_of_df, axis=1).apply(lambda x:round(x, 3))

Unnamed: 0_level_0,child_mortality,child_mortality,eys,eys,gnipc,gnipc,hdi,hdi,le,le,mmr,mmr,mys,mys,GDP,GDP
Unnamed: 0_level_1,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues
const,5.909,0.602,12.451,0.0,15461.261,0.0,0.668,0.0,69.65,0.0,226.191,0.0,7.416,0.0,432239300000.0,0.068
0,6.004,0.654,0.227,0.685,513.925,0.889,0.021,0.453,1.055,0.464,-57.265,0.193,0.733,0.218,-319530600000.0,0.299
1,1.088,0.945,0.434,0.483,-1561.675,0.702,0.008,0.797,-0.985,0.538,-19.893,0.693,0.797,0.225,-323062800000.0,0.306
2,43.852,0.057,3.095,0.0,17904.546,0.0,0.164,0.0,8.028,0.0,-197.047,0.0,3.486,0.0,500104300000.0,0.129


### For reference

In [3]:
duncan_prestige = sm.datasets.get_rdataset("Duncan", "carData")
Y = duncan_prestige.data['income']
X = duncan_prestige.data['education']
X = sm.add_constant(X)
model = sm.OLS(Y,X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 income   R-squared:                       0.525
Model:                            OLS   Adj. R-squared:                  0.514
Method:                 Least Squares   F-statistic:                     47.51
Date:                Wed, 13 Nov 2024   Prob (F-statistic):           1.84e-08
Time:                        14:33:51   Log-Likelihood:                -190.42
No. Observations:                  45   AIC:                             384.8
Df Residuals:                      43   BIC:                             388.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         10.6035      5.198      2.040      0.0