# Quality testing design
I will test the quality of an embedding both in terms of the structural properties of the nodes and in terms of the development variables.

In [None]:
import statsmodels.api as sm
import numpy as np
import requests
import json
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA, FactorAnalysis
import re

# Move the working directory up one level before importing the project helpers.
# NOTE(review): presumably this is what makes the `struc2vec` package importable and
# lets the relative data paths in later cells resolve — confirm.
# Caution: re-running this cell moves up one more level each time, so it should be
# executed exactly once per kernel session.
os.chdir("..") # Change to parent directory
# Star import: helper names used in later cells (e.g. get_meta_data, and presumably nx)
# are not imported explicitly in the visible cells — TODO confirm they come from here.
from struc2vec.src.Helpers import *


In [10]:
# Load the node -> embedding-label table, keeping only the two columns used below.
# The path is built with os.path.join so it resolves on every OS; a hard-coded
# backslash separator only works on Windows.
df_emb = pd.read_excel(
    os.path.join("embeddings", "all_nw500_1000_wl80_vs75_pca4.xlsx")
)[["Node", "Embedding"]]

In [12]:
# Load the graph from its GEXF file (path is relative to the current working directory).
# NOTE(review): `nx` is not imported explicitly in the import cell; presumably it is
# provided by the star import from struc2vec.src.Helpers — confirm.
G = nx.read_gexf("Graphs/0_directed.gexf")

In [18]:
# Hand the graph plus the node ids and their embedding labels (as arrays) to the
# project helper. NOTE(review): presumably df_meta holds per-node properties and
# df_agg an aggregate of them — confirm against get_meta_data.
node_ids = df_emb["Node"].values
node_labels = df_emb["Embedding"].values
df_meta, df_agg = get_meta_data(G, node_ids, node_labels)

## Predicting structural properties
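To test the quality of the classification, I regress each structural property on dummy variables for the classifications, with the first class omitted as the baseline. Each coefficient then estimates how the mean of the property in that class differs from the baseline class, and the fit of the regression indicates how much of the property's variance is explained by the classification, as opposed to remaining within the classes.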

In [22]:
# Integer 0/1 indicator columns for the embedding labels; the first label is
# dropped so it serves as the reference category.
X = pd.get_dummies(
    df_meta["Embedding"],
    drop_first=True,
    dtype=int,
)

In [30]:
# Fit one OLS model per structural property, regressing it on the label dummies.
# The intercept column is added once, before the loop: the design matrix is the
# same for every property, so rebuilding it on each iteration was redundant.
X = sm.add_constant(X)

# struc_reg maps property name -> fitted statsmodels results object.
struc_reg = {}
for struc_prop in df_meta.columns[2:]:
    # columns[2:] skips the first two columns of df_meta
    # (presumably Node and Embedding — verify against get_meta_data).
    struc_reg[struc_prop] = sm.OLS(df_meta[struc_prop], X).fit()

In [64]:
# print(struc_reg["Clustering"].summary())

In [69]:
# print(struc_reg["Betweenness_centrality"].summary())

In [65]:
# print(struc_reg["Closeness_centrality"].summary())

In [68]:
# print(struc_reg["InDegree"].summary())

In [66]:
# print(struc_reg["OutDegree"].summary())

In [67]:
# print(struc_reg["Average_weight"].summary())

In [60]:
# For every structural property, put the fitted coefficients and their p-values
# side by side under a two-level column header (property, statistic).
# A dict with tuple keys gives the MultiIndex columns directly, so there is no
# need to create an empty frame first and fill its columns afterwards.
list_of_df = [
    pd.DataFrame(
        {
            (struc_prop, "params"): result.params,
            (struc_prop, "pvalues"): result.pvalues,
        }
    )
    for struc_prop, result in struc_reg.items()
]



In [63]:
# Join the per-property tables column-wise and show values to three decimals.
pd.concat(list_of_df, axis="columns").round(decimals=3)

Unnamed: 0_level_0,Clustering,Clustering,Betweenness_centrality,Betweenness_centrality,Closeness_centrality,Closeness_centrality,InDegree,InDegree,OutDegree,OutDegree,Average_weight,Average_weight
Unnamed: 0_level_1,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues
const,0.689,0.0,0.001,0.514,0.321,0.0,15.227,0.004,36.561,0.0,3.124,0.583
1,-0.105,0.0,0.001,0.779,0.156,0.0,7.429,0.32,-15.038,0.0,21.31,0.008
2,-0.136,0.0,0.011,0.0,0.205,0.0,71.913,0.0,12.799,0.001,17.11,0.049
3,-0.08,0.009,0.001,0.495,0.189,0.0,17.726,0.036,9.346,0.023,-1.997,0.825


## Predicting development variables

In [70]:
# Country-level metadata workbook; the path is relative to the working directory.
df_hdi = pd.read_excel(io="metadata_country/metadata_country.xlsx")

In [74]:
# Number of missing values in each column.
df_hdi.isna().sum()

Node                 0
Embedding            0
child_mortality    216
eys                 35
gnipc               35
hdi                 36
le                  33
mmr                 44
mys                 36
dtype: int64

In [None]:
# Label dummies over all rows of df_hdi, with the first label dropped as reference.
X_hdi = pd.get_dummies(
    df_hdi["Embedding"],
    drop_first=True,
    dtype=int,
)


In [90]:
# Rows that have a value for `eys`.
df_hdi.dropna(subset=["eys"])

Unnamed: 0,Node,Embedding,child_mortality,eys,gnipc,hdi,le,mmr,mys
0,AFG,0,10.009,10.483,2142.689,0.479,62.659,775.693,2.124
1,AUS,2,,23.248,46562.891,0.933,82.655,5.244,12.246
2,AUT,2,,16.033,52352.336,0.910,81.159,5.671,12.077
3,BHR,3,,16.069,45765.107,0.859,79.405,15.747,9.107
4,BEL,2,,19.792,50155.165,0.924,80.890,5.203,11.894
...,...,...,...,...,...,...,...,...,...
220,TLS,0,,13.186,5649.423,0.621,67.137,285.349,5.275
221,TON,2,,15.595,5699.218,0.723,70.607,86.412,10.883
222,TKM,0,,11.611,13657.417,0.725,68.782,5.762,10.825
223,UZB,0,,11.723,6489.982,0.701,70.475,30.688,11.357


In [92]:
# Regress each development variable on the label dummies. Rows missing the
# variable are dropped per variable, so every model is fitted on its own sample
# and the dummies are rebuilt from that sample.
hdi_reg = {}
for hdi_stat in df_hdi.columns[2:]:
    observed = df_hdi.dropna(subset=[hdi_stat])
    X_hdi = sm.add_constant(
        pd.get_dummies(observed["Embedding"], drop_first=True, dtype=int)
    )
    hdi_reg[hdi_stat] = sm.OLS(observed[hdi_stat], X_hdi).fit()

In [93]:
# For every development variable, put the fitted coefficients and their p-values
# side by side under a two-level column header (variable, statistic).
# A dict with tuple keys gives the MultiIndex columns directly, so there is no
# need to create an empty frame first and fill its columns afterwards.
list_of_df = [
    pd.DataFrame(
        {
            (hdi_stat, "params"): result.params,
            (hdi_stat, "pvalues"): result.pvalues,
        }
    )
    for hdi_stat, result in hdi_reg.items()
]

In [94]:
# Join the per-variable tables column-wise and show values to three decimals.
pd.concat(list_of_df, axis="columns").round(decimals=3)

Unnamed: 0_level_0,child_mortality,child_mortality,eys,eys,gnipc,gnipc,hdi,hdi,le,le,mmr,mmr,mys,mys
Unnamed: 0_level_1,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues,params,pvalues
const,11.913,0.13,12.679,0.0,15975.186,0.0,0.689,0.0,70.704,0.0,168.926,0.0,8.149,0.0
1,-4.916,0.712,0.207,0.714,-2075.6,0.579,-0.013,0.643,-2.039,0.164,37.371,0.422,0.064,0.914
2,37.848,0.063,2.867,0.0,17390.621,0.0,0.143,0.0,6.974,0.0,-139.783,0.002,2.753,0.0
3,-6.004,0.654,-0.227,0.685,-513.925,0.889,-0.021,0.453,-1.055,0.464,57.265,0.193,-0.733,0.218


### For reference

In [3]:
# Minimal statsmodels OLS template: regress income on education (plus an
# intercept) using the "Duncan" dataset from the "carData" collection.
duncan_prestige = sm.datasets.get_rdataset("Duncan", "carData")
duncan_data = duncan_prestige.data
X = sm.add_constant(duncan_data['education'])
Y = duncan_data['income']
model = sm.OLS(Y, X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 income   R-squared:                       0.525
Model:                            OLS   Adj. R-squared:                  0.514
Method:                 Least Squares   F-statistic:                     47.51
Date:                Wed, 13 Nov 2024   Prob (F-statistic):           1.84e-08
Time:                        14:33:51   Log-Likelihood:                -190.42
No. Observations:                  45   AIC:                             384.8
Df Residuals:                      43   BIC:                             388.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         10.6035      5.198      2.040      0.0