# Análisis

Este documento explora correlaciones entre los componentes de pilares de iEcosystems, tales como 

* I-Funding
* E-Funding
* Comparative Advantage
* Impact

Así como sus relaciones con métricas globales de los ecosistemas, como aquellas métricas que describen la 

* estabilidad
* robustez
* propensión al colapso



In [11]:
#Paquetes que utilizaremos

import numpy as np 
import pandas as pd 
import matplotlib as plt

import scipy.stats as stats

In [12]:
def significant_corr(df1,df2,a,b):
    '''
    Compute the Pearson correlation between two columns and report whether
    it is statistically significant.

    Inputs:
    - df1  : DataFrame holding column a
    - df2  : DataFrame holding column b; when it is None or empty, df1 is
             used for both columns
    - a    : label of the column read from df1
    - b    : label of the column read from df2

    Output : a tuple (B,N) where B is True when the p-value of the
    correlation is below 0.1. N is the Pearson correlation coefficient
    (not the p-value) when B is True, and 0 when B is False.
    '''
    # Fall back to df1 when no usable second frame is supplied.
    if df2 is None or df2.empty:
        df2 = df1

    corr, p_value = stats.pearsonr(df1[a], df2[b])

    # A NaN p-value (e.g. constant input) compares False and is reported
    # as not significant.
    if p_value < 0.1:
        return (True, corr)
    return (False, 0)
    
    
    

def print_significant_corr(df1,df2,a,b):
    '''
    Compute the Pearson correlation between two columns and, when it is
    statistically significant (p-value < 0.1), print the correlation
    coefficient and the p-value.

    Inputs:
    - df1  : DataFrame holding column a
    - df2  : DataFrame holding column b; when it is None or empty, df1 is
             used for both columns
    - a    : label of the column read from df1
    - b    : label of the column read from df2

    Output : None. Nothing is printed when the correlation is not
    significant.
    '''
    # Fall back to df1 when no usable second frame is supplied.
    if df2 is None or df2.empty:
        df2 = df1

    corr, p_value = stats.pearsonr(df1[a], df2[b])

    if p_value < 0.1:
        # Formatting (instead of '+') also copes with non-string labels.
        print(f'Correlation between:  {a}   and   {b}')
        print('**********************************************')
        print(f'Pearson Correlation: {corr}, p-value: {p_value}')
        print('*Statistically significant*')
        print('***********************************************')

## Carga de Datos

En la siguiente viñeta conseguimos los siguientes DataFrames, con base en los cuales vamos a realizar nuestro análisis.

In [13]:
# iEcosystems measurements, with per-pillar averages
df_means = pd.read_csv('PromediosPilares.csv')

# Per-city table of global graph metrics
grf_df = pd.read_csv('Tidy_DataFrame.csv')

# NOTE(review): presumably the iEcosystems pillars joined with the graph
# metrics, one row per city -- confirm against the CSV.
ieco_grf_df = pd.read_csv('iEco_with_Graph_metrics.csv')

# NOTE(review): looks like a Gephi node export for the CABA graph; it is not
# used in the cells visible here.
grf_sin_islas_nd = pd.read_csv('Gephi_sin_islas/CABA sin islas nodes.csv')
# iEcosystems metrics broken down into their components
full_ieco_df = pd.read_csv('Kevin/full_iEco.csv')

# Swap the index labels 0<->1 and 3<->5 (labels only; rows are not reordered).
# NOTE(review): presumably this gives each country the same row label it has
# in grf_df (Argentina=0, México=1, Uruguay=3, Brasil=5) -- confirm the
# original row order of full_ieco_df.
full_ieco_df = full_ieco_df.rename({0:1,1:0,5:3,3:5},axis='index')
# Column-wise concat: rows are matched on their index labels.
full_ieco_grf_df = pd.concat([full_ieco_df,grf_df],axis=1)


In [14]:
# DataFrame showing, for each capital city, the value of each iEcosystems pillar.
# Copy of the previous DataFrame, except that the rows are relabelled:
# Mexico->1,
# Arg->0,
# Chile->2,
# Bra->5,
# España->4,
# Uruguay->3
# NOTE(review): the df_means output displayed below still has México as row 0
# and Uruguay as row 5, so this note may describe a different frame -- confirm.

df_means

Unnamed: 0.1,Unnamed: 0,1 Foundational Institutions,2.01 I-Human Capital,2.02 I-Funding,2.03 I-Infrastructure,2.04 I-Demand,2.05 I-Culture & Incentives,3.01 E-Human Capital,3.02 E-Funding,3.03 E-Infrastructure,3.04 E-Demand,3.05 E-Culture & Incentives,4 Comparative Advantage,5 Impact
0,México,3.522,2.278,1.56,3.3175,3.11,2.515,3.115,2.3,3.6275,3.43,2.97,2.93,3.725455
1,Argentina,3.169,2.034667,1.74,3.31,2.716667,1.795,3.075,1.723333,3.7125,3.175,2.965,2.58,3.781818
2,Chile,3.913,2.580667,1.2025,3.6925,2.96,1.955,4.155,2.206667,3.9975,3.205,3.455,3.11,4.091818
3,Brasil,3.177,1.828,1.84,3.265,2.876667,2.755,2.975,1.991667,3.6775,3.46,3.438333,2.74,3.671818
4,España,3.929,2.992,1.95,3.9525,3.076667,3.83,3.19,2.423333,4.2925,3.365,2.626667,2.84,3.864545
5,Uruguay,3.78,2.171333,1.5025,3.615,2.57,1.245,3.045,2.005,3.905,2.74,3.143333,2.48,4.147576


In [15]:
# DataFrame showing, for each capital city, the values of a collection
# of global metrics observed in our ecosystems

grf_df

Unnamed: 0,Ciudad,País,avg strength,weight,Degree,Weighted Degree,Eccentricidad,Clustering,Diámetro,Radio,Camino más corto promedio,Transitividad,Eficiencia Global,Small Worldness,Core Ratio,Central Point Dominance,Spectral radius
0,CABA,Argentina,3.244633,1.481579,4.385965,14.017544,4.798246,0.148791,6,3,3.355669,0.107392,0.325773,0.960807,0.495614,0.251,1.868
1,CDMX,México,3.477388,1.388629,3.658863,12.963211,5.622074,0.07594,7,4,3.820318,0.05,0.28932,0.677149,0.421405,0.202,1.91
2,Santiago de Chile,Chile,3.417211,1.462564,3.948718,13.764103,5.041026,0.134037,6,3,3.227544,0.100744,0.338889,0.906814,0.430769,0.525,1.89
3,Montevideo,Uruguay,3.227742,2.749495,7.747475,25.939394,4.479798,0.207747,6,3,3.081116,0.224523,0.360837,1.274519,0.565657,0.179,1.778
4,Madrid,España,3.518387,1.362343,3.790795,12.861925,6.079498,0.12051,8,5,3.783517,0.081917,0.294653,1.141033,0.439331,0.184,1.92
5,Sao Paulo,Brasil,3.430669,1.253704,3.37037,11.694444,6.726852,0.129928,8,4,4.324031,0.078571,0.266719,1.211794,0.421296,0.237,1.983


In [None]:
# DataFrame showing, for each capital city, the values of
# the iEcosystems metrics together with the global
# metrics observed in our ecosystems.

ieco_grf_df

In [None]:
# Pairwise Pearson correlations between all numeric columns. Non-numeric
# columns are dropped explicitly because DataFrame.corr() raises on them
# in pandas >= 2.0.
ieco_global_corr_df = ieco_grf_df.select_dtypes(include='number').corr()
pd.set_option("display.max_rows", None, "display.max_columns", None)
#ieco_global_corr_df

import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white")

# Generate a mask for the upper triangle
# (builtin bool: the np.bool alias was removed in NumPy 1.24)
mask = np.triu(np.ones_like(ieco_global_corr_df, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(32, 24))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(ieco_global_corr_df, mask=mask, cmap=cmap, annot=True, vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
# DataFrame showing, for each capital city, the values of
# the iEcosystems metrics broken down into their components.

full_ieco_df

In [None]:
#------------------
# Exploration 1
#------------------

# NOTE(review): ieco_df is not defined in the cells visible here; it is
# assumed to be created elsewhere in the notebook -- confirm.
list1=list(ieco_df.keys())     # iEcosystems pillars
list1.remove('Unnamed: 0')     # drop the column that does not hold numeric values

# Names of the global graph metrics. Label columns ('Ciudad', 'País') and
# pandas' auto-named 'Unnamed: N' columns are left out: they are not
# metrics, so correlating against them is meaningless.
list2=[key for key in grf_df.keys()
       if key not in ('Ciudad', 'País') and not str(key).startswith('Unnamed')]

positive_pairs=[]
negative_pairs=[]

# For every (iEco pillar, graph metric) pair, keep the pair when the
# correlation is significant, split by sign.
for key1 in list1:
    for key2 in list2:
        sig,corr=significant_corr(ieco_grf_df,ieco_grf_df,key1,key2)
        if not sig:
            continue
        if corr>0:
            positive_pairs.append((key1,key2))      # significant and > 0
        else:
            negative_pairs.append((key1,key2))      # significant and <= 0

In [None]:
# The printed text lists the significant positive correlations we found (+)

for first_key, second_key in positive_pairs:
    print('')
    print_significant_corr(ieco_grf_df, ieco_grf_df, first_key, second_key)
    print('')
    

In [None]:
# The printed text lists the significant negative correlations we found (-)

for first_key, second_key in negative_pairs:
    print('')
    print_significant_corr(ieco_grf_df, ieco_grf_df, first_key, second_key)
    print('')
    

In [None]:
#------------------
# Exploration 2
#------------------

# Components of the demand pillars: I-Demand (2.04x) and E-Demand (3.04x)
list1=['2.04a Government procurement of advanced technology (GCI)',
       '2.04b University-industry research collaborations (GII)',
       '2.04c Trade, Competition & Market scale (GII)',
       '3.04a Buyer sophistication (GCI 4.0)',
       '3.04b Domestic Market Scale (GII)']

# Names of the global graph metrics. Label columns ('Ciudad', 'País') and
# pandas' auto-named 'Unnamed: N' columns are left out: they are not
# metrics, so correlating against them is meaningless.
list2=[key for key in grf_df.keys()
       if key not in ('Ciudad', 'País') and not str(key).startswith('Unnamed')]

positive_pairs_2=[]
negative_pairs_2=[]

# For every (demand component, graph metric) pair, keep the pair when the
# correlation is significant, split by sign.
for key1 in list1:
    for key2 in list2:
        sig,corr = significant_corr(full_ieco_grf_df,full_ieco_grf_df,key1,key2)
        if not sig:
            continue
        if corr > 0:
            positive_pairs_2.append((key1,key2))        # significant positive correlation
        else:
            negative_pairs_2.append((key1,key2))        # significant non-positive correlation

In [None]:
# The printed text lists the significant positive correlations we found (+)

for first_key, second_key in positive_pairs_2:
    print('')
    print_significant_corr(full_ieco_grf_df, full_ieco_grf_df, first_key, second_key)
    print('')
    

In [None]:
# The printed text lists the significant negative correlations we found (-)

for first_key, second_key in negative_pairs_2:
    print('')
    print_significant_corr(full_ieco_grf_df, full_ieco_grf_df, first_key, second_key)
    print('')
    

In [None]:
#------------------
# Exploration 3
#------------------

# Last nine columns of full_ieco_df: the components of the Impact indicator
list1=list(full_ieco_df.keys())[-9:]

# Names of the global graph metrics. Label columns ('Ciudad', 'País') and
# pandas' auto-named 'Unnamed: N' columns are left out: they are not
# metrics, so correlating against them is meaningless.
list2=[key for key in grf_df.keys()
       if key not in ('Ciudad', 'País') and not str(key).startswith('Unnamed')]

positive_pairs_3=[]
negative_pairs_3=[]

# For every (Impact component, graph metric) pair, keep the pair when the
# correlation is significant, split by sign.
for key1 in list1:
    for key2 in list2:
        sig,corr=significant_corr(full_ieco_grf_df,full_ieco_grf_df,key1,key2)
        if not sig:
            continue
        if corr>0:
            positive_pairs_3.append((key1,key2))
        else:
            negative_pairs_3.append((key1,key2))

In [None]:
# The printed text lists the significant positive correlations we found (+)

for first_key, second_key in positive_pairs_3:
    print('')
    print_significant_corr(full_ieco_grf_df, full_ieco_grf_df, first_key, second_key)
    print('')
    

In [None]:
# The printed text lists the significant negative correlations we found (-)

# Iterate the negative pairs: the original looped over positive_pairs_3,
# which just repeated the output of the previous cell.
for keys in negative_pairs_3:
    print('')
    print_significant_corr(full_ieco_grf_df,full_ieco_grf_df,keys[0],keys[1])
    print('')

In [None]:
from sklearn.linear_model import LinearRegression
from scipy import stats
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d



def lin_reg_p_val(reg,X,y):
    '''
    Two-sided p-values for the coefficients of a fitted linear regression.

    Inputs:
    - reg : fitted linear model exposing predict() and coef_ (e.g. a
            LinearRegression). When it has a truthy fit_intercept
            attribute, the intercept is taken into account in the standard
            errors and in the degrees of freedom.
    - X   : np.array of shape (n_samples, n_features) that reg was fitted on
    - y   : np.array of targets, shape (n_samples,) or (n_samples, n_targets)

    Output : np.array of shape (n_targets, n_features) -- a single row when
    y is 1-D -- holding, for each coefficient, the p-value of the t-test
    "coefficient == 0". All entries are NaN when no residual degrees of
    freedom are left.

    Raises : numpy.linalg.LinAlgError when the design matrix is singular
    (e.g. perfectly collinear predictors).
    '''
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    n_samples = X.shape[0]

    # A model fitted with an intercept has one extra estimated parameter:
    # the covariance must come from the design matrix augmented with a
    # column of ones, and one more degree of freedom is consumed.
    has_intercept = bool(getattr(reg, 'fit_intercept', False))
    design = np.column_stack([np.ones(n_samples), X]) if has_intercept else X
    dof = n_samples - design.shape[1]

    coef = np.atleast_2d(reg.coef_)
    if dof <= 0:
        # The fit is exact (or under-determined): p-values are undefined.
        return np.full(coef.shape, np.nan)

    # Residual variance estimate, one value per target.
    residuals = reg.predict(X) - y
    sigma2 = np.atleast_1d(np.sum(residuals ** 2, axis=0) / dof)

    # diag((D^T D)^-1); the leading entry belongs to the intercept.
    unscaled_var = np.diagonal(np.linalg.inv(design.T @ design))
    if has_intercept:
        unscaled_var = unscaled_var[1:]

    # Standard error of every coefficient, then its t statistic and p-value.
    se = np.sqrt(np.outer(sigma2, unscaled_var))
    t = coef / se
    p = 2 * stats.t.sf(np.abs(t), dof)
    return p

def plot_linear_model(reg,X,y,key1,key2,predict):
    '''
    Plot a two-predictor linear model as a plane in 3D, together with the
    observations it was fitted on.

    Inputs:
    - reg :     fitted regression model; coef_[0], coef_[1] and intercept_
                are read to draw the plane
    - X :       array whose columns 0 and 1 hold the two predictor variables
    - y :       observed values of the predicted variable, one per row of X
    - key1 :    string for the x-axis label (first predictor)
    - key2 :    string for the y-axis label (second predictor)
    - predict : string for the z-axis label (predicted variable)

    Outputs:
    None; shows a 3D figure with the observations as a scatter and the
    fitted plane as contour lines.
    '''

    ax = plt.axes(projection='3d')      # 3D axes on the current figure

    # Keep the 1..5 window the plot has always used, but widen each axis
    # when the data fall outside it so that no observation is cut off.
    x_lo, x_hi = min(1, X[:,0].min()), max(5, X[:,0].max())
    y_lo, y_hi = min(1, X[:,1].min()), max(5, X[:,1].max())

    a = np.linspace(x_lo, x_hi, 30)
    b = np.linspace(y_lo, y_hi, 30)

    A,B = np.meshgrid(a,b)              # 30 x 30 grid over the plotted window
    C = A*reg.coef_[0] + B*reg.coef_[1] + reg.intercept_   # fitted plane evaluated on the grid

    # c=y is required for cmap to take effect: without colour data
    # matplotlib ignores the colormap.
    ax.scatter3D(X[:,0], X[:,1], y, c=y, cmap='Greens')    # observations, shaded by their y value
    ax.contour3D(A, B, C, 50, cmap='binary')               # contour lines of the fitted plane

    ax.set_title('Linear Model')
    ax.set_xlabel(key1, fontsize=9)
    ax.set_ylabel(key2, fontsize=9)
    ax.set_zlabel(predict, fontsize=9)
    ax.set_xlim((x_lo, x_hi))
    ax.set_ylim((y_lo, y_hi))
    plt.tight_layout()
    plt.show()

In [None]:
'''
4d linear model (three predictors, one target)
Has too many responses and is likely not useful.
Didn't try higher dimensions because it was likely to happen the same (any n+1 points are in an n-plane)
'''
import itertools

# NOTE(review): ieco_df is not defined in the cells visible here; it is
# assumed to be created elsewhere in the notebook -- confirm.
ieco_key_list=list(ieco_df.keys())
ieco_key_list.remove('Unnamed: 0')
ieco_key_list.remove('4 Comparative Advantage')
ieco_key_list.remove('5 Impact')                        # iEco pillars without 4: C. Advantage, 5: Impact


ieco_key_predictors=['4 Comparative Advantage','5 Impact']     # iEco columns we want to predict (the targets)


grf_key_list=list(grf_df.keys())                        # list of graph global metrics
grf_key_list.remove('Ciudad')
grf_key_list.remove('País')


# Every unordered triple of distinct pillars is tried once: reordering the
# predictors gives the same regression, so ordered triples only repeat output.
for key1, key2, key3 in itertools.combinations(ieco_key_list, 3):
    # One row per city, one column per predictor. (Concatenating the columns
    # end to end and reshaping would mix values of one variable across rows.)
    X=np.column_stack([ieco_df[key1].to_numpy(),ieco_df[key2].to_numpy(),ieco_df[key3].to_numpy()])

    for predict in ieco_key_predictors:
        y=ieco_df[predict].to_numpy()
        reg=LinearRegression().fit(X,y)
        p=lin_reg_p_val(reg,X,y)
        score=reg.score(X,y)

        # Report only models where all three coefficients are significant
        # and the regression score is above 0.9.
        if p[0][0]<.05 and p[0][1]<.05 and p[0][2]<.05 and score>.9:
            print('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -')
            print('Predictor metrics: '+key1+', '+key2+', '+key3)
            print('Predicted: '+predict)
            print('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -')
            print('Regression Score: ' + str(score) + ', p-value: ' + str(p) )
            print('Regression Coefficients: ' + str(reg.coef_) )
            print('')
            print('')
            print('')


In [None]:
'''
3D linear model for predicting impact and comparative advantage from iEcosystem metrics
'''
import itertools

# Relies on ieco_key_list and ieco_key_predictors as left by the previous cell.
# Every unordered pair of distinct pillars is tried once: swapping the two
# predictors gives the same regression, so ordered pairs only repeat output.
for key1, key2 in itertools.combinations(ieco_key_list, 2):
    # One row per city, one column per predictor. (Concatenating the columns
    # end to end and reshaping would mix values of one variable across rows.)
    X=np.column_stack([ieco_df[key1].to_numpy(),ieco_df[key2].to_numpy()])

    for predict in ieco_key_predictors:    # each target: Comparative Advantage and Impact
        # values of the variable being predicted
        y=ieco_df[predict].to_numpy()

        # fit the linear regression and get the coefficient p-values
        reg=LinearRegression().fit(X,y)
        p=lin_reg_p_val(reg,X,y)
        score=reg.score(X,y)

        # keep models where both p-values are small enough and the score is greater than 0.8
        if p[0][0]<.05 and p[0][1]<.05 and score>.8:
            print('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -')
            print('Predictor metrics: '+key1+', '+key2)
            print('Predicted: '+predict)
            print('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -')
            print('Regression Score: ' + str(score) + ', p-value: ' + str(p) )
            print('Regression Coefficients: ' + str(reg.coef_) )
            print('Intercept: ' + str(reg.intercept_))
            plot_linear_model(reg,X,y,key1,key2,predict)
            print('')
            print('')
            print('')

In [None]:
'''
3D linear models for predicting impact and comparative advantage using graph metrics
'''
import itertools

# Global graph metrics. Label columns ('Ciudad', 'País') and pandas'
# auto-named 'Unnamed: N' columns are left out: they are not metrics.
grf_key_list=[key for key in grf_df.keys()
              if key not in ('Ciudad', 'País') and not str(key).startswith('Unnamed')]

ieco_key_predictors=['4 Comparative Advantage','5 Impact']    # the two columns we want to predict

# Every unordered pair of distinct graph metrics is tried once: swapping the
# two predictors gives the same regression, so ordered pairs only repeat output.
for key1, key2 in itertools.combinations(grf_key_list, 2):
    # One row per city, one column per predictor. (Concatenating the columns
    # end to end and reshaping would mix values of one variable across rows.)
    X = np.column_stack([ieco_grf_df[key1].to_numpy(),ieco_grf_df[key2].to_numpy()])

    for predict in ieco_key_predictors:
        # values of the variable being predicted
        y=ieco_grf_df[predict].to_numpy()

        # fit the linear regression and get the coefficient p-values
        reg=LinearRegression().fit(X,y)
        p=lin_reg_p_val(reg,X,y)
        score=reg.score(X,y)

        # keep models where both p-values are small enough and the score is greater than 0.8
        if p[0][0]<.05 and p[0][1]<.05 and score>.8:
            print('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -')
            print('Predictor metrics: '+key1+', '+key2)
            print('Predicted: '+predict)
            print('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -')
            print('Regression Score: ' + str(score) + ', p-value: ' + str(p) )
            print('Regression Coefficients: ' + str(reg.coef_) )
            print('Intercept: ' + str(reg.intercept_))
            plot_linear_model(reg,X,y,key1,key2,predict)
            print('')
            print('')
            print('')

In [None]:
'''
3D linear models for predicting iEcosystem metrics using graph metrics
'''
import itertools

# Global graph metrics. Label columns ('Ciudad', 'País') and pandas'
# auto-named 'Unnamed: N' columns are left out: they are not metrics.
grf_key_list=[key for key in grf_df.keys()
              if key not in ('Ciudad', 'País') and not str(key).startswith('Unnamed')]

# NOTE(review): ieco_df is not defined in the cells visible here; it is
# assumed to be created elsewhere in the notebook -- confirm.
ieco_key_list=list(ieco_df.keys())
ieco_key_list.remove('Unnamed: 0')                            # iEco metrics to be predicted

# Every unordered pair of distinct graph metrics is tried once: swapping the
# two predictors gives the same regression, so ordered pairs only repeat output.
for key1, key2 in itertools.combinations(grf_key_list, 2):
    # One row per city, one column per predictor. (Concatenating the columns
    # end to end and reshaping would mix values of one variable across rows.)
    X = np.column_stack([ieco_grf_df[key1].to_numpy(),ieco_grf_df[key2].to_numpy()])

    for predict in ieco_key_list:               # each iEco metric in turn is the target
        # values of the variable being predicted
        y=ieco_grf_df[predict].to_numpy()

        # fit the linear regression and get the coefficient p-values
        reg=LinearRegression().fit(X,y)
        p=lin_reg_p_val(reg,X,y)
        score=reg.score(X,y)

        # keep models where both p-values are small enough and the score is greater than 0.8
        if p[0][0]<.05 and p[0][1]<.05 and score>.8:
            print('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -')
            print('Predictor metrics: '+key1+', '+key2)
            print('Predicted: '+predict)
            print('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -')
            print('Regression Score: ' + str(score) + ', p-value: ' + str(p) )
            print('Regression Coefficients: ' + str(reg.coef_) )
            print('Intercept: ' + str(reg.intercept_))
            plot_linear_model(reg,X,y,key1,key2,predict)
            print('')
            print('')
            print('')
                