# Analyse de réseaux : les économistes de DBPedia



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx

In [None]:
###  Installation du widget jupyter en plus de la librairie
# jupyter labextension install @jupyter-widgets/jupyterlab-manager jupyter-cytoscape
import ipycytoscape as cy

In [None]:
import pprint
import csv
import ast
import re

from collections import Counter, defaultdict
from operator import itemgetter

import sqlite3 as sql
import time

from importlib import reload
from shutil import copyfile

import math

## Base de données

In [None]:
### Définir les adresses des fichiers, l'existant et celui à créer
db = 'data/sparql_queries.db'

In [None]:
### Fonction qui récupère et met en forme le temps au moment de l'exécution

# définition
def timestamp_formatted_for_file_name():
    """Return the current local time as a ``YYYYMMDD_HHMMSS`` string.

    The result contains only digits and one underscore, so it can be
    embedded directly in a file name.
    """
    return time.strftime('%Y%m%d_%H%M%S')

# exécution
timestamp_formatted_for_file_name()

In [None]:
##  Noter que la différence de suffixe, en soi totalement arbitraire, 
#  dépend du fait que dans la configuration de .gitignore, .sqlite est exclu du verisonnement GIT
#  contrairement à .db qui est versionné
timestamped_db_copy = 'data/sparql_queries_' + timestamp_formatted_for_file_name() + '.sqlite'

In [None]:
### Create a timestamped backup copy of the database file (manual versioning)
# WARNING: the database file must already exist at the path stored in `db`!
## This step is only useful when the database is about to be modified (write access),
# so that the last state before modification is preserved.

## Documentation:
# https://docs.python.org/3/library/shutil.html#shutil.copyfile
# Usage pattern: copied_db = copyfile(original_db, timestamped_db_copy)

copied_db = copyfile(db, timestamped_db_copy)
copied_db

## Relations pendant les études

Personnes ayant étudié dans la même université dans une fourchette de 12 ans : on émet l'hypothèse qu'ils/elles ont potentiellement pu nouer des relations susceptibles d'influencer les parcours et choix ultérieurs, grâce à des contacts prolongés avec d'anciens camarades d'études.

In [None]:
### 
#  On part de la date de naissance et on postule un croisement entre une et plusieurs années
# en admettant que l'âge des études soit approximativement le même

etudes_meme_uni = """
SELECT REPLACE(p.uri_entity , 'http://dbpedia.org/resource/', '') person_1,
REPLACE(p1.uri_entity , 'http://dbpedia.org/resource/', '') person_2,
REPLACE(p.value, 'http://dbpedia.org/resource/', '') alma_mater,
p2.value birth_year_1,
p3.value birth_year_2,
p4.value long,
p5.value lat
FROM property p
  JOIN property p1 ON p1.value = p.value AND p1.pk_property > p.pk_property AND p1.property LIKE '%almaMater'
  JOIN property p2 ON p2.uri_entity = p.uri_entity AND p2.property LIKE '%irthYe%' AND p2."source" LIKE '%8 mai 2021%'
  JOIN property p3 ON p3.uri_entity = p1.uri_entity AND p3.property LIKE '%irthYe%' AND p3."source" LIKE '%8 mai 2021%'
  LEFT JOIN property p4 ON
	p4.uri_entity = p1.value
	AND p4.property LIKE '%long'
LEFT JOIN property p5 ON
	p5.uri_entity = p1.value
	AND p5.property LIKE '%lat'
WHERE  p.property LIKE '%almaMater'
AND p3.value BETWEEN (p2.value - 6) AND (p2.value + 6);
"""

In [None]:
### Run the "same university" query and load the resulting pairs of persons into a DataFrame
cn = sql.connect(db)
try:
    same_univ = pd.read_sql_query(etudes_meme_uni, cn)
finally:
    # Release the SQLite handle even when the query fails
    cn.close()
same_univ.head()

In [None]:
universities = same_univ.groupby(['alma_mater']).size()
print(len(universities))
list(universities.sort_values(ascending=False).items())[:100] #[10:20]

In [None]:
G = nx.from_pandas_edgelist(same_univ, 'person_1', 'person_2', edge_attr='alma_mater')
print(nx.info(G))

In [None]:
G.is_multigraph(), G.is_directed(), G.number_of_nodes(), G.number_of_edges(), nx.is_connected(G), nx.density(G)

In [None]:
nx.number_connected_components(G)

In [None]:
## Centralité et dispersion du degré
df_degree = pd.DataFrame(list(nx.degree(G)), columns = ['person', 'degree'])
df_degree.head()
df_degree.set_index('person',inplace=True)
df_degree.head()

In [None]:
df_degree['degree'].describe()

In [None]:
gr_degree = df_degree.groupby(['degree']).size()
gr_degree.sort_index()

In [None]:
### Degree distribution: for each degree value (x axis), the number of nodes having it (bar height)


# Count how many persons share each degree value
gr_degree = df_degree.groupby(['degree']).size()


# x positions are the distinct degree values, bar heights are their frequencies
objects = list(gr_degree.index)
eff = list(gr_degree)

print(objects[:5])
plt.figure(figsize=(15,7))
plt.bar(objects, eff)

In [None]:
df_degree[df_degree['degree'] > 80]

In [None]:
nx.number_connected_components(G)

In [None]:
### Composantes du graphe
# Une composante principale avec 1402 individus et de multiples petites composantes
# https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.connected_components.html
c_comp = nx.connected_components(G)
print(type(c_comp))
## Length of generator as set of nodes = number of nodes
df_c_comp = pd.DataFrame([len(c) for c in sorted(c_comp, key=len, reverse=True)], columns=['eff'])
gs_c_comp = df_c_comp.groupby(['eff']).size()
gs_c_comp.sort_index()

In [None]:
### Créer les sous-graphes correspondant aux composantes
#  https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.connected_components.html
S = [[len(c), G.subgraph(c).copy()] for c in nx.connected_components(G)]
# type(S)

In [None]:
### Select the component(s) with 8 nodes, as a list of [size, subgraph] pairs
a = [[s[0], s[1]] for s in S  if s[0] == 8]
a

In [None]:
# First selected component
sub_g = a[0][1]
# Map each edge (u, v) to the list of universities stored in its 'alma_mater' attribute.
# A plain loop replaces the former set comprehension, which was executed only for
# its side effects and displayed a meaningless {None}.
edge_labels = defaultdict(list)
for node_u, node_v, attrs in sub_g.edges.data():
    edge_labels[(node_u, node_v)].append(attrs['alma_mater'])
edge_labels

In [None]:

plt.figure(figsize=(20,10))
pos = nx.circular_layout(sub_g)
el = nx.draw_networkx_edge_labels(sub_g, pos, edge_labels = edge_labels, font_size = 10, font_color = 'red')
# print(edge_labels)
nx.draw_networkx(sub_g,pos)


In [None]:
### Select the component(s) with 7 nodes, as a list of [size, subgraph] pairs
a7 = [[s[0], s[1]] for s in S  if s[0] == 7]
a7

In [None]:
# Draw every component stored in a7 in its own figure, labelling each edge
# with the content of its 'alma_mater' attribute.
for g in a7:
    sub_g = g[1]
    # Collect the edge labels with an explicit loop (no comprehension used for side effects)
    edge_labels = defaultdict(list)
    for node_u, node_v, attrs in sub_g.edges.data():
        edge_labels[(node_u, node_v)].append(attrs['alma_mater'])
    plt.figure(figsize=(20,10))
    pos = nx.circular_layout(sub_g)
    el = nx.draw_networkx_edge_labels(sub_g, pos, edge_labels = edge_labels, font_size = 10, font_color = 'red')
    # print(edge_labels)
    nx.draw_networkx(sub_g,pos)

## Sous-graphe

In [None]:
### Choisir la/les composante(s) à 1402 noeuds 
a1402= [[s[0], s[1]] for s in S  if s[0] == 1402]
print(a1402)
sub_g = a1402[0][1]

In [None]:
print(nx.info(sub_g))

In [None]:
nx.is_connected(sub_g), nx.density(sub_g)

In [None]:
### Returns all maximal cliques in an undirected graph.
# https://networkx.org/documentation/stable/reference/algorithms/clique.html?highlight=cliques

cliques = [(len(c),c) for c in list(nx.find_cliques(sub_g)) if len(c) > 2]
print(len(cliques))
cliques[-3:]

In [None]:
df_cliques = pd.DataFrame([l[0] for l in cliques], columns = ['eff'])
gs_cliques = df_cliques.groupby(['eff']).size()
gs_cliques.sort_index()

In [None]:
[c for c in cliques if c[0] == 32]

In [None]:
pos = ''
cc = [c[1] for c in cliques if c[0] == 32]

sub_32 = G.subgraph(cc[0]).copy()
ll = [{(l[0],l[1]):l[2]['alma_mater']} for l in list(sub_32.edges.data())]
edge_labels = defaultdict(list)
e = {edge_labels[key].append(sub[key]) for sub in ll for key in sub} 
plt.figure(figsize=(20,10))
pos = nx.spring_layout(sub_32)
el = nx.draw_networkx_edge_labels(sub_32, pos, edge_labels = edge_labels, font_size = 7, alpha = 0.7, font_color = 'green')
# print(edge_labels)
nx.draw_networkx(sub_32,pos)

In [None]:
### https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.connectivity.kcomponents.k_components.html#networkx.algorithms.connectivity.kcomponents.k_components

##  exécution très longue : vérifier
# nx.k_components(sub_g)

In [None]:
###https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.approximation.kcomponents.k_components.html?highlight=k_components#networkx.algorithms.approximation.kcomponents.k_components

from networkx.algorithms import approximation as apxa
##  exécution très longue : vérifier
# apxa.k_components(sub_g, min_density = 0.95)

## Sous-graphe : degré > 13

In [None]:
# 'person' was moved into the index of df_degree by set_index('person', inplace=True),
# so it is no longer a column: select the index labels whose degree exceeds 13.
lk_13 = list(df_degree.index[df_degree['degree'] > 13])
lk_13[:3]

In [None]:
#  https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.connected_components.html
dk_13 = G.subgraph(lk_13).copy()
print(nx.info(dk_13))

In [None]:
nx.number_connected_components(dk_13)

In [None]:
### Composantes du graphe
# Une composante principale avec 1402 individus et de multiples petites composantes
# https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.connected_components.html
c_comp = nx.connected_components(dk_13)
print(type(c_comp))
## Length of generator as set of nodes = number of nodes
df_c_comp = pd.DataFrame([len(c) for c in sorted(c_comp, key=len, reverse=True)], columns=['eff'])
gs_c_comp = df_c_comp.groupby(['eff']).size()
gs_c_comp.sort_index()

In [None]:
#  https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.connected_components.html
S = [[len(c), G.subgraph(c).copy()] for c in nx.connected_components(dk_13)]
# type(S)

In [None]:
a = [[s[0], s[1]] for s in S  if s[0] == 542]
g542 = a[0][1]

In [None]:
fig = plt.figure(figsize=(400,200))
pos = nx.kamada_kawai_layout(g542)
nx.draw_networkx(g542, pos , font_color = 'red', font_size=30, edge_color='LightGray' )
plt.savefig("graphs/dk_13.jpg", format="jpg")
plt.savefig("graphs/dk_13.pdf", format="pdf")
plt.show()

In [None]:
# The 'betweenness' node attribute is only attached to g542 further down in the notebook,
# so reading it here fails when the cells run top to bottom. Compute the centrality
# locally instead; nodes are visited in the graph's own node order.
g542_betweenness = nx.betweenness_centrality(g542)
node_sizes = [g542_betweenness[node] * 100000 + 1000 for node in g542.nodes]
print(len(node_sizes))
node_sizes[:4]

In [None]:
fig = plt.figure(figsize=(200,100))
pos = nx.kamada_kawai_layout(g542)
nx.draw_networkx(g542, pos, node_color = 'DarkBlue', font_color = 'CornflowerBlue',\
                 font_size=30, node_size=node_sizes, edge_color='LightGray' )
plt.savefig("graphs/g542.pdf", format="pdf")
plt.savefig("graphs/g542.jpg", format="jpg")
plt.show()

In [None]:
# The 'eigenvector' node attribute is only attached to g542 further down in the notebook,
# so reading it here fails when the cells run top to bottom. Compute the centrality
# locally instead; nodes are visited in the graph's own node order.
g542_eigenvector = nx.eigenvector_centrality(g542)
node_sizes = [g542_eigenvector[node] * 100000 + 1000 for node in g542.nodes]
print(len(node_sizes))
node_sizes[:4]

In [None]:
fig = plt.figure(figsize=(200,100))
pos = nx.kamada_kawai_layout(g542)
nx.draw_networkx(g542, pos, node_color = 'DarkBlue', font_color = 'CornflowerBlue',\
                 font_size=30, node_size=node_sizes, edge_color='LightGray' )
plt.savefig("graphs/g542_eigenvector.pdf", format="pdf")
plt.savefig("graphs/g542_eigenvector.jpg", format="jpg")
plt.show()

## Ajouter les paramètres de centralité du sous-graphe g542



### Centralité de degré

In [None]:
# On construit ici le dictionnaire selon le format souhaité
degree = dict([(d[0], {'degree': d[1]}) for d in nx.degree(g542)])
print(list(degree)[:3])

In [None]:
nx.set_node_attributes(g542, degree)

### Centralité de proximité

In [None]:
closeness = nx.closeness_centrality(g542)

In [None]:
nx.set_node_attributes(g542, closeness, 'closeness')

### Centralité d'intermédiarité

In [None]:
betweenness = nx.betweenness_centrality(g542)

In [None]:
nx.set_node_attributes(g542, betweenness, 'betweenness')

### Centralité d'influence ou de pouvoir (_eigenvector_)

In [None]:
### https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.eigenvector_centrality.html?highlight=eigenvector#networkx.algorithms.centrality.eigenvector_centrality
# noter la référence biliographique dans la documentation de la fonction

eigenvector = nx.eigenvector_centrality(g542)
nx.set_node_attributes(g542, eigenvector, 'eigenvector')

## Résultat

In [None]:
print(list(g542.nodes.data())[:5])

## Exporter les attributs pour les explorer 

In [None]:
### Exporter les attributs vers un DataFrame afin de les explorer
export = pd.DataFrame(g542.nodes.data(), columns = ['id', 'attributes'])
export

In [None]:
### Décomposer le champs attributs en colonnes
# https://stackoverflow.com/questions/38231591/split-explode-a-column-of-dictionaries-into-separate-columns-with-pandas
pd.json_normalize(export.attributes)

In [None]:
### appliquer au DataFrame et supprimer la colonne attributes
# en même temps mettre le nom en index
export = export.join(pd.json_normalize(export.attributes)).set_index('id')
export.drop(columns=['attributes'], inplace=True)

In [None]:
export.loc[['Klaus_Schwab']]

In [None]:
export['closeness'].describe()

In [None]:
export['betweenness'].describe()

In [None]:
export['eigenvector'].describe()

In [None]:
export.sort_values(by=['eigenvector'], ascending=False).iloc[50:90]  # .iloc[50:90]   .head(50)

In [None]:
export.loc[['Klaus_Schwab']]

In [None]:
export.loc[export.index.isin(['Klaus_Schwab'])]  ### négation :  loc[~export.index.isin(['Ellen_Johnson_Sirleaf'])]  

## Explorer les relations entre propriétés structurales (centralités) et/ou attributs

### Relations entre deux centralités

In [None]:
export[['degree','betweenness']].plot(x = 'degree', y = 'betweenness',kind='scatter',figsize=(30,30))
for i,row in list(export[['degree','betweenness']].iterrows()):
    plt.annotate(i,(row['degree'], row['betweenness']), fontsize = 9, stretch='semi-expanded')

In [None]:
export.sort_values(by=['eigenvector'], ascending=False).iloc[80:100]

In [None]:
len(export), len(export.loc[~export.index.isin(['Ellen_Johnson_Sirleaf', 'Fischer_Black', 'Shahid_Javed_Burki'])])

In [None]:
# Persons left out of the closeness / eigenvector comparison
excluded_persons = ['Ellen_Johnson_Sirleaf', 'Fischer_Black', 'Shahid_Javed_Burki']
# Build one boolean mask aligned on export's index. The former chained indexing applied
# a mask computed on the full frame to an already filtered frame, which forces pandas
# to reindex the mask.
keep_rows = ~export.index.isin(excluded_persons) & (export['eigenvector'] > 0.05)
exp_Schwab = (
    export.loc[keep_rows, ['closeness', 'eigenvector']]
    .sort_values('eigenvector', ascending=False)
    .copy(deep=True)
)

In [None]:
exp_Schwab.sort_values('eigenvector', ascending=False).plot(x = 'closeness', y = 'eigenvector',kind='scatter',figsize=(30,40))
for i,row in list(exp_Schwab.iterrows()):
    plt.annotate(i,(row['closeness'], row['eigenvector']), fontsize = 10, stretch='expanded')
plt.savefig('plots/closeness_eigenvector_20210516.jpg')

## Ego réseau

In [None]:
schwab_ego = nx.ego_graph(g542, 'Klaus_Schwab', radius = 2, undirected=True)
print(len(schwab_ego))

In [None]:
schwab_ego

In [None]:
node_sizes = [i[1]['betweenness'] * 100000 + 1000 for i in list(schwab_ego.nodes.data())]
print(len(node_sizes))
node_sizes[:4]

In [None]:
fig = plt.figure(figsize=(200,100))
pos = nx.kamada_kawai_layout(schwab_ego)
nx.draw_networkx(schwab_ego, pos, node_color = 'DarkBlue', font_color = 'CornflowerBlue',\
                 font_size=30, node_size=node_sizes, edge_color='LightGray' )
plt.savefig("graphs/schwab_ego.pdf", format="pdf")
plt.show()

# Liens entre les Universités

In [None]:
## Links between universities, given by the persons who attended both of them.
# Each row pairs two almaMater values of the same person (p and p1), with the person's
# birth year (p3) and the coordinates of each university (p4/p5 for univ_1, p6/p7 for univ_2).
# The coordinate columns carry distinct aliases (long_1/lat_1, long_2/lat_2): the query
# previously returned two columns named 'long' and two named 'lat', which produces
# ambiguous duplicate column names once loaded into a DataFrame.
liens_univ = """
SELECT REPLACE(p.value , 'http://dbpedia.org/resource/', '') univ_1,
REPLACE(p1.value, 'http://dbpedia.org/resource/', '') univ_2,
REPLACE(p.uri_entity, 'http://dbpedia.org/resource/', '') person,
p3.value birth_year,
p4.value long_1,
p5.value lat_1,
p6.value long_2,
p7.value lat_2
FROM property p
  JOIN property p1 ON p1.uri_entity = p.uri_entity AND p1.pk_property > p.pk_property AND p1.property LIKE '%almaMater'
  JOIN property p3 ON p3.uri_entity = p1.uri_entity AND p3.property LIKE '%irthYe%' AND p3."source" LIKE '%8 mai 2021%'
  LEFT JOIN property p4 ON
	p4.uri_entity = p.value
	AND p4.property LIKE '%long'
	LEFT JOIN property p5 ON
	p5.uri_entity = p.value
	AND p5.property LIKE '%lat'
	LEFT JOIN property p6 ON
	p6.uri_entity = p1.value
	AND p6.property LIKE '%long'
LEFT JOIN property p7 ON
	p7.uri_entity = p1.value
	AND p7.property LIKE '%lat'
WHERE  p.property LIKE '%almaMater';
"""

In [None]:
### Run the university-links query and load the result into a DataFrame
cn = sql.connect(db)
try:
    df_liens_univ = pd.read_sql_query(liens_univ, cn)
finally:
    # Release the SQLite handle even when the query fails
    cn.close()
df_liens_univ.head()

In [None]:
students = df_liens_univ.groupby(['person']).size()
print(len(students))
list(students.sort_values(ascending=False).items())[:10] #[10:20]

In [None]:
## Piet_Lieftinck 8 universities but errors 


In [None]:
df_students = pd.DataFrame(students, columns=['eff'])
gdf_students = df_students.groupby(['eff']).size()
gdf_students.sort_index(ascending=False)[:20]

In [None]:
# Each row of df_liens_univ links two universities through one person.
# A plain Graph would collapse all rows sharing the same pair of universities into a
# single edge; a MultiGraph keeps one parallel edge per row, labelled with the person.
GU = nx.from_pandas_edgelist(
    df_liens_univ,
    'univ_1',
    'univ_2',
    edge_attr='person',
    create_using=nx.MultiGraph,
)
type(GU)

In [None]:
### CA DOIT être un multigraphe !

In [None]:
GU.is_multigraph(), GU.is_directed(), GU.number_of_nodes(), GU.number_of_edges(), nx.is_connected(GU), nx.density(GU)

In [None]:
nx.number_connected_components(GU)

In [None]:
### Composantes du graphe
# Une composante principale avec 1505 individus et de multiples petites composantes
# https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.connected_components.html
c_comp = nx.connected_components(GU)
print(type(c_comp))
## Length of generator as set of nodes = number of nodes
df_c_comp = pd.DataFrame([len(c) for c in sorted(c_comp, key=len, reverse=True)], columns=['eff'])
gs_c_comp = df_c_comp.groupby(['eff']).size()
gs_c_comp.sort_index()

In [None]:
#  https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.connected_components.html
# connected_components() returns a one-shot generator: the `c_comp` generator created
# earlier has already been consumed by sorted(), so iterate over a fresh one here.
print([c for c in nx.connected_components(GU) if len(c) == 3])
# One [size, independent subgraph copy] pair per connected component of GU
S = [[len(c), GU.subgraph(c).copy()] for c in nx.connected_components(GU)]
# type(S)

In [None]:
au = [[s[0], s[1]] for s in S  if s[0] == 4]
len(au)
#a

In [None]:
plt.figure(figsize=(15,7))
nx.draw_networkx(au[0][1])

au[0][1].nodes.data(), au[0][1].edges.data()

In [None]:
# NOTE(review): `influences_df` is not defined anywhere in the visible notebook, so this
# cell raises NameError as written. `same_univ` has a 'person_1' column and may be the
# intended DataFrame — confirm before changing.
influences_df[influences_df['person_1']=='Stati_Statev']

In [None]:
# NOTE(review): `influences_df` is not defined anywhere in the visible notebook, so this
# cell raises NameError as written. `same_univ` has an 'alma_mater' column and may be the
# intended DataFrame — confirm before changing.
influences_df[influences_df['alma_mater']=='University_of_National_and_World_Economy']

In [None]:
a[0][1].nodes().data()

In [None]:
list(G.nodes().data())[:3]

In [None]:
### La composante la plus grande
max_c = [[s[0], s[1]] for s in S  if s[0] == max([s[0] for s in S])]
max_c

In [None]:
plt.figure(figsize=(20,10))
nx.draw_networkx(max_c[0][1])

In [None]:
# max_c is taken from S, which at this point of a top-to-bottom run holds the components
# of the undirected graph GU. Strong connectivity is only defined for directed graphs
# (networkx raises NetworkXNotImplemented otherwise), so count plain connected components.
nx.number_connected_components(max_c[0][1])

In [None]:
[[a, a_dict.items()] for a, a_dict in G.adjacency()][:1]

In [None]:
G.adjacency()

In [None]:
list(G.nodes)[:5]

In [None]:
list(G.neighbors('Milton_Friedman'))

In [None]:
type(G), type(DG)

In [None]:
# list(nx.bridges(G))

In [None]:
degrees = list(nx.degree(G))
degrees[:3]

In [None]:
df_degrees = pd.DataFrame(degrees, columns=['name', 'degree']).sort_values(by='degree', ascending=False)
df_degrees.head(100)

In [None]:
gb_degrees = df_degrees.groupby(['degree']).size()
gb_degrees.sort_index() # values(by='degree', ascending=False).head(30)

In [None]:
### Distribution des naissances dans le temps. Pour mémoire : 2000 = sans année de naissance


objects = [l for l in gb_degrees.index]
eff = [l for l in gb_degrees]

plt.figure(figsize=(15,7))
plt.bar(objects, eff)

In [None]:
sup4 = [dd[0] for dd in degrees if dd[1] > 4]
len(sup4)

In [None]:
# Keep only the nodes whose degree is greater than 1.
# nx.restricted_view(G, nodes, edges) HIDES the listed nodes, so the previous call
# produced the opposite selection; G.subgraph() returns the view induced by the
# listed nodes instead.
d_sup_1 = G.subgraph([dd[0] for dd in degrees if dd[1]>1])
type(d_sup_1)

In [None]:
# Keep only the nodes listed in sup4 (degree greater than 4).
# nx.restricted_view(G, nodes, edges) HIDES the listed nodes, so the previous call
# produced the opposite selection; G.subgraph() returns the view induced by the
# listed nodes instead.
d_sup4 = G.subgraph(sup4)
type(d_sup4)

In [None]:
max_cli_sup4 = nx.make_max_clique_graph(d_sup4)

In [None]:
type(max_cli_sup4)

In [None]:
type(nx.k_core(d_sup4, 10))

In [None]:
list(nx.core_number(d_sup4).items())[:3]

In [None]:
kcg = nx.k_core(d_sup4)
plt.figure(figsize=(40,20))
nx.draw_networkx(kcg,pos=nx.spring_layout(kcg))

In [None]:
plt.figure(figsize=(40,20))
nx.draw_networkx(max_cli_sup4,pos=nx.spring_layout(max_cli_sup4))

In [None]:
list(max_cli_sup4.adjacency())

In [None]:
plt.figure(figsize=(40,20))
nx.draw_kamada_kawai(d_sup4)

In [None]:
### Draw the graph view stored in d_sup_1 with edges coloured and faded by drawing order
GG = d_sup_1
pos = nx.spring_layout(GG)

# The i-th node (in iteration order) is drawn with size i
node_sizes = list(range(len(GG)))
M = GG.number_of_edges()
# One numeric colour value and one alpha value per edge
edge_colors = list(range(2, M + 2))
edge_alphas = [(5 + i) / (M + 4) for i in range(M)]
cmap = plt.cm.plasma

nodes = nx.draw_networkx_nodes(GG, pos, node_size=node_sizes, node_color="indigo")
# GG is undirected: draw_networkx_edges then returns ONE LineCollection (not a list of
# arrow patches), so the arrow options and the per-patch indexing of the directed-graph
# example do not apply here.
edges = nx.draw_networkx_edges(
    GG,
    pos,
    node_size=node_sizes,
    edge_color=edge_colors,
    edge_cmap=cmap,
    width=2,
)
# Set the alpha value of every edge at once (array-valued alpha needs Matplotlib >= 3.4)
edges.set_alpha(edge_alphas)

# The LineCollection already carries the numeric edge colours and the colormap, so it
# can feed the colorbar directly; no `mpl` import or PatchCollection is needed.
plt.colorbar(edges)

ax = plt.gca()
ax.set_axis_off()
plt.show()

In [None]:


# graphviz_layout is only imported in a later cell; import it here so that this cell
# also works when the notebook runs top to bottom.
from networkx.drawing.nx_agraph import graphviz_layout

plt.figure(figsize=(20,20))
# Node positions computed by Graphviz, then only the node labels are drawn
pos = graphviz_layout(d_sup_1)
nx.draw_networkx_labels(d_sup_1, pos)

In [None]:
nx.draw(G)

In [None]:
from networkx.drawing.nx_agraph import graphviz_layout
# Node positions computed by Graphviz
pos = graphviz_layout(G)
plt.axis('off')
nx.draw_networkx_nodes(G,pos,node_color='g',alpha = 0.8)  # draws nodes
nx.draw_networkx_edges(G,pos,edge_color='b',alpha = 0.6)  # draws edges
# The edges of G carry an 'alma_mater' attribute (set when G was built with
# edge_attr='alma_mater'); there is no 'weight' attribute, so asking for it
# produced no edge label at all.
nx.draw_networkx_edge_labels(G,pos,edge_labels = nx.get_edge_attributes(G,'alma_mater')) # edge labels
nx.draw_networkx_labels(G,pos) # node labels

## Relations d'influence


Préparation des données (cf. le carnet dbpedia_production):


### Requêtes préliminaires

* Requêtes d'exploration dans la base de données:

<code>
-- nombre de personnes qui influencent : 906
WITH tw1 AS (SELECT DISTINCT uri_entity FROM property p WHERE p.property LIKE '%influenced%')
SELECT count(*) FROM tw1;

-- nombre de personnes influencées : 1173
WITH tw1 AS (SELECT DISTINCT value FROM property p WHERE p.property LIKE '%influenced%')
SELECT count(*) FROM tw1;

-- nombre total de personnes dans le réseau : 1717
WITH tw1 AS (
SELECT DISTINCT uri_entity FROM property p WHERE p.property LIKE '%influenced%'
UNION 
SELECT DISTINCT value as uri_entity FROM property p WHERE p.property LIKE '%influenced%')
SELECT COUNT(*) FROM tw1;


-- economistes (entités déjà disponibles) dans le réseau : 933
-- personnes non encore connues: 784
WITH tw1 AS (
SELECT DISTINCT uri_entity FROM property p WHERE p.property LIKE '%influenced%'
UNION 
SELECT DISTINCT value as uri_entity FROM property p WHERE p.property LIKE '%influenced%')
SELECT COUNT(*) FROM tw1 LEFT JOIN entity e ON tw1.uri_entity = e.uri_entity AND e.entity_class LIKE '%Person%'
WHERE e.uri_entity IS NULL;

</code>


In [None]:
### Définir les adresses des fichiers, l'existant et celui à créer
db = 'data/sparql_queries.db'

In [None]:
### Influence relations: one (influencer, influenced) row per property whose name matches
# '%influenced%', with the DBpedia resource prefix stripped from both columns.
# Rows where an entity points to itself are excluded.
influence ="""
SELECT REPLACE(p.uri_entity, 'http://dbpedia.org/resource/', '') influencer, REPLACE(p.value, 'http://dbpedia.org/resource/', '') influenced  
FROM property p WHERE p.property LIKE '%influenced%'
AND p.uri_entity != p.value;
"""

In [None]:
# Open a connection to the SQLite database
cn = sql.connect(db)
try:
    c = cn.cursor()

    ### Run the influence query and fetch every returned row
    c.execute(influence)
    result_q = c.fetchall()
finally:
    # Close the connection even when the query fails
    cn.close()
print(len(result_q))
result_q[:4]

In [None]:
#DiGraph : directed graph
DG = nx.DiGraph()
DG.add_edges_from(result_q, color='blue')
type(DG)

In [None]:
DG.size()

In [None]:
### Propriétés du graphe
DG.is_multigraph(), DG.is_directed(), DG.number_of_nodes(), DG.number_of_edges(), \
nx.density(DG), nx.is_strongly_connected(DG), nx.is_weakly_connected(DG)

In [None]:
nx.number_strongly_connected_components(DG), nx.number_weakly_connected_components(DG)

In [None]:
### Composantes du graphe (weakly connected = indirected)
# Une composante principale avec 1505 individus et de multiples petites composantes
# https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.connected_components.html
wc_comp = nx.weakly_connected_components(DG)
print(type(wc_comp))
## Length of generator as set of nodes = number of nodes
df_wc_comp = pd.DataFrame([len(c) for c in sorted(wc_comp, key=len, reverse=True)], columns=['eff'])
gs_wc_comp = df_wc_comp.groupby(['eff']).size()
gs_wc_comp.sort_index()

In [None]:
### Components of the graph (strongly connected = following the edge direction)
# https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.strongly_connected_components.html
# The non-recursive function yields the same components as the deprecated *_recursive variant.
sc_comp = nx.strongly_connected_components(DG)
print(type(sc_comp))
## Each component is a set of nodes, so its length is its number of nodes
df_sc_comp = pd.DataFrame([len(c) for c in sorted(sc_comp, key=len, reverse=True)], columns=['eff'])
# Group the sizes of the STRONGLY connected components (the former code grouped
# df_c_comp, i.e. the component sizes of another graph, by copy-paste mistake)
gs_sc_comp = df_sc_comp.groupby(['eff']).size()
gs_sc_comp.sort_index()

In [None]:
#  https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.connected_components.html
S = [[len(c), DG.subgraph(c).copy()] for c in nx.weakly_connected_components(DG)]
len(S)

In [None]:
adg = [[s[0], s[1]] for s in S  if s[0] == 11]
adg

In [None]:
## https://networkx.org/documentation/stable/reference/drawing.html
plt.figure(figsize=(15,7))

pos = nx.shell_layout(adg[0][1])
#pos = nx.kamada_kawai_layout(a[0][1])

## Attention pas plus d'une cinquantaine de noeuds
nx.draw_networkx(adg[0][1], pos)

In [None]:
be_ce = nx.betweenness_centrality(DG)

In [None]:
df_be_ce = pd.DataFrame(list(be_ce.items()), columns=['name', 'be_ce'])
df_be_ce.head(30)

In [None]:
df_be_ce['be_ce'].sort_values(ascending=False)

In [None]:
# NOTE(review): `de_ce` is not defined anywhere in the visible notebook, so this cell
# raised NameError. By analogy with `be_ce` (betweenness) and `cl_ce` (closeness) it is
# assumed to be the degree centrality of DG — confirm.
de_ce = nx.degree_centrality(DG)
l_de_ce = list(de_ce.items())

In [None]:
[[l[0],l[1]*1000] for l in l_de_ce][:3]

In [None]:
dfn = pd.DataFrame([[l[0],l[1]*1000] for l in l_de_ce], columns=['name', 'be_ce'])
dfn.head()

In [None]:
dfn.sort_values(by = ['be_ce'], ascending=False).head(10)


In [None]:
cl_ce = nx.closeness_centrality(DG)

In [None]:
df_cl_ce = pd.DataFrame(list(cl_ce.items()), columns=['name', 'cl_ce'])
df_cl_ce.head(10)

In [None]:
df_cl_ce['cl_ce'].sort_values(ascending=False)

In [None]:
l_cl_ce = list(cl_ce.items())

In [None]:
[[l[0],l[1]*1000] for l in l_cl_ce][:3]

In [None]:
dfn = pd.DataFrame([[l[0],l[1]*1000] for l in l_de_ce], columns=['name', 'be_ce'])
dfn.head()

In [None]:
dfn.sort_values(by = ['be_ce'], ascending=False).head(30)


In [None]:
max([l[1] for l in l_de_ce][:3])

In [None]:
min([l[1] for l in l_de_ce]), max([l[1] for l in l_de_ce])

In [None]:
print(max([l[1] for l in l_de_ce][50:60]))
ll = l_de_ce[50:60]
ll

In [None]:
ll = [l[1] for l in l_de_ce[50:60]]
type(ll), len(ll)

In [None]:
#  https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.connected_components.html
SC = [[len(c), DG.subgraph(c).copy()] for c in nx.strongly_connected_components(DG)]
len(SC)

In [None]:
sadg = [[s[0], s[1]] for s in SC  if s[0] == 3]
sadg

In [None]:
## https://networkx.org/documentation/stable/reference/drawing.html
plt.figure(figsize=(15,7))

pos = nx.shell_layout(sadg[0][1])
#pos = nx.kamada_kawai_layout(a[0][1])

## Attention pas plus d'une cinquantaine de noeuds
nx.draw_networkx(sadg[0][1], pos)

## Shortest path

In [None]:
sp = dict(nx.all_pairs_shortest_path(DG)) # , cutoff=10
len(sp)

In [None]:
list(sp.items())[3:5]

In [None]:
test_l = list(sp.items())[3:5]

In [None]:
# Flatten the {source: {target: path}} mapping into rows [source, target, path length, path],
# keeping only paths between two different nodes that visit more than two nodes.
all_items = list(sp.items())
all_paths = [
    [source, target, len(path), path]
    for source, targets in all_items
    for target, path in targets.items()
    if source != target and len(path) > 2
]

print(all_paths[:3])

In [None]:
df_all_paths = pd.DataFrame(all_paths, columns = ['tail', 'head', 'len_path', 'list_path'])
df_all_paths.head()

In [None]:
distrib_len_path = df_all_paths.groupby('len_path').size()
distrib_len_path

In [None]:
df_all_paths[df_all_paths['len_path'].isin([12])]

In [None]:
df_all_paths[df_all_paths['len_path'].isin([12,11,10,9,8])]['list_path']

In [None]:
# Nodes (with repetitions) of every path whose len_path is 11 or 12, flattened into one list.
# Built directly instead of through a list comprehension executed only for its side effects.
long_path_lists = df_all_paths[df_all_paths['len_path'].isin([12,11])]['list_path'] # ,10,9,8
long_paths_nodes = [node for path in long_path_lists for node in path]
# Number of distinct nodes involved in those paths
len(set(long_paths_nodes))

In [None]:
long_paths_nodes[:3]

## Créer nouveau graphe

In [None]:
LPN = DG.subgraph(long_paths_nodes).copy()
type(LPN)

In [None]:
### Propriétés du graphe
LPN.is_multigraph(), LPN.is_directed(), LPN.number_of_nodes(), LPN.number_of_edges(), \
nx.density(LPN), nx.is_strongly_connected(LPN), nx.is_weakly_connected(LPN)

In [None]:
nx.number_strongly_connected_components(LPN), nx.number_weakly_connected_components(LPN)

In [None]:
## https://networkx.org/documentation/stable/reference/drawing.html
plt.figure(figsize=(50,100))

ax = plt.subplot(111)
ax.set_title('Graph - PLT', fontsize=10)




# pos = nx.spiral_layout(LPN)
pos = nx.kamada_kawai_layout(LPN)
# pos = nx.kamada_kawai_layout(LPN)

## Attention pas plus d'une cinquantaine de noeuds
nx.draw_networkx(LPN, pos)
plt.tight_layout()
plt.savefig("graphs/LPN.png", format="PNG")
plt.show()

In [None]:
# The package is imported under the alias `cy` (`import ipycytoscape as cy`), so the
# bare name `ipycytoscape` is undefined and raised NameError.
directed = cy.CytoscapeWidget()
# Load the LPN graph into the widget as a directed graph
directed.graph.add_graph_from_networkx(LPN, directed=True)  # LPN
directed

In [None]:
nx.write_pajek(DG, "graphs/DG.net")

In [None]:
nx.write_pajek(LPN, "graphs/LPN.net")