##### Primary Notion in-degree == 0 & out-degree > 0
##### Learning Outcome in-degree > 0 & out-degree == 0

In [55]:
import networkx as nx
import igraph as igraph
import pandas as pd
import numpy as np
from networkx.readwrite import json_graph
import json
import codecs
import csv
import ast
import os
from os import listdir
import scipy.stats

### Create Graph

In [56]:
def create_graphs(dataset):
    """
    Returns a networkx graph from a pandas dataframe with columns ["prerequisite", "advanced"]
    """

    G_nx = nx.DiGraph()

    for i, r in dataset.iterrows():
        G_nx.add_edge(r["prerequisite"], r["advanced"])

    return G_nx

### Create Graph

In [57]:
def create_i_graph(dataset):
    """
    Returns a networkx graph from a pandas dataframe with columns ["prerequisite", "advanced"]
    """
    print(igraph.__version__)
    terminology_vertex = set(dataset["prerequisite"].append(dataset["advanced"]))
    # I_graph = igraph.Graph(n=0, edges=None, directed=True, graph_attrs=None, vertex_attrs=None, edge_attrs=None)
    i_graph = igraph.Graph()
    # print(dataset["prerequisite"])
    for v in terminology_vertex:
        i_graph.add_vertices(v)
    for i, r in dataset.iterrows():
        i_graph.add_edge(r["prerequisite"], r["advanced"])

    return i_graph

 ## In-degree, Out-degree calcs

In [58]:
def in_out_degree(networkx_graph, igraph_graph, rater):
    # size
    num_nodes = len(nx.nodes(networkx_graph))
    num_edges = len(nx.edges(networkx_graph))

    # degrees
    max_out_degree =igraph.GraphBase.maxdegree(igraph_graph, mode="OUT")

    out_degree_dict = {}
    in_degree_dict = {}

    for (node, val) in networkx_graph.out_degree():
        if val > 5:
            out_degree_dict[node]=val

    for (node, val) in networkx_graph.in_degree():
        if val > 1:
            in_degree_dict[node]=val

    # print("out degree", repr(out_degree_dict))
    # print("in degree", repr(in_degree_dict))
    out_degree_file_name = rater+'out_degree_v2.csv'
    in_degree_file_name = rater+'in_degree_v2.csv'
    with open(out_degree_file_name, 'w') as f:
        for key in out_degree_dict.keys():
            f.write("%s,%s\n" % (key, out_degree_dict[key]))

    with open(in_degree_file_name, 'w') as f:
        for key in in_degree_dict.keys():
            f.write("%s,%s\n" % (key, in_degree_dict[key]))

    leafs = [{x:networkx_graph.in_degree(x)} for x in networkx_graph.nodes() if networkx_graph.out_degree(x)==0 and networkx_graph.in_degree(x)>=1]
    for item in leafs:
        leafs_df.loc[next(iter(item)), rater] = item[next(iter(item))]
        leafs_roots_df.loc[next(iter(item)), rater] = item[next(iter(item))]


    roots = [{x:networkx_graph.out_degree(x)}  for x in networkx_graph.nodes() if networkx_graph.in_degree(x)==0 and networkx_graph.out_degree(x)>=1]
    for item in roots:
        # set negative numbers for the # of out-degree arcs
        roots_df.loc[next(iter(item)), rater] = item[next(iter(item))]
        leafs_roots_df.loc[next(iter(item)), rater] = -item[next(iter(item))]

## Data Preparation

In [59]:
from sklearn.neighbors.tests.test_dist_metrics import n2, n1
input_dir = "annotazioni_revisionate/vecchie/"
raters={}


for file in listdir(input_dir):
    print(file)   
    name=os.path.splitext(os.path.basename(file))[0]
    print("File: ", name)
    # raters.append(name)

    # use only columns ["prerequisite", "advanced", "weight", "agreem", "revised"]
    curr_df = pd.read_csv(input_dir + file, sep=",", encoding="utf-8", usecols=[1, 2, 4, 5, 9])
    raters[name]=curr_df

    # keep only relation with agreemen > 1 or those with agreement 1 but still present after revision
    curr_df = curr_df[(curr_df["agreem"] > 1) | (curr_df["Revised"].isin(["0.5", "0,5", "1", 1, 0.5]) ) ]
    print("\tNum of relations:", curr_df.shape[0], "\n")
    #for i, rel in curr_df.iterrows():
        # if rel['prerequisite'] in concepts and rel['advanced'] in concepts:


# intialise data of leafs
# leafs_dic={'concept':[],'frosina':[],'chiara':[],'samuele':[],'andrea':[],'marco':[],'ilenia':[]}
temporal_leafs_df = pd.DataFrame(columns=['concept','frosina','chiara','samuele','andrea', 'marco', 'ilenia'])
print(temporal_leafs_df)

# intialise data of roots
# leafs_dic={'concept':[],'frosina':[],'chiara':[],'samuele':[],'andrea':[],'marco':[],'ilenia':[]}
temporal_roots_df = pd.DataFrame(columns=['concept','frosina','chiara','samuele','andrea', 'marco', 'ilenia'])

temporal_leafs_roots_df = pd.DataFrame(columns=['concept','frosina','chiara','samuele','andrea', 'marco', 'ilenia'])

combined_terminology_set=set()

combined_terminology_set = set()
for rater in raters:
    prerequisite = raters[rater]['prerequisite']
    advanced = raters[rater]['advanced']
    terminology = prerequisite.append(advanced)
    combined_terminology_set=set(terminology)

temporal_leafs_df['concept']=list(combined_terminology_set)
leafs_df = temporal_leafs_df.set_index('concept', verify_integrity=True)
temporal_roots_df['concept']=list(combined_terminology_set)
roots_df = temporal_roots_df.set_index('concept', verify_integrity=True)
temporal_leafs_roots_df['concept']=list(combined_terminology_set)
leafs_roots_df = temporal_leafs_roots_df.set_index('concept', verify_integrity=True)

for rater in raters:
    if rater=="Samu":
        samu_graph= create_graphs(raters[rater])
        samu_igraph = create_i_graph(raters[rater])
        print("Samu summary:", igraph.Graph.summary(samu_igraph))
        in_out_degree(samu_graph,samu_igraph,rater)
    elif rater=="Chiara":
        c_graph= create_graphs(raters[rater])
        c_igraph = create_i_graph(raters[rater])
        print("Chiara summary:", igraph.Graph.summary(c_igraph))
        in_out_degree(c_graph,c_igraph,rater)
    elif rater=="Frosi":
        f_graph= create_graphs(raters[rater])
        f_igraph = create_i_graph(raters[rater])
        print("Frosi summary:", igraph.Graph.summary(f_igraph))
        in_out_degree(f_graph,f_igraph,rater)
    elif rater=="Andrea_Unique":
        a_graph= create_graphs(raters[rater])
        a_igraph = create_i_graph(raters[rater])
        print("Andrea summary:", igraph.Graph.summary(a_igraph))
        in_out_degree(a_graph,a_igraph,rater)
    elif rater=="Marco":
        m_graph= create_graphs(raters[rater])
        m_igraph = create_i_graph(raters[rater])
        print("Marco summary:", igraph.Graph.summary(m_igraph))
        in_out_degree(m_graph,m_igraph,rater)
    elif rater=="Ilenia":
        i_graph= create_graphs(raters[rater])
        i_igraph = create_i_graph(raters[rater])
        print("Ilenia summary:", igraph.Graph.summary(i_igraph))
        in_out_degree(i_graph,i_igraph,rater)



Andrea_Unique.csv
File:  Andrea_Unique
	Num of relations: 233 

Samu.csv
File:  Samu
	Num of relations: 368 

Chiara.csv
File:  Chiara
	Num of relations: 275 

Frosi.csv
File:  Frosi
	Num of relations: 392 

Marco.csv
File:  Marco
	Num of relations: 253 

Ilenia.csv
File:  Ilenia
	Num of relations: 87 

Empty DataFrame
Columns: [concept, frosina, chiara, samuele, andrea, marco, ilenia]
Index: []
0.7.1
Andrea summary: IGRAPH UN-- 167 279 -- 
+ attr: name (v)
0.7.1
Samu summary: IGRAPH UN-- 266 454 -- 
+ attr: name (v)
0.7.1
Chiara summary: IGRAPH UN-- 118 291 -- 
+ attr: name (v)
0.7.1
Frosi summary: IGRAPH UN-- 221 419 -- 
+ attr: name (v)
0.7.1
Marco summary: IGRAPH UN-- 141 282 -- 
+ attr: name (v)
0.7.1
Ilenia summary: IGRAPH UN-- 96 92 -- 
+ attr: name (v)


### Create File CSV

In [60]:


leafs_df.to_csv (r'export_LOs_v2.csv', index = True, header=True)


# print(roots_df)
roots_df.to_csv (r'export_PNs_v2.csv', index = True, header=True)

# print(leafs_roots_df)
leafs_roots_df.to_csv (r'export_PNs_LOs_v2.csv', index = True, header=True)

In [52]:
print("summary statistic leafs",leafs_df.describe())
print("mean", leafs_df.mean())
print("corr", leafs_df.corr())
print("# of non null", leafs_df.count())
print("standard deviation", leafs_df.std())

summary statistic leafs        Andrea_Unique        Samu     Chiara      Frosi      Marco     Ilenia
count      100.00000  100.000000  43.000000  76.000000  62.000000  59.000000
mean         2.04000    1.920000   2.767442   2.328947   2.306452   1.050847
std          1.22202    1.211727   1.875115   1.340529   1.325521   0.221572
min          1.00000    1.000000   1.000000   1.000000   1.000000   1.000000
25%          1.00000    1.000000   1.000000   1.000000   1.000000   1.000000
50%          2.00000    2.000000   2.000000   2.000000   2.000000   1.000000
75%          2.25000    2.250000   3.500000   3.000000   3.000000   1.000000
max          8.00000    8.000000  10.000000   7.000000   6.000000   2.000000
mean frosina               NaN
chiara                NaN
samuele               NaN
andrea                NaN
marco                 NaN
ilenia                NaN
Andrea_Unique    2.040000
Samu             1.920000
Chiara           2.767442
Frosi            2.328947
Marco            2

In [53]:
print("summary statistic roots",roots_df.describe())
print("mean", roots_df.mean())
print("corr", roots_df.corr())
print("# of non null", roots_df.count())
print("standard deviation", roots_df.std())

summary statistic roots        Andrea_Unique       Samu     Chiara      Frosi      Marco     Ilenia
count      28.000000  68.000000  25.000000  58.000000  26.000000  10.000000
mean        6.285714   2.779412   4.000000   3.327586   3.461538   2.900000
std         7.639358   3.349518   5.958188   3.278788   4.456283   1.911951
min         1.000000   1.000000   1.000000   1.000000   1.000000   1.000000
25%         1.000000   1.000000   1.000000   1.000000   1.000000   1.250000
50%         3.500000   1.000000   1.000000   2.000000   2.000000   2.500000
75%         7.250000   3.000000   4.000000   4.000000   3.000000   3.750000
max        31.000000  19.000000  27.000000  17.000000  19.000000   6.000000
mean frosina               NaN
chiara                NaN
samuele               NaN
andrea                NaN
marco                 NaN
ilenia                NaN
Andrea_Unique    6.285714
Samu             2.779412
Chiara           4.000000
Frosi            3.327586
Marco            3.461538
I

In [54]:
leafs_roots_df.astype(float)
df_0=leafs_roots_df.fillna(0)
#print(df_0)
print(df_0.corr(method ='pearson'))
print(df_0.corr(method ='kendall'))
print(df_0.corr(method ='spearman'))

df_0.corr(method ='pearson').to_csv(r'pearson_LO.csv', index = False)
for rater1 in raters:
    for rater2 in raters:
        r, p = scipy.stats.pearsonr(df_0[rater1], df_0[rater2])
        print("r"+rater1+rater2, r)
        print("p"+rater1+rater2, p)

               frosina  chiara  samuele  andrea  marco  ilenia  Andrea_Unique  \
frosina            NaN     NaN      NaN     NaN    NaN     NaN            NaN   
chiara             NaN     NaN      NaN     NaN    NaN     NaN            NaN   
samuele            NaN     NaN      NaN     NaN    NaN     NaN            NaN   
andrea             NaN     NaN      NaN     NaN    NaN     NaN            NaN   
marco              NaN     NaN      NaN     NaN    NaN     NaN            NaN   
ilenia             NaN     NaN      NaN     NaN    NaN     NaN            NaN   
Andrea_Unique      NaN     NaN      NaN     NaN    NaN     NaN       1.000000   
Samu               NaN     NaN      NaN     NaN    NaN     NaN       0.420187   
Chiara             NaN     NaN      NaN     NaN    NaN     NaN       0.408188   
Frosi              NaN     NaN      NaN     NaN    NaN     NaN       0.316668   
Marco              NaN     NaN      NaN     NaN    NaN     NaN       0.368727   
Ilenia             NaN     N

In [61]:
roots_df.astype(float)
df_roots_0=roots_df.fillna(0)
#print(df_0)
# print(df_roots_0.corr(method ='pearson'))
# print(df_roots_0.corr(method ='kendall'))
# print(df_roots_0.corr(method ='spearman'))

df_roots_0.corr(method ='pearson').to_csv(r'pearson_PN.csv', index = False)
for rater1 in raters:
    for rater2 in raters:
        r, p = scipy.stats.pearsonr(df_roots_0[rater1], df_roots_0[rater2])
        print("r"+rater1+rater2, r)
        print("p"+rater1+rater2, p)

rAndrea_UniqueAndrea_Unique 0.9999999999999994
pAndrea_UniqueAndrea_Unique 0.0
rAndrea_UniqueSamu 0.436318549445489
pAndrea_UniqueSamu 7.654599691976579e-11
rAndrea_UniqueChiara 0.3676769238229676
pAndrea_UniqueChiara 6.796686259931304e-08
rAndrea_UniqueFrosi 0.33692057648300866
pAndrea_UniqueFrosi 8.868925991484161e-07
rAndrea_UniqueMarco 0.3436297304069614
pAndrea_UniqueMarco 5.182534998449098e-07
rAndrea_UniqueIlenia -0.03957214861375492
pAndrea_UniqueIlenia 0.5751014043555289
rSamuAndrea_Unique 0.436318549445489
pSamuAndrea_Unique 7.654599691976579e-11
rSamuSamu 0.9999999999999999
pSamuSamu 0.0
rSamuChiara 0.41007251503895203
pSamuChiara 1.2329878091131948e-09
rSamuFrosi 0.5038898512950133
pSamuFrosi 1.808011987046636e-14
rSamuMarco 0.34275735749897546
pSamuMarco 5.561493410502614e-07
rSamuIlenia 0.09645259822269836
pSamuIlenia 0.17101624550863775
rChiaraAndrea_Unique 0.3676769238229676
pChiaraAndrea_Unique 6.796686259931304e-08
rChiaraSamu 0.41007251503895203
pChiaraSamu 1.2329878

In [21]:
leafs_df.astype(float)
df_leafs_0=leafs_df.fillna(0)
#print(df_0)
print(df_leafs_0.corr(method ='pearson'))
print(df_leafs_0.corr(method ='kendall'))
print(df_leafs_0.corr(method ='spearman'))

         frosina  chiara  samuele  andrea  marco  ilenia
frosina      NaN     NaN      NaN     NaN    NaN     NaN
chiara       NaN     NaN      NaN     NaN    NaN     NaN
samuele      NaN     NaN      NaN     NaN    NaN     NaN
andrea       NaN     NaN      NaN     NaN    NaN     NaN
marco        NaN     NaN      NaN     NaN    NaN     NaN
ilenia       NaN     NaN      NaN     NaN    NaN     NaN
         frosina  chiara  samuele  andrea  marco  ilenia
frosina      1.0     NaN      NaN     NaN    NaN     NaN
chiara       NaN     1.0      NaN     NaN    NaN     NaN
samuele      NaN     NaN      1.0     NaN    NaN     NaN
andrea       NaN     NaN      NaN     1.0    NaN     NaN
marco        NaN     NaN      NaN     NaN    1.0     NaN
ilenia       NaN     NaN      NaN     NaN    NaN     1.0
         frosina  chiara  samuele  andrea  marco  ilenia
frosina      NaN     NaN      NaN     NaN    NaN     NaN
chiara       NaN     NaN      NaN     NaN    NaN     NaN
samuele      NaN     NaN      N

In [37]:
### Initialize

          frosina    chiara   samuele    andrea     marco  ilenia
frosina  1.000000  0.563050  0.536119  0.362470  0.665114     NaN
chiara   0.563050  1.000000  0.418964  0.380818  0.835807     NaN
samuele  0.536119  0.418964  1.000000  0.456579  0.364967     NaN
andrea   0.362470  0.380818  0.456579  1.000000  0.358915     NaN
marco    0.665114  0.835807  0.364967  0.358915  1.000000     NaN
ilenia        NaN       NaN       NaN       NaN       NaN     NaN
          frosina    chiara   samuele    andrea     marco  ilenia
frosina  1.000000  0.226214  0.086681  0.116420  0.316033     NaN
chiara   0.226214  1.000000  0.123328  0.255797  0.573452     NaN
samuele  0.086681  0.123328  1.000000  0.186674  0.203627     NaN
andrea   0.116420  0.255797  0.186674  1.000000  0.243899     NaN
marco    0.316033  0.573452  0.203627  0.243899  1.000000     NaN
ilenia        NaN       NaN       NaN       NaN       NaN     1.0
          frosina    chiara   samuele    andrea     marco  ilenia
frosina  1

In [20]:
leafs_df.astype(float)
df_leafs_0=leafs_df.fillna(0)
#print(df_0)
print(df_leafs_0.corr(method ='pearson'))
print(df_leafs_0.corr(method ='kendall'))
print(df_leafs_0.corr(method ='spearman'))

         frosina  chiara  samuele  andrea  marco  ilenia
frosina      NaN     NaN      NaN     NaN    NaN     NaN
chiara       NaN     NaN      NaN     NaN    NaN     NaN
samuele      NaN     NaN      NaN     NaN    NaN     NaN
andrea       NaN     NaN      NaN     NaN    NaN     NaN
marco        NaN     NaN      NaN     NaN    NaN     NaN
ilenia       NaN     NaN      NaN     NaN    NaN     NaN
         frosina  chiara  samuele  andrea  marco  ilenia
frosina      1.0     NaN      NaN     NaN    NaN     NaN
chiara       NaN     1.0      NaN     NaN    NaN     NaN
samuele      NaN     NaN      1.0     NaN    NaN     NaN
andrea       NaN     NaN      NaN     1.0    NaN     NaN
marco        NaN     NaN      NaN     NaN    1.0     NaN
ilenia       NaN     NaN      NaN     NaN    NaN     1.0
         frosina  chiara  samuele  andrea  marco  ilenia
frosina      NaN     NaN      NaN     NaN    NaN     NaN
chiara       NaN     NaN      NaN     NaN    NaN     NaN
samuele      NaN     NaN      N

### Initialize

In [None]:
# G_nx, G_ig, annotator = create_graphs('datasets/prova_relazione_tool.csv', dataset)
# metrics = compute_metrics(G_nx, G_ig)
# detected_cycles = detect_loops(G_nx, G_ig)
# trans_dict = detect_transitive_edges(G_nx, cutoff=metrics['diameter'])
# membership = detect_clusters(G_ig)
# graph_dict = create_graph_dict(dataset, gold_with_annotators, annotator, metrics, detected_cycles, trans_dict, membership)
# output_json = export_to_json("output_files//prova.json", graph_dict)




