# Application of backboning to Stack Overflow data

In [48]:
import os
import pandas as pd
import networkx as nx
import itertools 
from collections import defaultdict
from scipy.stats import binom, pearsonr, sem

#functions written by Coscia and Neffke 
import backboning_python3_networkx2_Coscia as backboning

### Investigate data

In [3]:
# Read in the co-occurrence data generated by Ole.
# NOTE(review): pandas' default NA parsing converts literal strings such as "null", "nan" or "NA"
# to NaN; if tags with those names exist in the CSV they would be read as missing values - verify.
#nodes = pd.read_csv("../data/node_info.csv")
coocc_table = pd.read_csv("../data/edge_list.csv")

In [4]:
#nodes.head()

In [5]:
# Preview the first rows of the co-occurrence table.
coocc_table.head()

Unnamed: 0,tag1,tag2,co_occurance_count,tag1_count,tag2_count,total_num_obs,lift
0,visual-studio-2010,wpf,542,21596,66089,6563585,2.492516
1,enterprise-library,wpf,16,806,66089,6563585,1.9715
2,c#,wpf,37582,549189,66089,6563585,6.79626
3,visual-studio,wpf,747,38364,66089,6563585,1.933787
4,solution,wpf,5,350,66089,6563585,1.418776


In [6]:
# Look up the edge stored as (tag1="c#", tag2="visual-studio-2010").
coocc_table.loc[
    coocc_table["tag1"].eq("c#") & coocc_table["tag2"].eq("visual-studio-2010")
]

Unnamed: 0,tag1,tag2,co_occurance_count,tag1_count,tag2_count,total_num_obs,lift
851,c#,visual-studio-2010,5982,549189,21596,6563585,3.310491


In [7]:
# Look up the same edge in the reverse direction (tag1="visual-studio-2010", tag2="c#").
# The network is undirected, but the edge is not present in both directions
# => problem for the backboning algorithm.
coocc_table.loc[
    coocc_table["tag2"].eq("c#") & coocc_table["tag1"].eq("visual-studio-2010")
]

Unnamed: 0,tag1,tag2,co_occurance_count,tag1_count,tag2_count,total_num_obs,lift


In [37]:
# Keep only the columns needed for the network: both tag columns and the raw count.
df = coocc_table.loc[:, ["tag1", "tag2", "co_occurance_count"]].copy()

# Number of distinct values per tag column, and the set of tags seen in either column.
unique_tag1, unique_tag2 = (len(df[column].unique()) for column in ("tag1", "tag2"))
all_tags = {*df["tag1"].unique(), *df["tag2"].unique()}

print(
    f"# unique tag1: {unique_tag1}",
    f"# unique tag2: {unique_tag2}",
    f"# all tags present: {len(all_tags)}",
    sep="\n",
)
df.head()

# unique tag1: 8887
# unique tag2: 8477
# all tags present: 8889


Unnamed: 0,tag1,tag2,co_occurance_count
0,visual-studio-2010,wpf,542
1,enterprise-library,wpf,16
2,c#,wpf,37582
3,visual-studio,wpf,747
4,solution,wpf,5


**WARNING: It does not make sense that the number of unique tag1 values differs from the number of unique tag2 values!**

=> For an undirected network, each edge connecting two nodes has to show up in both directions!

In [13]:
# Tags that appear in the tag1 column but never in the tag2 column.
tag1_only = set(df["tag1"].unique()).difference(set(df["tag2"].unique()))
difference_in_tags = list(tag1_only)
print(difference_in_tags)

['dblink', 'selection-sort', 'qgis', 'azure-webjobssdk', 'akka-cluster', 'android-things', 'psychopy', 'kendo-ui-angular2', 'androidx', 'kurento', 'qnetworkaccessmanager', 'buildozer', 'google-play-console', 'clearcase-ucm', 'surveymonkey', 'f2py', 'rselenium', 'riscv', 'onsen-ui', 'biztalk-2013', 'testcafe', 'dust.js', 'pkcs#7', 'appium-android', 'xtend', 'net-http', 'facebook-canvas', 'arcore', 'xbee', 'obiee', 'heapsort', 'nvcc', 'usbserial', 'sinch', 'c++98', 'longlistselector', 'clips', 'args', 'apigee', 'gridgain', 'googlemock', 'tkinter-entry', 'tensorflow-lite', 'fragmentmanager', 'parsley.js', 'caesar-cipher', 'pos-tagger', 'jinternalframe', 'ggvis', 'greendao', 'tableau-server', 'phasset', 'web-push', 'pi', 'nightmare', 'format-specifiers', 'datagridviewcolumn', 'yolo', 'dotnetnuke-module', 'cgpath', 'coinbase-api', 'cgridview', 'sweetalert2', 'acumatica', 'can-bus', 'imputation', 'kie', 'suitescript2.0', 'rust-cargo', 'databricks', 'folium', 'prestashop-1.5', 'aiohttp', 'web

In [23]:
# First rows in which "visual-studio-2010" is stored in the tag1 column.
df.loc[df["tag1"].eq("visual-studio-2010")].head()

Unnamed: 0,tag1,tag2,co_occurance_count
0,visual-studio-2010,wpf,542


In [22]:
# Check whether the (wpf, visual-studio-2010) edge also exists in the reversed orientation.
df.loc[df["tag2"].eq("visual-studio-2010") & df["tag1"].eq("wpf")]

Unnamed: 0,tag1,tag2,co_occurance_count


### Alter the table so that it represents an undirected network

In [24]:
# Preview the reduced edge table (tag1, tag2, co_occurance_count).
df.head()

Unnamed: 0,tag1,tag2,co_occurance_count
0,visual-studio-2010,wpf,542
1,enterprise-library,wpf,16
2,c#,wpf,37582
3,visual-studio,wpf,747
4,solution,wpf,5


In [38]:
# Every tag that occurs in either the tag1 or the tag2 column.
all_tags = {*df["tag1"].unique(), *df["tag2"].unique()}

In [49]:
# All ordered pairs of distinct tags, i.e. every possible edge in both directions.
# The full list is materialised in memory (n * (n - 1) tuples for n tags).
combinations = list(itertools.permutations(all_tags, r=2))
len(combinations) #number matches the expectation (len(unique_tags)^2 - len(unique_tags))

79005432

In [None]:
def backbone_network(network, measure, edge_list_path="../../Backboning_Coscia/backboning/country_networks.csv"):
    """Score the edges of a network with a backboning measure and build graphs from the result.

    The edge list is read with ``backboning.read``, scored with the function stored under
    ``settings_tab2.measures[measure]`` and, if a threshold is configured in
    ``settings_tab2.fixedges_thresholds[measure][network]``, filtered with
    ``backboning.thresholding``. Node and edge counts before and after thresholding are printed.

    NOTE(review): ``settings_tab2`` is not imported in the visible part of this notebook;
    it has to be in scope before this function is called - confirm where it comes from.

    Parameters
    ----------
    network : str
        Passed as the second argument to ``backboning.read`` and used as key into the
        threshold settings. The network is handled as undirected only when it equals "cs".
    measure : str
        Key into ``settings_tab2.measures`` and ``settings_tab2.fixedges_thresholds``.
    edge_list_path : str, optional
        File handed to ``backboning.read``. Defaults to the country-network file that was
        previously hard-coded, so existing calls behave as before.

    Returns
    -------
    tuple
        ``(G, G_threshold, G_raw)``: graphs built with ``nx.from_pandas_edgelist`` from the
        scored edge table, the thresholded edge table and the raw input table. Only the
        "src" and "trg" columns are used; no edge attributes are passed on.
    """
    # Evaluate the directedness flag once; it is needed for reading and for scoring.
    is_undirected = network == "cs"

    table, original_nodes, original_edges = backboning.read(edge_list_path, network, undirected = is_undirected)

    edge_table = settings_tab2.measures[measure](table, undirected = is_undirected) # backboning takes place here

    # Look the threshold up once and compare against None by identity.
    threshold = settings_tab2.fixedges_thresholds[measure][network]
    if threshold is not None:
        edge_table_thresholded = backboning.thresholding(edge_table, threshold)
        print(f"Threshold used is {threshold}.\n") #set by Coscia et al. Unclear where it comes from.
    else:
        print("Threshold was NONE.")
        # No threshold configured: keep every scored edge.
        edge_table_thresholded = edge_table.copy()

    # Nodes that still have at least one edge after thresholding, and the remaining edge rows.
    original_nodes_thresh = len(set(edge_table_thresholded["src"]) | set(edge_table_thresholded["trg"]))
    original_edges_thresh = edge_table_thresholded.shape[0]

    print(f"Number of original nodes pre-thresholding: {original_nodes}")
    print(f"Number of original edges pre-thresholding: {int(original_edges)} \n")
    print(f"Number of original nodes post-thresholding: {original_nodes_thresh}")
    print(f"Number of original edges post-thresholding: {int(original_edges_thresh)} \n")

    print(f"Difference in nodes: {original_nodes-original_nodes_thresh}")
    print(f"Difference in edges: {int(original_edges-original_edges_thresh)}")

    G_threshold = nx.from_pandas_edgelist(edge_table_thresholded, "src", "trg")
    G = nx.from_pandas_edgelist(edge_table, "src", "trg")
    G_raw = nx.from_pandas_edgelist(table, "src", "trg")

    return G, G_threshold, G_raw