In [15]:
import pandas as pd
import os
import numpy as np
import networkx as nx
import igraph as ig
import glob
from measures.BCC import BCC
from measures.CC import ClusteringCoefficient
from measures.EC import EmbeddingControversy
from measures.GMCK import BoundaryConnectivity
from measures.MBLB import MBLB
from measures.modularity import Modularity
from measures.PI import PolarizationIndex
from measures.RWC import RWC
from polarization_analysis.utils import postings_df_to_graph, partition, normalize
from measures.utils import get_config, get_logger, get_partitions, normalize_graph, get_node_percentage
from measures.utils import __read_partition_file
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', None)

In [10]:
txtfile = open("results_postings_min_2_weight.txt")
lines = txtfile.readlines()
data = []
for line in lines:
    line = line.strip()
    elements = line.split("  ")
    if len(elements) > 3:
        data.append(elements)

df = pd.DataFrame(data=data, columns=['docid', 'mean', 'measure', 'result'])
df["result"] = pd.to_numeric(df["result"])
def filtermeasure(text):
  if text in ['EmbeddingControversy', 'CORR30', 'BC', 'BoundaryConnectivity', 'MBLB', 'Modularity', 'PolarizationIndex', 'RWC', 'UMAP30']:
    return 1
  else:
    return 0

df = df[df['measure'].apply(filtermeasure) != 0]

In [11]:
df.head(10)

Unnamed: 0,docid,mean,measure,result
1,2000067189399,mean,BoundaryConnectivity,0.172323
2,2000067189399,mean,EmbeddingControversy,3.9e-05
3,2000067189399,mean,MBLB,0.612621
4,2000067189399,mean,Modularity,0.439481
5,2000067189399,mean,PolarizationIndex,0.721775
6,2000067189399,mean,RWC,0.255764
8,2000103364196,mean,BoundaryConnectivity,0.226936
9,2000103364196,mean,MBLB,0.762456
10,2000103364196,mean,Modularity,0.472476
11,2000103364196,mean,PolarizationIndex,0.861755


In [12]:
df.describe()

Unnamed: 0,result
count,674.0
mean,0.410481
std,0.257516
min,3.9e-05
25%,0.195652
50%,0.433732
75%,0.617704
max,0.951047


In [13]:
df[['measure', 'result']].groupby(['measure']).describe()


Unnamed: 0_level_0,result,result,result,result,result,result,result,result
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
measure,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
BoundaryConnectivity,124.0,0.195396,0.043365,0.021429,0.174358,0.195085,0.225221,0.307198
EmbeddingControversy,64.0,0.017723,0.017069,3.9e-05,0.004853,0.012077,0.026445,0.074457
MBLB,124.0,0.574411,0.155332,0.124384,0.463406,0.590526,0.675513,0.881308
Modularity,124.0,0.4454,0.038828,0.112883,0.435136,0.451123,0.465223,0.488824
PolarizationIndex,124.0,0.76279,0.109862,0.188966,0.698379,0.780088,0.843317,0.951047
RWC,114.0,0.265426,0.159091,0.010126,0.138462,0.247373,0.346088,0.719943


In [16]:
posting_dirs = glob.glob('./cache/postings*')
for posting_dir in posting_dirs:
    articles = glob.glob(os.path.join('.', posting_dir, '*'))
    for article in articles:
        gml_path = glob.glob(os.path.join('.', article, '*.gml'))
        graph = nx.read_gml(gml_path[0], label='id')
        iggraph: ig.Graph = ig.read(gml_path)
        g, node_mapping = normalize(graph)
        left_path = glob.glob(os.path.join('.', article, 'left', '*.txt'))
        right_path = glob.glob(os.path.join('.', article, 'right', '*.txt'))
        left_part = __read_partition_file(left_path)
        right_part = __read_partition_file(right_path)
        doc_id = article
        measures_dict = {
            "BCC": BCC(g, iggraph, node_mapping, left_part, right_part, doc_id, cache=False),
            "BC": BoundaryConnectivity(g, iggraph, node_mapping, left_part, right_part, doc_id),
            # ClusteringCoefficient(g, iggraph, node_mapping, left_part, right_part, doc_id),
            "EC": EmbeddingControversy(g, iggraph, node_mapping, left_part, right_part, doc_id),
            "ECU(corr)": EmbeddingControversy(g, iggraph, node_mapping, left_part, right_part, doc_id, 'umap', 15, 'correlation'),
            "ECU(n30)": EmbeddingControversy(g, iggraph, node_mapping, left_part, right_part, doc_id, 'umap', 30),
            "MBLB": MBLB(g, iggraph, node_mapping, left_part, right_part, doc_id, percent=percent),
            "Modularity": Modularity(g, iggraph, node_mapping, left_part, right_part, doc_id),
            "PolarizationIndex": PolarizationIndex(g, iggraph, node_mapping, left_part, right_part, doc_id, cache=False),
            "RWC": RWC(g, iggraph, node_mapping, left_part, right_part, doc_id, percent=percent)
        }
        percent = get_node_percentage(g.number_of_nodes())

TypeError: expected str, bytes or os.PathLike object, not list

In [25]:
import glob
import os

import pandas as pd

from measures.utils import get_logger

pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', None)

logger = get_logger('main')
posting_dirs = glob.glob('./cache/postings*')
df_all = pd.DataFrame(
    columns=['measure', 'result', 'title', 'removed_edges', 'ratio_edges_removed', 'largest_cc', 'ratio_largest_cc',
             'experiment', 'doc_id'])
for posting_dir in posting_dirs:
    articles = glob.glob(os.path.join('.', posting_dir, '*'))
    experiment = posting_dir.split(os.sep)[-1]
    if experiment == 'postings_weight2':
        break
    for article in articles:
        doc_id = article.split(os.sep)[-1]
        # gml_path = glob.glob(os.path.join('.', article, '*.gml'))[0]
        # graph = nx.read_gml(gml_path, label='id')
        statistics_path = glob.glob(os.path.join('.', article, '*statistics.csv'))[0]
        df_statistics = pd.read_csv(statistics_path,
                                    names=['removed_edges', 'ratio_edges_removed', 'largest_cc', 'ratio_largest_cc'],
                                    header=0)
        scores_path = glob.glob(os.path.join('.', article, 'scores_*.csv'))[0]
        df_scores = pd.read_csv(scores_path,
                                names=['idx', 'measure', 'result'], header=0)
        df_scores = df_scores.drop(columns=['idx'])

        doc_title = glob.glob(os.path.join('.', article, '*.txt'))[0].split(os.sep)[-1].split('.txt')[0]
        df_scores['title'] = doc_title
        df_scores['removed_edges'] = df_statistics.iloc[0]['removed_edges']
        df_scores['ratio_edges_removed'] = df_statistics.iloc[0]['ratio_edges_removed']
        df_scores['largest_cc'] = df_statistics.iloc[0]['largest_cc']
        df_scores['ratio_largest_cc'] = df_statistics.iloc[0]['ratio_largest_cc']
        df_scores['experiment'] = experiment
        df_scores['doc_id'] = doc_id
        df_all = pd.concat([df_all, df_scores])

In [26]:
df_all.describe()

Unnamed: 0,result,removed_edges,ratio_edges_removed,largest_cc,ratio_largest_cc
count,44881.0,44881.0,44881.0,44881.0,44881.0
mean,0.295765,81.767563,1.758221,286.75308,0.617001
std,0.251519,102.216277,2.491356,414.718784,0.382218
min,-1.094336,0.0,0.0,3.0,0.015152
25%,0.121976,0.0,0.0,32.0,0.176136
50%,0.288625,0.0,0.0,146.0,0.865385
75%,0.448897,198.0,3.444444,189.0,0.955947
max,0.985714,288.0,11.545455,2495.0,1.0


In [22]:
df_all.head(5)

Unnamed: 0,measure,result,title,removed_edges,ratio_edges_removed,largest_cc,ratio_largest_cc,experiment,doc_id
0,BCC,0.481717,Das_Comeback_der_Theologie_im_Westen,0.0,0.0,122.0,0.953125,postings_500,2000009955231
1,BC,0.173902,Das_Comeback_der_Theologie_im_Westen,0.0,0.0,122.0,0.953125,postings_500,2000009955231
2,EC,0.33785,Das_Comeback_der_Theologie_im_Westen,0.0,0.0,122.0,0.953125,postings_500,2000009955231
3,ECU(corr),0.073312,Das_Comeback_der_Theologie_im_Westen,0.0,0.0,122.0,0.953125,postings_500,2000009955231
4,ECU(n30),0.041831,Das_Comeback_der_Theologie_im_Westen,0.0,0.0,122.0,0.953125,postings_500,2000009955231


In [27]:
df = df_all.copy()
df = df[df["measure"] == 'BC']
df.describe()

Unnamed: 0,result,removed_edges,ratio_edges_removed,largest_cc,ratio_largest_cc
count,4659.0,4659.0,4659.0,4659.0,4659.0
mean,0.147698,87.615368,1.889238,226.270659,0.591711
std,0.070325,103.333414,2.541508,350.943246,0.38377
min,-0.070988,0.0,0.0,3.0,0.015152
25%,0.108161,0.0,0.0,29.0,0.15847
50%,0.155085,0.0,0.0,134.0,0.821596
75%,0.197009,200.0,3.616667,179.0,0.945055
max,0.375,288.0,11.545455,2495.0,1.0


In [28]:
df.sort_values(axis=0, by=('result'), ascending=False).drop_duplicates(subset=['title'])

Unnamed: 0,measure,result,title,removed_edges,ratio_edges_removed,largest_cc,ratio_largest_cc,experiment,doc_id
1,BC,0.375000,Rotes_Kreuz_bangt_um_Jobs_für_Sanitäter,219.0,4.055556,35.0,0.230263,postings_500_weight2,2000011003206
28,BC,0.330272,Ich_bin_ein_Hofer-Anhänger_Diskonter-Lkws_irritieren_Kunden,0.0,0.000000,165.0,0.800971,postings_500,2000035860022
55,BC,0.328571,Eine_14-jährige_Youtuberin_ist_der_neue_Kinderstar_der_Rechtsextremen,233.0,5.547619,14.0,0.058824,postings_500_weight2,2000103161690
55,BC,0.325000,Druck_auf_Frankreichs_Behörden_wächst,245.0,4.900000,23.0,0.106481,postings_500_weight2,2000025854723
37,BC,0.303571,Grüne_wollen_Hofburg_trumpsicher_machen,211.0,4.057692,14.0,0.072539,postings_500_weight2,2000053945808
73,BC,0.303571,Evakuierung_Ost-Aleppos_soll_fortgesetzt_werden,217.0,4.018519,40.0,0.303030,postings_500_weight2,2000049468554
82,BC,0.300000,Wenn_Essen_und_Trinken_krankmacht,222.0,3.313433,42.0,0.202899,postings_500_weight2,2000077364979
82,BC,0.299812,Anschlagspläne_vereitelt_Antiterroreinsatz_in_der_Nähe_von_Paris,204.0,3.044776,38.0,0.180952,postings_500_weight2,2000033617679
82,BC,0.280952,Nach_Anschlag_in_London_Polizei_verhaftet_18-Jährigen,200.0,2.941176,46.0,0.232323,postings_500_weight2,2000064153100
1,BC,0.275000,Immer_mehr_Volksschüler_verbreiten_Sexvideos_und_Nacktbilder,248.0,5.904762,12.0,0.060606,postings_500_weight2,2000098052467


In [29]:
df = df_all.copy()
df = df[df["measure"] == 'BC']
df = df[df["experiment"] == 'postings_top']
df.sort_values(axis=0, by=('result'), ascending=False).drop_duplicates(subset=['title'])

Unnamed: 0,measure,result,title,removed_edges,ratio_edges_removed,largest_cc,ratio_largest_cc,experiment,doc_id
1,BC,0.194809,Dem_Expertenkabinett_des_Kanzlers_droht_ein_schnelles_Ende,0.0,0.0,1069.0,0.974476,postings_top,2000103578888
1,BC,0.182986,Vassilakou_Die_Botschaft_der_Wähler_ist_angekommen,0.0,0.0,1105.0,0.890411,postings_top,2000066131530
46,BC,0.178722,Red-Bull-Sender_Servus_TV_stellt_Betrieb_ein_geplanter_Betriebsrat_als_Mitgrund,0.0,0.0,1063.0,0.899323,postings_top,2000036209341
1,BC,0.178073,Harald_Vilimsky_droht_Armin_Wolf_und_erntet_Kritik_von_Medienminister_Blümel,0.0,0.0,1184.0,0.902439,postings_top,2000101935041
1,BC,0.175625,Welcher_Austausch_der_Bevölkerung_in_Österreich_tatsächlich_stattfindet,0.0,0.0,943.0,0.950605,postings_top,2000102386715
1,BC,0.174583,Richter_Bei_Kopftuchverbot_auch_kein_Kreuz_im_Gericht,0.0,0.0,969.0,0.970942,postings_top,2000050505888
1,BC,0.171820,Maurer_kündigt_Berufung_an_Notfalls_gehe_ich_bis_nach_Straßburg,0.0,0.0,836.0,0.968714,postings_top,2000088998236
1,BC,0.167422,Kurz_legt_Erdoğan-Anhängern_Verlassen_Österreichs_nahe,0.0,0.0,1283.0,0.961769,postings_top,2000041493584
1,BC,0.166611,Vorläufiges_Endergebnis_Hofer_353_Van_der_Bellen_213,0.0,0.0,1083.0,0.971300,postings_top,2000035634582
1,BC,0.166144,Wahlkarten_ausgezählt_Alexander_Van_der_Bellen_ist_Bundespräsident,0.0,0.0,1050.0,0.903614,postings_top,2000037495444


In [12]:
df_gc = df_all.copy()
df_gc[['measure', 'result']].groupby(['measure']).describe()



Unnamed: 0_level_0,result,result,result,result,result,result,result,result
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
measure,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
BC,2000.0,0.171062,0.049502,-0.070988,0.146907,0.177308,0.203511,0.330272
BCC,2000.0,0.458271,0.09119,0.259132,0.392332,0.448694,0.513578,0.739235
EC,2000.0,0.199155,0.120168,-0.157503,0.104755,0.185663,0.283807,0.55928
ECU(corr),2000.0,0.220359,0.11652,-0.848982,0.148594,0.220641,0.30196,0.50355
ECU(n30),2000.0,0.07067,0.124059,-1.094336,0.009775,0.06778,0.12887,0.549828
MBLB,2000.0,0.5039,0.13442,0.131298,0.413121,0.502134,0.593248,0.924107
Modularity,2000.0,0.417891,0.045048,0.181968,0.401613,0.426672,0.447825,0.477006
PI,2000.0,0.653659,0.141275,0.173435,0.582505,0.669076,0.753877,0.903204
RWC,2000.0,0.135414,0.238293,-0.629763,-0.023091,0.119603,0.279135,0.814018


In [30]:
df_all.to_csv('results_merged.csv')

