In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

import pathlib
import json

In [2]:
# Root directory that holds one sub-directory per source; every sub-directory
# is searched recursively for extracted article JSON files.
root = pathlib.Path('../quellen/artikel/extracted/')

# Collect one frame per source and concatenate once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# re-copied the whole accumulated frame on every iteration.
frames = []

for subdir in root.iterdir():
    # Plain files that sit next to the source directories are ignored.
    if not subdir.is_dir():
        continue

    name = subdir.name
    d = {
        'path': [],
        'title': [],
        'hash': [],
        'len': [],
    }

    for file in subdir.rglob('*.json'):
        with file.open(mode='r', encoding='utf-8') as f:
            j = json.load(f)
        d['path'].append(str(file))
        d['title'].append(j['title'])
        d['hash'].append(j['fingerprint'])
        # Length of the article's 'text' field.
        d['len'].append(len(j['text']))

    # A source without any JSON file contributes no rows; skipping it keeps
    # empty object-typed columns from influencing the concatenated dtypes.
    if not d['path']:
        continue

    df = pd.DataFrame(d)
    df['source'] = name
    frames.append(df)

if frames:
    # No ignore_index: each source keeps its own 0..n-1 row labels, exactly
    # as the previous append-based version wrote them to the CSV.
    metadata = pd.concat(frames)
else:
    # pd.concat raises on an empty list; fall back to an empty frame that
    # still exposes the expected columns.
    metadata = pd.DataFrame(columns=['path', 'title', 'hash', 'len', 'source'])

metadata.to_csv('text_metadata.csv')
metadata

Unnamed: 0,path,title,hash,len,source
0,../quellen/artikel/extracted/BILD/json/7/OTY7b...,Abnehmen: Warum uns Diäten sogar dicker machen,kYq6EEAgcYqFnadBpY8GQFQOV0g=,3197,BILD
1,../quellen/artikel/extracted/BILD/json/7/nlcPW...,Corona: Pandemie „hätte verhindert werden könn...,jcTLw8Lkc18vhLQTNc3VHIrtlsA=,2958,BILD
2,../quellen/artikel/extracted/BILD/json/7/Q75Ez...,„Schlag den Star“: Schnupper-Debakel: Geiss ve...,H+k1PhGNSkoQoV6MftdT8ttYLUg=,3422,BILD
3,../quellen/artikel/extracted/BILD/json/7/IVCSD...,Globus in Eschborn: Hier wächst ein 60-Mio.-Su...,cXBTuwbbo08cRwr8wmgnft3Q82Q=,470,BILD
4,../quellen/artikel/extracted/BILD/json/7/YJ59v...,Vatertag während Corona: Wo darf ich mit dem B...,JPV/cr9GJ9E3LueHGg40/WIACqA=,5753,BILD
...,...,...,...,...,...
822,../quellen/artikel/extracted/zeitonline_wir/js...,ZEIT ONLINE,lAt+EOmSwBdFEVTQtILHWmSF4rk=,435,zeitonline_wir
823,../quellen/artikel/extracted/zeitonline_wir/js...,ZEIT ONLINE,lAt+EOmSwBdFEVTQtILHWmSF4rk=,435,zeitonline_wir
824,../quellen/artikel/extracted/zeitonline_wir/js...,ZEIT ONLINE,lAt+EOmSwBdFEVTQtILHWmSF4rk=,435,zeitonline_wir
825,../quellen/artikel/extracted/zeitonline_wir/js...,ZEIT ONLINE,lAt+EOmSwBdFEVTQtILHWmSF4rk=,435,zeitonline_wir


In [3]:
# Distinct values of the 'source' column, one entry per source directory
# that contributed rows to the metadata frame.
sources = metadata.loc[:, 'source'].unique()
sources

array(['BILD', 'BILD_Politik', 'FAZ_Politik', 'FAZ_Wirtschaft',
       'FOCUS_TopNews', 'INSMPresse', 'RND_de', 'RTLde',
       'SPIEGEL_Politik', 'SZ', 'Tagesspiegel', 'derspiegel', 'faznet',
       'focusfinanzen', 'focuspolitik', 'insm', 'ntvde', 'ntvde_Politik',
       'sternde', 'tonline', 'zeitonline', 'zeitonline_pol', 'welt',
       'zeitonline_wir'], dtype=object)

In [4]:
separator = '------------------'

# Print, for every source, the number of distinct values per column followed
# by the total number of rows belonging to that source.
for source in sources:
    # Filter once and reuse the slice for both statistics.
    subset = metadata.loc[metadata['source'] == source]
    print(source)
    print(separator)
    print(subset.nunique())
    print('total:', len(subset))
    print(separator)


BILD
------------------
path      22733
title     22150
hash      22203
len        5082
source        1
dtype: int64
total: 22733
------------------
BILD_Politik
------------------
path      4782
title     4748
hash      4701
len       3025
source       1
dtype: int64
total: 4782
------------------
FAZ_Politik
------------------
path      5673
title     5672
hash      4431
len       3128
source       1
dtype: int64
total: 5673
------------------
FAZ_Wirtschaft
------------------
path      4231
title     4229
hash      3109
len       2366
source       1
dtype: int64
total: 4231
------------------
FOCUS_TopNews
------------------
path      11159
title     10722
hash      10452
len        5192
source        1
dtype: int64
total: 11159
------------------
INSMPresse
------------------
path      55
title     49
hash      49
len       49
source     1
dtype: int64
total: 55
------------------
RND_de
------------------
path      12418
title     12362
hash      12363
len        5478
source      

In [5]:
# Show one histogram of the 'len' column per source, titled with the source name.
for source in sources:
    per_source = metadata.loc[metadata['source'] == source]
    fig = px.histogram(per_source, x='len', title=source)
    fig.show()