In [1]:
from IPython.display import Markdown, display

## Links to Project Resources

- [Trello board](https://trello.com/invite/b/BWnRAtKJ/3e7ce03017000289323e762d0ed2e304/histaware)
- [Notion Wiki](https://www.notion.so/HistAware-529aba41f84946b19d493394ef6a2748)

# Part I: Text selection

In this first phase of the project, we approach the first problem of selecting texts similar texts. Intially the scope of the research is focused on texts that deal with `energy`. However, this scope might change and/or might be expanded.

**Phases of Part I:**
- **Validate the approach to the project**:
    1. Decide whether to use title and paragraphs or only one of the two
    2. Find the most efficient way to read all the xml files
    3. Begin to label a golden set of texts that are within the scope of the research AND select the most important keywords that will be used to search for similar texts
    4. Run the text similarity ML algorithm
    5. Have the teaching assistant go throught the selection and identify mistakes
- **To think about**: how to keep the relevant information about the text fragment (i.e. newspaper origin and date)?
- **Decide the tools to use for text selection**. Current choices are:
    - Use `sentence-transformers` from UKPLab (https://github.com/UKPLab/sentence-transformers)
        - Generate embeddings on sentences (max 512 words)
        - Find similar texts
    - Use `faiss` from Facebook AI (https://github.com/facebookresearch/faiss)
        - Less documentation but seemingly more scalable
    - Use ASReview from Utrecht University ()
        - A meeting with Jonathan or Raul is necessary to understand the feasibility of this approach

### Import statements

In [39]:
import numpy as np
import pandas as pd
import logging
import re
from datetime import datetime
import xml.etree.ElementTree as et 
import collections
import sys
import os
import gzip
import shutil
import xmltodict
import pathlib
from itertools import chain
import xml.etree.ElementTree as ET
from IPython.display import display, clear_output

%matplotlib inline
%config InlineBackend.figure_format='retina'


#### Just some code to print debug information to stdout
np.set_printoptions(threshold=100)

logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO)
#### /print debug information to stdout

# Find path of data folder
main_path = sys.path
# To go back to main folder
sys.path.insert(0, "..")

## Delpher Dataset

### Create a catalogue of the files

#### Find the location of each article

We save the file path and the file name into a dictionary. Then we transform the dictionary into a DataFrame so that we can later keep track of the index at which the parsing got stopped/interrupted (Dictionaries in Python do not have an order)

In [3]:
def iterate_directory(path_dir, file_type):
    """Iterate over the `path_dir` and its children and
    create a dictionary of
        - name
        - path
        - dir
    names of files found
    """
    rootdir = main_path[0]+path_dir
    file_names = {}
    list_names = []

    for subdir, dirs, files in os.walk(rootdir, topdown=True):
        for file in files:
            filepath = subdir + os.sep + file
            if filepath.endswith(str(file_type)):
                file_names["article_name"] = file
                file_names["article_path"] = filepath
                file_names["article_dir"] = subdir
                list_names.append(file_names)
                file_names = {}

    return(list_names)

In [4]:
xml_article_names = iterate_directory("/data/1950/",".xml")
article_names = pd.DataFrame.from_dict(xml_article_names)
article_names.reset_index(inplace=True)

#### Find location of each metadata and "ungizp" them

In [5]:
def iterate_directory_gz(path_dir,file_type):
    """Iterate over the `path_dir` and its children and
    create a dictionary of
        - name
        - path
        - dir
        - content
    of .gz files found.
    """
    rootdir = main_path[0]+path_dir
    gz_content = {}
    list_gzs = []
    
    for subdir, dirs, files in os.walk(rootdir, topdown=True):
        for file in files:
            filepath = subdir + os.sep + file
            if filepath.endswith(str(file_type)):
                # Create list of dict
                with gzip.open(filepath, 'rb') as f:
                #, \
                #open(filepath+".xml", "wb") as r:
                    gz_content["metadata_name"] = file+".xml"
                    gz_content["metadata_dir"] = subdir
                    gz_content["metadata_path"] = filepath+".xml"
                    # Ungzipping and writing to .xml
                    #shutil.copyfileobj(f, r, 65536)
                    
                    list_gzs.append(gz_content)
                    gz_content = {}
    
    return(list_gzs)

In [6]:
gz_metadata_files = iterate_directory_gz("/data/1950/",".gz")
metadata_files = pd.DataFrame.from_dict(gz_metadata_files)
metadata_files.reset_index(inplace=True)

### Parse XML

#### Parse articles

In [7]:
def parse_XML_article(path, art_dir, title, index):
    """Parse the input XML file and store the result in a pandas 
    DataFrame with the given columns. 
    
    Takes the filepath, file title and index integer of the df
    """
    
    xtree = et.parse(path)
    xroot = xtree.getroot()
    article = {}
    dict_articles = {}
    
    # Parse the date with regex
    match = re.search(r'\d{4}[/]\d{2}[-]\d{2}', path)
    date = datetime.strptime(match.group(), '%Y/%m-%d').date()
    
    for i, node in enumerate(xroot):
        article["article_name"] = str(title)
        article["date"] = str(date)
        article["index"] = index
        article["filepath"] = path
        article["dir"] = art_dir
        if node.tag != "p":
            article[node.tag] = node.text
        else:
            article[node.tag+"_"+str(i)] = node.text
        dict_articles[index] = article

    # Returns dict of dicts to speed up the parsing
    return dict_articles

In [8]:
def parse_XML_article_list(path, art_dir, title, index):
    """Parse the input XML file and store the result in a pandas 
    DataFrame with the given columns. 
    
    Takes the filepath, file title and index integer of the df
    """
    
    xtree = et.parse(path)
    xroot = xtree.getroot()
    list_articles = []
    
    # Parse the date with regex
    match = re.search(r'\d{4}[/]\d{2}[-]\d{2}', path)
    date = datetime.strptime(match.group(), '%Y/%m-%d').date()
    
    for i, node in enumerate(xroot):
        if node.tag == "title":
            article = {}
            article["type"] = "title"
            article["text"] = node.text
            article["article_name"] = str(title)
            article["date"] = str(date)
            article["index"] = index
            article["filepath"] = path
            article["dir"] = art_dir
            list_articles.append(article)
        else:
            article = {}
            article["type"] = "p"
            article["text"] = node.text
            article["article_name"] = str(title)
            article["date"] = str(date)
            article["index"] = index
            article["filepath"] = path
            article["dir"] = art_dir
            list_articles.append(article)

    # Returns list of dict of articles and titles
    return list_articles

#### Parse metadata

In [99]:
#Test
#temp_data = doc["didl:DIDL"]["didl:Item"]["didl:Component"][0]["didl:Resource"]["srw_dc:dcx"]

In [9]:
def parse_XML_metadata(path, met_dir, title, index):
    """Parse the input XML file and store the result in a pandas 
    DataFrame with the given columns. 
    
    Takes the filepath, file title and index integer of the df
    """
    metadata = {}
    dict_metadata = {}
    
    # Parse the date with regex
    match = re.search(r'\d{4}[/]\d{2}[-]\d{2}', path)
    date = datetime.strptime(match.group(), '%Y/%m-%d').date()
    
    # Parse DIDL XML
    with open(pathlib.Path(path), 'r') as f:
        doc = xmltodict.parse(f.read())
    temp_data = doc["didl:DIDL"]["didl:Item"]["didl:Component"][0]["didl:Resource"]["srw_dc:dcx"]

    metadata["metadata_title"] = title
    metadata["date"] = date
    metadata["index"] = index
    metadata["filepath"] = path
    metadata["dir"] = met_dir
    
    # Retrieve informations about the newspaper
    metadata["newspaper_title"] = temp_data["dc:title"]
    metadata["newspaper_date"] = temp_data["dc:date"]
    metadata["newspaper_city"] = temp_data["dcterms:spatial"][1]["#text"]
    metadata["newspaper_publisher"] = temp_data["dc:publisher"]
    metadata["newspaper_source"] = temp_data["dc:source"]
    metadata["newspaper_volume"] = temp_data["dcx:volume"]
    metadata["newspaper_issuenumber"] = temp_data["dcx:issuenumber"]
    metadata["newspaper_language"] = temp_data["dc:language"]["#text"]
    
    dict_metadata[index] = metadata

    return(dict_metadata)

**Utils Addendum**

To search for an `article_path` or `article_name` given the other, use the following:

In [301]:
#a = df_file_names.loc[df_file_names['article_name'] == "DDD_110637387_0004_articletext.xml"]
#a = df_file_names.iloc[0]
c = df_file_names.iloc[500000]

### Iterate through the files given

Currently, this loop takes ~0.012s for each parsing. This is extremely slow and it's not due to the `parse_XML` function (which is efficient), but instead it's because of the `concat` between series. 

In this way 100.000 documents take around 20 minutes to be parsed.
- If possible, substitute the concat statement with something more efficient!

In [57]:
from itertools import chain

def iterate_files_list(files, restart=False, index_restart=0):
    """Iterate through files `files`, parse them and concatenate
    the result to be saved as a DataFrame in a feather object (.ftr)
    """
    list_articles = []
    # Every 10000 articles create a ftr file
    save_each = 10000
    
    if restart == False:
        main = None
        previous_i = 0
        current_i = 0
        i = 0
        n = 0
        cnt = 0
        for index, row in files.iterrows():
            try:
                list_articles.append(parse_XML_article_list(
                    path = row["article_path"],
                    art_dir = row["article_dir"], 
                    title = row["article_name"],
                    index = row["index"]))
            except Exception as e:
                print(f"Index: {index}", e.args)
                continue
            # Each X, save the file in a .ftr
            if (i == save_each):
                current_i = current_i + i
                file_path = main_path[0]+"/data/processed/processed_articles/processed_data_list_"+str(previous_i)+"_"+str(current_i)+".ftr"
                main = pd.DataFrame(list(chain.from_iterable(list_articles)))
                main.to_feather(file_path)
                main = None
                list_articles = []
                previous_i = current_i
                i = 0
            # Each 1000 files, print the progress
            if (i % 2000 == 0):
                clear_output(wait=True)
                display("Files parsed: "+str(2000*cnt))
                display("Current file: "+row["article_name"]+" (Index: "+str(row["index"])+")")
                cnt += 1
            i += 1
    if restart == True:
        main = None
        previous_i = index_restart
        current_i = index_restart
        i = 0
        n = 0
        cnt = index_restart/2000
        for index, row in files.iloc[index_restart:].iterrows():
            try:
                list_articles.append(parse_XML_article_list(
                    path = row["article_path"],
                    art_dir = row["article_dir"], 
                    title = row["article_name"],
                    index = row["index"]))
            except Exception as e:
                print(f"Index: {index}", e.args)
                continue
            # Each X, save the file in a .ftr
            if (i == save_each):
                current_i = current_i + i
                file_path = main_path[0]+"/data/processed/processed_articles/processed_data_list_"+str(previous_i)+"_"+str(current_i)+".ftr"
                main = pd.DataFrame(list(chain.from_iterable(list_articles)))
                main.to_feather(file_path)
                main = None
                list_articles = []
                previous_i = current_i
                i = 0
            # Each 1000 files, print the progress
            if (i % 2000 == 0):
                clear_output(wait=True)
                display("Files parsed: "+str(2000*cnt))
                display("Current file: "+row["article_name"]+"(Index: "+str(row["index"])+")")
                cnt += 1
            i += 1

In [59]:
def iterate_metadata(files):
    """Iterate through files `files`, parse them and concatenate
    the result to be saved as a DataFrame in a feather object (.ftr)
    """
    main = None
    previous_i = 0
    current_i = 0
    i = 0
    n = 0
    cnt = 0
    dict_metadata = {}
    
    for index, row in files.iterrows():
        try:
            dict_metadata.update(
                parse_XML_metadata(
                    path = row["metadata_path"],
                    met_dir = row["metadata_dir"], 
                    title = row["metadata_name"],
                    index = row["index"]))
        except Exception:
            print(f"Index: {index}", e.args)
            continue
        # Each X, save the file in a .ftr
        if (i == 1000):
            current_i = current_i + i
            file_path = main_path[0]+"/data/processed/processed_metadata/processed_metadata_"+str(previous_i)+"_"+str(current_i)+".ftr"
            main = pd.DataFrame.from_dict(dict_metadata).T.reset_index()
            main.to_feather(file_path)
            main = None
            dict_metadata = {}
            previous_i = current_i
            i = 0
        # Each 100 files, print the progress
        if (i % 100 == 0):
            clear_output(wait=True)
            display("Files parsed: "+str(2000*cnt))
            display("Current file: "+row["metadata_name"]+" (Index: "+str(row["index"])+")")
            cnt += 1
        i += 1

In [63]:
iterate_files_list(article_names)

'Files parsed: 64000'

'Current file: DDD_011155446_0029_articletext.xml (Index: 64002)'

KeyboardInterrupt: 

In [300]:
iterate_metadata(metadata_files)

Files parsed: 0
Current file: DDD:ddd:110637387:mpeg21.didl.xml.gz.xml

Files parsed: 50
Current file: DDD:ddd:010865705:mpeg21.didl.xml.gz.xml

Files parsed: 100
Current file: DDD:ddd:010891636:mpeg21.didl.xml.gz.xml

Files parsed: 150
Current file: DDD:ddd:010480615:mpeg21.didl.xml.gz.xml

Files parsed: 200
Current file: DDD:ddd:010480574:mpeg21.didl.xml.gz.xml

Files parsed: 250
Current file: DDD:ddd:011155446:mpeg21.didl.xml.gz.xml

Files parsed: 300
Current file: DDD:ddd:010537413:mpeg21.didl.xml.gz.xml

Files parsed: 350
Current file: DDD:ddd:110637339:mpeg21.didl.xml.gz.xml

Files parsed: 400
Current file: DDD:ddd:010950304:mpeg21.didl.xml.gz.xml

Files parsed: 450
Current file: DDD:ddd:011202192:mpeg21.didl.xml.gz.xml

Files parsed: 500
Current file: DDD:ddd:010852499:mpeg21.didl.xml.gz.xml

Files parsed: 550
Current file: DDD:ddd:010862555:mpeg21.didl.xml.gz.xml

Files parsed: 600
Current file: DDD:ddd:010865671:mpeg21.didl.xml.gz.xml

Files parsed: 650
Current file: DDD:ddd:0

## Text selection model

## Ingest parsed files previously saved

Once we parse all the files present in the example `data-1950` folder, we produce 65 files containing the parsed original data into a format which is more easily readable by a machine. The total weight of the files is 65*10=650MB which is a 5x reduction from the original size of the dataset.

In [14]:
# https://www.sbert.net/docs/
from sentence_transformers import SentenceTransformer, LoggingHandler, util

# These are the pure transformers from huggingface
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# For saving
import pickle
import csv

# Set searborn settings
rcParams['figure.figsize'] = 12, 8

# Set fixed random seed
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Find GPU on device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

2020-09-16 09:24:58 - PyTorch version 1.6.0 available.


## Read saved files

#### Retrieve all the names of the ftr files saved

In [64]:
ftr_articles = iterate_directory("/data/processed/processed_articles",".ftr")
ftr_articles = pd.DataFrame(ftr_articles)
ftr_articles.rename({'article_name': 'ftr_name', 'article_path': 'ftr_path', 'article_dir': 'ftr_dir'}, axis=1, inplace=True)

In [65]:
ftr_metadata = iterate_directory("/data/processed/processed_metadata",".ftr")
ftr_metadata = pd.DataFrame(ftr_metadata)
ftr_metadata.rename({'article_name': 'ftr_name', 'article_path': 'ftr_path', 'article_dir': 'ftr_dir'}, axis=1, inplace=True)

#### Retrieve all the content of the files into a list format

Read one ftr file as a test

In [78]:
def iterate_ftr(df):
    result = pd.DataFrame()
    
    for index, row in df.iterrows():
        ftr = pd.read_feather(row["ftr_path"])
        result = result.append(ftr)
    
    return(result)

In [83]:
df_articles = iterate_ftr(ftr_articles)
df_articles.sort_values(by=["index"], ascending=True)
df_articles.rename({"filepath": "article_filepath", "index": "index_article"}, axis=1, inplace=True)

In [84]:
df_metadata = iterate_ftr(ftr_metadata)
df_metadata.drop(["level_0", "date"], axis=1, inplace=True)
df_metadata.rename({"filepath": "metadata_filepath", "index": "index_metadata"}, axis=1, inplace=True)

Merge articles and metadata in one single file

In [85]:
df_joined = df_articles.merge(df_metadata, how='left', on='dir')

#### Now we have (one)a merged file

This will be needed to be done recursively, for all files present in the database.

So efficiency is key here!!!

### Find synonym(s) for the key search word(s)

In [87]:
import nl_core_news_lg
import spacy
from tqdm import tqdm
import hdbscan

In [88]:
nlp = nl_core_news_lg.load()

#### Create a list with only the text (paragraphs) and not the other variables

Retrieve all the paragraphs into one single file

In [80]:
def list_paragraphs(df):
    list_p = []

    for index, row in tqdm(df.iterrows()):
        for i in range(1,df.shape[1]):
            p = "p_"+str(i)
            try:
                if row[p] and row[p] is not None:
                    list_p.append(row[p])
            except KeyError as e:
                continue

    return(list_p)

In [81]:
def list_title(df):
    list_titles = []

    for index, row in tqdm(df.iterrows()):
        try:
            if row["title"] and row["title"] is not None:
                list_titles.append(row["title"])
        except KeyError as e:
            continue

    return(list_titles)

In [82]:
pars = list_paragraphs(df_test_joined)
titles = list_title(df_test_joined)

30001it [00:04, 6845.35it/s]


In [52]:
df_pars = pd.DataFrame(pars).reset_index()
df_pars.rename({0: 'text'}, axis=1, inplace=True)

In [26]:
df_titles = pd.DataFrame(titles).reset_index()
df_titles.rename({0: 'text'}, axis=1, inplace=True)

In [102]:
def search_synonyms(word, df, n):
    """Find all texts in which a synonym of the word appears.
    
    Takes:
        - string (word)
        - dataframe in which to search
        - The total number of synonym to retrieve
    """
    result = pd.DataFrame()
    
    ms = nlp.vocab.vectors.most_similar(np.asarray([nlp.vocab.vectors[nlp.vocab.strings[word]]]), n=n)
    synonyms = [nlp.vocab.strings[w] for w in ms[0][0]]
    print(f"Searching using the following synonyms of {word}:")
    print(synonyms)
    df.dropna(subset=['text'], inplace=True)
    
    for syn in tqdm(synonyms):
        result = result.append(df[df["text"].str.contains(syn, 
                                                          case=False,
                                                          regex=False)
                                 ]
                              )
    return result

In [103]:
a = search_synonyms("energie", df_joined, 50)
a.drop_duplicates(ignore_index=True, inplace=True)

Searching using the following synonyms of energie:
['energie', 'oerenergie', 'Cenergie', 'energieeen', 'energieen', 'energi', 'aarde-energie', 'lichtenergie', 'levensenergie', 'energiestroom', 'energie-boost', 'energiedip', 'hulpenergie', 'warmte-energie', 'Bio-energie', 'zonneenergie', 'Reiki-energie', 'hartenergie', 'remenergie', 'aardenergie', 'energievorm', 'energiegolf', 'energiestoot', 'energievol', 'energiegolven', 'bewegingsenergie', 'bio-energie', 'energiecellen', 'basisenergie', 'energievolle', 'energieën', 'energiebron', 'zonenergie', 'stralingsenergie', 'énergie', 'energiebalans', 'energiestromen', 'lichaamsenergie', 'energiebewustzijn', 'energietoevoer', 'energiemix', 'groepsenergie', 'energieboost', 'energievreter', 'Levensenergie', 'waterenergie', 'energie-opwekking', 'energierijk', 'energieverbuik', 'energiebronnen']


100%|██████████| 50/50 [01:57<00:00,  2.35s/it]


Using the row that were found, select the entire record from the merged dataframe

In [106]:
b

Unnamed: 0,type,text,article_name,date,index_article,article_filepath,dir,metadata_title,index_metadata,metadata_filepath,newspaper_title,newspaper_date,newspaper_city,newspaper_publisher,newspaper_source,newspaper_volume,newspaper_issuenumber,newspaper_language
0,p,ét Stoelrompenfabrlek te Bergschenhoek vraagt ...,DDD_010950576_0040_articletext.xml,1950-10-26,30033,../data/1950/10-26/DDD_010950576/DDD_010950576...,../data/1950/10-26/DDD_010950576,DDD:ddd:010950576:mpeg21.didl.xml.gz.xml,275.0,../data/1950/10-26/DDD_010950576/DDD:ddd:01095...,Het vrĳe volk : democratisch-socialistisch dag...,1950-10-26,Rotterdam,De Arbeiderspers,Gemeentearchief Rotterdam,6,1661,nl
1,p,"De „Sibajak"" lag vaarklaar aan de trossen, We,...",DDD_010950576_0162_articletext.xml,1950-10-26,30041,../data/1950/10-26/DDD_010950576/DDD_010950576...,../data/1950/10-26/DDD_010950576,DDD:ddd:010950576:mpeg21.didl.xml.gz.xml,275.0,../data/1950/10-26/DDD_010950576/DDD:ddd:01095...,Het vrĳe volk : democratisch-socialistisch dag...,1950-10-26,Rotterdam,De Arbeiderspers,Gemeentearchief Rotterdam,6,1661,nl
2,p,(:: --■:;- v.rr: ■■■--•■■■.■ : >-. Bekwame Aut...,DDD_010950576_0057_articletext.xml,1950-10-26,30042,../data/1950/10-26/DDD_010950576/DDD_010950576...,../data/1950/10-26/DDD_010950576,DDD:ddd:010950576:mpeg21.didl.xml.gz.xml,275.0,../data/1950/10-26/DDD_010950576/DDD:ddd:01095...,Het vrĳe volk : democratisch-socialistisch dag...,1950-10-26,Rotterdam,De Arbeiderspers,Gemeentearchief Rotterdam,6,1661,nl
3,p,ét Stoelrompenfabrlek te Bergschenhoek vraagt ...,DDD_010950576_0058_articletext.xml,1950-10-26,30050,../data/1950/10-26/DDD_010950576/DDD_010950576...,../data/1950/10-26/DDD_010950576,DDD:ddd:010950576:mpeg21.didl.xml.gz.xml,275.0,../data/1950/10-26/DDD_010950576/DDD:ddd:01095...,Het vrĳe volk : democratisch-socialistisch dag...,1950-10-26,Rotterdam,De Arbeiderspers,Gemeentearchief Rotterdam,6,1661,nl
4,p,Kaapland v Z Afrika Khyber v Japan 5 Nov - - ....,DDD_010950576_0181_articletext.xml,1950-10-26,30088,../data/1950/10-26/DDD_010950576/DDD_010950576...,../data/1950/10-26/DDD_010950576,DDD:ddd:010950576:mpeg21.didl.xml.gz.xml,275.0,../data/1950/10-26/DDD_010950576/DDD:ddd:01095...,Het vrĳe volk : democratisch-socialistisch dag...,1950-10-26,Rotterdam,De Arbeiderspers,Gemeentearchief Rotterdam,6,1661,nl
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651,p,"Het ~Museum voor het Onderwijs in Den Haag"" is...",DDD_011199721_0056_articletext.xml,1950-03-07,43224,../data/1950/03-07/DDD_011199721/DDD_011199721...,../data/1950/03-07/DDD_011199721,DDD:ddd:011199721:mpeg21.didl.xml.gz.xml,402.0,../data/1950/03-07/DDD_011199721/DDD:ddd:01119...,De Tĳd : godsdienstig-staatkundig dagblad,1950-03-07,'s-Hertogenbosch,Gebr. Verhoeven,Koninklijke Bibliotheek C 236,105,34371,nl
652,p,Na de pauze speelde Willem Hielkema Tschaikows...,DDD_010537470_0093_articletext.xml,1950-10-21,23397,../data/1950/10-21/DDD_010537470/DDD_010537470...,../data/1950/10-21/DDD_010537470,DDD:ddd:010537470:mpeg21.didl.xml.gz.xml,217.0,../data/1950/10-21/DDD_010537470/DDD:ddd:01053...,De Heerenveensche koerier : onafhankelĳk dagbl...,1950-10-21,Leeuwarden [etc.],Stichting Je Maintiendrai Friesland,KB C 199,6,248,nl
653,p,DERDE KLASSE A: Terrasvogels—Schagen 2—o; HMS—...,DDD_010852520_0115_articletext.xml,1950-04-11,18092,../data/1950/04-11/DDD_010852520/DDD_010852520...,../data/1950/04-11/DDD_010852520,DDD:ddd:010852520:mpeg21.didl.xml.gz.xml,167.0,../data/1950/04-11/DDD_010852520/DDD:ddd:01085...,De waarheid,1950-04-11,Amsterdam,s.n.,Internationaal Instituut voor Sociale Geschied...,9,287,nl
654,p,KAMPIOENSCH VAN NEDERLAND Maurits—Blauw Wit. A...,DDD_010850977_0129_articletext.xml,1950-03-31,57221,../data/1950/03-31/DDD_010850977/DDD_010850977...,../data/1950/03-31/DDD_010850977,DDD:ddd:010850977:mpeg21.didl.xml.gz.xml,530.0,../data/1950/03-31/DDD_010850977/DDD:ddd:01085...,De waarheid,1950-03-31,Amsterdam,s.n.,Internationaal Instituut voor Sociale Geschied...,9,279,nl


In [108]:
# Save to CSV
b.to_csv(main_path[0]+"/data"+"/"+"energie_"+"search_2020.csv",
            sep=",",
            quotechar='"',
            index=False)

#with open(path[0]+"/data"+"/list_sentences.csv", 'w') as myfile:
#    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
#    wr.writerow(list_sentences)

#with open('list_sentences.pkl', "wb") as fOut:
#    pickle.dump(list_sentences, fOut, protocol=pickle.HIGHEST_PROTOCOL)

### Use the multilingual model pre-trained on 10+ languages

### Play around with `SBERT`

The model is the `distiluse-base-multilingual-cased` model. From [sbert]( https://www.sbert.net/docs/pretrained_models.html)

In [110]:
# Create embeddings
model = SentenceTransformer('distiluse-base-multilingual-cased', device=device)

# Load paragraphs
sentences = b

#Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

#Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

2020-09-16 11:24:09 - Load pretrained SentenceTransformer: distiluse-base-multilingual-cased
2020-09-16 11:24:09 - Did not find a '/' or '\' in the name. Assume to download model from server.
2020-09-16 11:24:09 - Load SentenceTransformer from folder: /Users/leonardovida/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_distiluse-base-multilingual-cased.zip
2020-09-16 11:24:09 - loading configuration file /Users/leonardovida/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_distiluse-base-multilingual-cased.zip/0_DistilBERT/config.json
2020-09-16 11:24:09 - Model config DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_hidden_states": true,
  "output_past": true,


KeyError: 6

In [34]:
# Sentence mining from sentence-transformers
sentences = sen[1:40]
paraphrases = util.paraphrase_mining(
    model,
    sentences,
    corpus_chunk_size=20, #len(sentences)
    query_chunk_size=20,
    top_k=20,
    max_pairs=5)

for paraphrase in paraphrases[0:10]:
    score, i, j = paraphrase
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences[i], sentences[j], score))

Maar het waren juist deze geheime gegevens, die Louwers in handen wilde krijgen. Want de Tsjechoslowaakse industrie heeft op dit gebied een reusachtige voorsprong. Of nemen wij Philips! Toen de Duitsers ons land binnenvielen, vertrokken uit Eindhoven 30 auto's, volgeladen met geheime papieren: de productiegeheimen van Philips! Die moesten in veiligheid. De rest was niet belangrijk. Geen mens buiten de vesting van Philips komt over deze geheimen iets te weten. En binnen de vesting is een wetenschappelijk systeem uitgewerkt, dat er voor zorgt, dat de lagere ingenieurs en technici slechts gedeelten van een bepaald productieproces leren kennen, waarmee zij niets kunnen beginnen. Verteld wordt, dat eens een delegatie van Japanse belangstellenden Philips kwam bezoeken. De heren werden „overal" rondgeleid, ze kregen een pracht van een diner aangeboden. Maar de Philipsdirectie vermoedde dat sommigen van hen micro-fototoestellen by zich hadden. Fouilleren kon men de „geëerde gasten" niet. Dus w

In [None]:
#with open('paraphrase_test.pkl', "wb") as fOut:
#    pickle.dump(paraphrase, fOut, protocol=pickle.HIGHEST_PROTOCOL)

In [25]:
# Save embeddings
import pickle

with open('embeddings.pkl', "wb") as fOut:
    pickle.dump({'sentences': sentences, 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

    
    
#Load sentences & embeddings from disc
with open('embeddings.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)
    stored_sentences = stored_data['sentences']
    stored_embeddings = stored_data['embeddings']

Maar het waren juist deze geheime gegevens, die Louwers in handen wilde krijgen. Want de Tsjechoslowaakse industrie heeft op dit gebied een reusachtige voorsprong. Of nemen wij Philips! Toen de Duitsers ons land binnenvielen, vertrokken uit Eindhoven 30 auto's, volgeladen met geheime papieren: de productiegeheimen van Philips! Die moesten in veiligheid. De rest was niet belangrijk. Geen mens buiten de vesting van Philips komt over deze geheimen iets te weten. En binnen de vesting is een wetenschappelijk systeem uitgewerkt, dat er voor zorgt, dat de lagere ingenieurs en technici slechts gedeelten van een bepaald productieproces leren kennen, waarmee zij niets kunnen beginnen. Verteld wordt, dat eens een delegatie van Japanse belangstellenden Philips kwam bezoeken. De heren werden „overal" rondgeleid, ze kregen een pracht van een diner aangeboden. Maar de Philipsdirectie vermoedde dat sommigen van hen micro-fototoestellen by zich hadden. Fouilleren kon men de „geëerde gasten" niet. Dus w

### Playing around with BERTje

In [18]:
from transformers import BertTokenizer, BertModel


tokenizer = BertTokenizer.from_pretrained("wietsedv/bert-base-dutch-cased")
model = BertModel.from_pretrained("wietsedv/bert-base-dutch-cased")

2020-09-02 11:23:32 - Lock 5397104528 acquired on /Users/leonardovida/.cache/torch/transformers/75d9be4cc7910048b3bdd477c435ffc46330193705f74eaf9a4f375cd3be28b2.1e00a56207196ed1759c49bdd1fa93c2fb20273d59fabb0c4c8092f7beb773c2.lock
2020-09-02 11:23:32 - https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/vocab.txt not found in cache or force_download set to True, downloading to /Users/leonardovida/.cache/torch/transformers/tmppdbo_bik


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=241440.0, style=ProgressStyle(descripti…

2020-09-02 11:23:33 - storing https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/vocab.txt in cache at /Users/leonardovida/.cache/torch/transformers/75d9be4cc7910048b3bdd477c435ffc46330193705f74eaf9a4f375cd3be28b2.1e00a56207196ed1759c49bdd1fa93c2fb20273d59fabb0c4c8092f7beb773c2
2020-09-02 11:23:33 - creating metadata file for /Users/leonardovida/.cache/torch/transformers/75d9be4cc7910048b3bdd477c435ffc46330193705f74eaf9a4f375cd3be28b2.1e00a56207196ed1759c49bdd1fa93c2fb20273d59fabb0c4c8092f7beb773c2
2020-09-02 11:23:33 - Lock 5397104528 released on /Users/leonardovida/.cache/torch/transformers/75d9be4cc7910048b3bdd477c435ffc46330193705f74eaf9a4f375cd3be28b2.1e00a56207196ed1759c49bdd1fa93c2fb20273d59fabb0c4c8092f7beb773c2.lock
2020-09-02 11:23:33 - loading file https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/vocab.txt from cache at /Users/leonardovida/.cache/torch/transformers/75d9be4cc7910048b3bdd477c435ffc46330193705f74




2020-09-02 11:23:34 - Lock 5357248464 acquired on /Users/leonardovida/.cache/torch/transformers/6702c5c53edb76b65d71f73ff2d9811ba62f16257ea58e36dedceffd71290a6a.1a78bd120fe46d78b55efa59f4ffa1dafcc9242743ab9fd6629d1b56672c9119.lock
2020-09-02 11:23:34 - https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/config.json not found in cache or force_download set to True, downloading to /Users/leonardovida/.cache/torch/transformers/tmpqyn8q3s_


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…

2020-09-02 11:23:34 - storing https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/config.json in cache at /Users/leonardovida/.cache/torch/transformers/6702c5c53edb76b65d71f73ff2d9811ba62f16257ea58e36dedceffd71290a6a.1a78bd120fe46d78b55efa59f4ffa1dafcc9242743ab9fd6629d1b56672c9119
2020-09-02 11:23:34 - creating metadata file for /Users/leonardovida/.cache/torch/transformers/6702c5c53edb76b65d71f73ff2d9811ba62f16257ea58e36dedceffd71290a6a.1a78bd120fe46d78b55efa59f4ffa1dafcc9242743ab9fd6629d1b56672c9119
2020-09-02 11:23:34 - Lock 5357248464 released on /Users/leonardovida/.cache/torch/transformers/6702c5c53edb76b65d71f73ff2d9811ba62f16257ea58e36dedceffd71290a6a.1a78bd120fe46d78b55efa59f4ffa1dafcc9242743ab9fd6629d1b56672c9119.lock
2020-09-02 11:23:34 - loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/config.json from cache at /Users/leonardovida/.cache/torch/transformers/6702c5c53edb76b65d71f73ff2d




2020-09-02 11:23:35 - Lock 5390180048 acquired on /Users/leonardovida/.cache/torch/transformers/e5754f612ca0f16edba5b775fdddba806751f5e4b87c5e7f16cc0c8d8d17df4d.b7c03627733fd0712f078a4d3a31ad964550f50a6113efdf874ecbcf5ddf6b53.lock
2020-09-02 11:23:35 - https://cdn.huggingface.co/wietsedv/bert-base-dutch-cased/pytorch_model.bin not found in cache or force_download set to True, downloading to /Users/leonardovida/.cache/torch/transformers/tmp1wjgs76_


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=438869143.0, style=ProgressStyle(descri…

2020-09-02 11:24:36 - storing https://cdn.huggingface.co/wietsedv/bert-base-dutch-cased/pytorch_model.bin in cache at /Users/leonardovida/.cache/torch/transformers/e5754f612ca0f16edba5b775fdddba806751f5e4b87c5e7f16cc0c8d8d17df4d.b7c03627733fd0712f078a4d3a31ad964550f50a6113efdf874ecbcf5ddf6b53
2020-09-02 11:24:36 - creating metadata file for /Users/leonardovida/.cache/torch/transformers/e5754f612ca0f16edba5b775fdddba806751f5e4b87c5e7f16cc0c8d8d17df4d.b7c03627733fd0712f078a4d3a31ad964550f50a6113efdf874ecbcf5ddf6b53
2020-09-02 11:24:36 - Lock 5390180048 released on /Users/leonardovida/.cache/torch/transformers/e5754f612ca0f16edba5b775fdddba806751f5e4b87c5e7f16cc0c8d8d17df4d.b7c03627733fd0712f078a4d3a31ad964550f50a6113efdf874ecbcf5ddf6b53.lock
2020-09-02 11:24:36 - loading weights file https://cdn.huggingface.co/wietsedv/bert-base-dutch-cased/pytorch_model.bin from cache at /Users/leonardovida/.cache/torch/transformers/e5754f612ca0f16edba5b775fdddba806751f5e4b87c5e7f16cc0c8d8d17df4d.b7c036




2020-09-02 11:24:40 - All model checkpoint weights were used when initializing BertModel.

2020-09-02 11:24:40 - All the weights of BertModel were initialized from the model checkpoint at wietsedv/bert-base-dutch-cased.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use BertModel for predictions without further training.
