In [66]:
from IPython.display import Markdown, display

## Links to Project Resources

- [Trello board](https://trello.com/invite/b/BWnRAtKJ/3e7ce03017000289323e762d0ed2e304/histaware)
- [Notion Wiki](https://www.notion.so/HistAware-529aba41f84946b19d493394ef6a2748)

# Part I: Text selection

In this first phase of the project, we approach the first problem of selecting texts similar texts. Intially the scope of the research is focused on texts that deal with `energy`. However, this scope might change and/or might be expanded.

**Phases of Part I:**
- **Validate the approach to the project**:
    1. Decide whether to use title and paragraphs or only one of the two
    2. Find the most efficient way to read all the xml files
    3. Begin to label a golden set of texts that are within the scope of the research AND select the most important keywords that will be used to search for similar texts
    4. Run the text similarity ML algorithm
    5. Have the teaching assistant go throught the selection and identify mistakes
- **To think about**: how to keep the relevant information about the text fragment (i.e. newspaper origin and date)?
- **Decide the tools to use for text selection**. Current choices are:
    - Use `sentence-transformers` from UKPLab (https://github.com/UKPLab/sentence-transformers)
        - Generate embeddings on sentences (max 512 words)
        - Find similar texts
    - Use `faiss` from Facebook AI (https://github.com/facebookresearch/faiss)
        - Less documentation but seemingly more scalable
    - Use ASReview from Utrecht University ()
        - A meeting with Jonathan or Raul is necessary to understand the feasibility of this approach

### Import statements

In [67]:
import numpy as np
import pandas as pd
import logging
import re
from datetime import datetime
import xml.etree.ElementTree as et 
import collections
import sys
import os
import gzip
import shutil
import xmltodict
import pathlib
from itertools import chain
import xml.etree.ElementTree as ET

%matplotlib inline
%config InlineBackend.figure_format='retina'


#### Just some code to print debug information to stdout
np.set_printoptions(threshold=100)

logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO)
#### /print debug information to stdout

# Find path of data folder
main_path = sys.path
# To go back to main folder
sys.path.insert(0, "..")

## Delpher Dataset

### Create a catalogue of the files

#### Find the location of each article

We save the file path and the file name into a dictionary. Then we transform the dictionary into a DataFrame so that we can later keep track of the index at which the parsing got stopped/interrupted (Dictionaries in Python do not have an order)

In [68]:
def iterate_directory(path_dir, file_type):
    """Iterate over the `path_dir` and its children and
    create a dictionary of
        - name
        - path
        - dir
    names of files found
    """
    rootdir = main_path[0]+path_dir
    file_names = {}
    list_names = []

    for subdir, dirs, files in os.walk(rootdir, topdown=True):
        for file in files:
            filepath = subdir + os.sep + file
            if filepath.endswith(str(file_type)):
                file_names["article_name"] = file
                file_names["article_path"] = filepath
                file_names["article_dir"] = subdir
                list_names.append(file_names)
                file_names = {}

    return(list_names)

In [69]:
xml_article_names = iterate_directory("/data/1950/",".xml")
article_names = pd.DataFrame.from_dict(xml_article_names)
article_names.reset_index(inplace=True)

#### Find location of each metadata and "ungizp" them

In [70]:
def iterate_directory_gz(path_dir,file_type):
    """Iterate over the `path_dir` and its children and
    create a dictionary of
        - name
        - path
        - dir
        - content
    of .gz files found.
    """
    rootdir = main_path[0]+path_dir
    gz_content = {}
    list_gzs = []
    
    for subdir, dirs, files in os.walk(rootdir, topdown=True):
        for file in files:
            filepath = subdir + os.sep + file
            if filepath.endswith(str(file_type)):
                # Create list of dict
                with gzip.open(filepath, 'rb') as f:
                #, \
                #open(filepath+".xml", "wb") as r:
                    gz_content["metadata_name"] = file+".xml"
                    gz_content["metadata_dir"] = subdir
                    gz_content["metadata_path"] = filepath+".xml"
                    # Ungzipping and writing to .xml
                    #shutil.copyfileobj(f, r, 65536)
                    
                    list_gzs.append(gz_content)
                    gz_content = {}
    
    return(list_gzs)

In [97]:
gz_metadata_files = iterate_directory_gz("/data/1950/",".gz")
metadata_files = pd.DataFrame.from_dict(gz_metadata_files)
metadata_files.reset_index(inplace=True)

### Parse XML

#### Parse articles

In [104]:
def parse_XML_article(path, art_dir, title, index):
    """Parse the input XML file and store the result in a pandas 
    DataFrame with the given columns. 
    
    Takes the filepath, file title and index integer of the df
    """
    
    xtree = et.parse(path)
    xroot = xtree.getroot()
    dict_articles = []
    
    # Parse the date with regex
    match = re.search(r'\d{4}[/]\d{2}[-]\d{2}', path)
    date = datetime.strptime(match.group(), '%Y/%m-%d').date()
    
    for i, node in enumerate(xroot):
        if node.tag == "title":
            article = {}
            article["type"] = "title"
            article["text"] = node.text
            article["article_name"] = str(title)
            article["date"] = str(date)
            article["index"] = index
            article["filepath"] = path
            article["dir"] = art_dir
            dict_articles.append(article)
        else:
            article = {}
            article["type"] = "p"
            article["text"] = node.text
            article["article_name"] = str(title)
            article["date"] = str(date)
            article["index"] = index
            article["filepath"] = path
            article["dir"] = art_dir
            dict_articles.append(article)

    # Returns dict of dicts to speed up the parsing
    return dict_articles

In [105]:
parse_XML_article(path = a["article_path"][1], art_dir = a["article_dir"][1], title = a["article_name"][1], index = 1)

[{'type': 'title',
  'text': 'Aanvaring',
  'article_name': 'DDD_110637387_0039_articletext.xml',
  'date': '1950-04-27',
  'index': 1,
  'filepath': '../data/1950/04-27/DDD_110637387/DDD_110637387_0039_articletext.xml',
  'dir': '../data/1950/04-27/DDD_110637387'},
 {'type': 'p',
  'text': 'Hongkong Het Chinese 1 communistische persbureau „Nieuw China" meldt dat het £ IGOO ton metende Chinese schip ( „Hainan" in de nacht van 20 1 April in de Noord-Chinese zee- i straat Dairen en Tsjefoe in bot- < sing kwam met een 8000 ton me- > tende Amerikaanse koopvaarder c „California Bear". Nieuw Chi- * na beschuldigt het Amerikaans schip van overtreding van de in- £ ternationale navigatie bepalin- i gen ; het zou een half uur ge- l wacht hebben alvorens een boot uit te zetten om de Chinezen te c helpen. Vier en twintig overle- venden werden door het Ameri- 8 kranse schip te Tiensin aan wal r gebracht. r',
  'article_name': 'DDD_110637387_0039_articletext.xml',
  'date': '1950-04-27',
  'index': 

In [78]:
a = pd.DataFrame(article_names.loc[article_names['article_name'] == "DDD_110637387_0039_articletext.xml"])

#### Parse metadata

In [8]:
#Test
#temp_data = doc["didl:DIDL"]["didl:Item"]["didl:Component"][0]["didl:Resource"]["srw_dc:dcx"]

In [9]:
def parse_XML_metadata(path, met_dir, title, index):
    """Parse the input XML file and store the result in a pandas 
    DataFrame with the given columns. 
    
    Takes the filepath, file title and index integer of the df
    """
    metadata = {}
    dict_metadata = {}
    
    # Parse the date with regex
    match = re.search(r'\d{4}[/]\d{2}[-]\d{2}', path)
    date = datetime.strptime(match.group(), '%Y/%m-%d').date()
    
    # Parse DIDL XML
    with open(pathlib.Path(path), 'r') as f:
        doc = xmltodict.parse(f.read())
    temp_data = doc["didl:DIDL"]["didl:Item"]["didl:Component"][0]["didl:Resource"]["srw_dc:dcx"]

    metadata["metadata_title"] = title
    metadata["date"] = date
    metadata["index"] = index
    metadata["filepath"] = path
    metadata["dir"] = met_dir
    
    # Retrieve informations about the newspaper
    metadata["newspaper_title"] = temp_data["dc:title"]
    metadata["newspaper_date"] = temp_data["dc:date"]
    metadata["newspaper_city"] = temp_data["dcterms:spatial"][1]["#text"]
    metadata["newspaper_publisher"] = temp_data["dc:publisher"]
    metadata["newspaper_source"] = temp_data["dc:source"]
    metadata["newspaper_volume"] = temp_data["dcx:volume"]
    metadata["newspaper_issuenumber"] = temp_data["dcx:issuenumber"]
    metadata["newspaper_language"] = temp_data["dc:language"]["#text"]
    
    dict_metadata[index] = metadata

    return(dict_metadata)

**Utils Addendum**

To search for an `article_path` or `article_name` given the other, use the following:

In [301]:
#a = df_file_names.loc[df_file_names['article_name'] == "DDD_110637387_0004_articletext.xml"]
#a = df_file_names.iloc[0]
c = df_file_names.iloc[500000]

### Iterate through the files given

Currently, this loop takes ~0.012s for each parsing. This is extremely slow and it's not due to the `parse_XML` function (which is efficient), but instead it's because of the `concat` between series. 

In this way 100.000 documents take around 20 minutes to be parsed.
- If possible, substitute the concat statement with something more efficient!

In [114]:
from itertools import chain

def iterate_files(files):
    """Iterate through files `files`, parse them and concatenate
    the result to be saved as a DataFrame in a feather object (.ftr)
    """
    main = None
    previous_i = 0
    current_i = 0
    i = 0
    n = 0
    cnt = 0
    dict_articles = []
    
    for index, row in files.iterrows():
        try:
            dict_articles.append(parse_XML_article(
                    path = row["article_path"],
                    art_dir = row["article_dir"], 
                    title = row["article_name"],
                    index = row["index"]))
        except Exception as e:
            print(e.message, e.args)
            continue
        # Each X, save the file in a .ftr
        if (i == 10000):
            current_i = current_i + i
            file_path = main_path[0]+"/data/processed/processed_articles/processed_data_test"+str(previous_i)+"_"+str(current_i)+".ftr"
            main = pd.DataFrame(list(chain.from_iterable(dict_articles)))
            #main = pd.DataFrame(dict_articles)
            return(main)
            main.to_feather(file_path)
            main = None
            previous_i = current_i
            i = 0
        # Each 1000 files, print the progress
        if (i % 2000 == 0):
            print("Files parsed: "+str(2000*cnt))
            print("Current file: "+row["article_name"]+"\n")
            cnt += 1
        i += 1

In [11]:
def iterate_metadata(files):
    """Iterate through files `files`, parse them and concatenate
    the result to be saved as a DataFrame in a feather object (.ftr)
    """
    main = None
    previous_i = 0
    current_i = 0
    i = 0
    n = 0
    cnt = 0
    dict_metadata = {}
    
    for index, row in files.iterrows():
        try:
            dict_metadata[index] =
                parse_XML_metadata(
                    path = row["metadata_path"],
                    met_dir = row["metadata_dir"], 
                    title = row["metadata_name"],
                    index = index)
        except Exception:
            continue
        # Each X, save the file in a .ftr
        if (i == 1000):
            current_i = current_i + i
            file_path = main_path[0]+"/data/processed/processed_metadata/processed_metadata_test"+str(previous_i)+"_"+str(current_i)+".ftr"
            main = pd.DataFrame.from_dict(dict_metadata).T.reset_index()
            main.to_feather(file_path)
            #return(main)
            main = None
            previous_i = current_i
            i = 0
        # Each 100 files, print the progress
        if (i % 100 == 0):
            print("Files parsed: "+str(50*cnt))
            print("Current file: "+row["metadata_name"]+"\n")
            cnt += 1
        i += 1

In [115]:
#article_names.reset_index(inplace=True)
iterate_files(article_names)

Files parsed: 0
Current file: DDD_110637387_0004_articletext.xml

Files parsed: 2000
Current file: DDD_010537363_0096_articletext.xml

Files parsed: 4000
Current file: DDD_010475606_0001_articletext.xml

Files parsed: 6000
Current file: DDD_010553991_0005_articletext.xml

Files parsed: 8000
Current file: DDD_010865369_0055_articletext.xml



Unnamed: 0,type,text,article_name,date,index,filepath,dir
0,title,,DDD_110637387_0004_articletext.xml,1950-04-27,0,../data/1950/04-27/DDD_110637387/DDD_110637387...,../data/1950/04-27/DDD_110637387
1,p,"Buiten Sociëteit „Het Park"" Dansgelegenheid vo...",DDD_110637387_0004_articletext.xml,1950-04-27,0,../data/1950/04-27/DDD_110637387/DDD_110637387...,../data/1950/04-27/DDD_110637387
2,title,Aanvaring,DDD_110637387_0039_articletext.xml,1950-04-27,1,../data/1950/04-27/DDD_110637387/DDD_110637387...,../data/1950/04-27/DDD_110637387
3,p,Hongkong Het Chinese 1 communistische persbure...,DDD_110637387_0039_articletext.xml,1950-04-27,1,../data/1950/04-27/DDD_110637387/DDD_110637387...,../data/1950/04-27/DDD_110637387
4,title,Dokter Raatgever vrijgesproken,DDD_110637387_0043_articletext.xml,1950-04-27,2,../data/1950/04-27/DDD_110637387/DDD_110637387...,../data/1950/04-27/DDD_110637387
...,...,...,...,...,...,...,...
26470,p,">. * , * -* i, ■ ' ° j ' r ''' 'i -• '••• . • ...",DDD_011155306_0081_articletext.xml,1950-03-08,9998,../data/1950/03-08/DDD_011155306/DDD_011155306...,../data/1950/03-08/DDD_011155306
26471,title,Korte Gemeenteraadsvergadering De heer Kooy wi...,DDD_011155306_0027_articletext.xml,1950-03-08,9999,../data/1950/03-08/DDD_011155306/DDD_011155306...,../data/1950/03-08/DDD_011155306
26472,p,HILVERSUM. — Precies een uur heeft de gemeente...,DDD_011155306_0027_articletext.xml,1950-03-08,9999,../data/1950/03-08/DDD_011155306/DDD_011155306...,../data/1950/03-08/DDD_011155306
26473,title,Hilversum vaardigt geen spelers af voor provin...,DDD_011155306_0020_articletext.xml,1950-03-08,10000,../data/1950/03-08/DDD_011155306/DDD_011155306...,../data/1950/03-08/DDD_011155306


In [22]:
iterate_metadata(metadata_files)

Files parsed: 0
Current file: DDD:ddd:110637387:mpeg21.didl.xml.gz.xml



KeyboardInterrupt: 

## Text selection model

## Ingest parsed files previously saved

Once we parse all the files present in the example `data-1950` folder, we produce 65 files containing the parsed original data into a format which is more easily readable by a machine. The total weight of the files is 65*10=650MB which is a 5x reduction from the original size of the dataset.

In [18]:
# https://www.sbert.net/docs/
from sentence_transformers import SentenceTransformer, LoggingHandler, util

# These are the pure transformers from huggingface
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# For saving
import pickle
import csv

# Set searborn settings
rcParams['figure.figsize'] = 12, 8

# Set fixed random seed
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Find GPU on device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

2020-09-09 16:13:54 - PyTorch version 1.6.0 available.


## Read saved files

#### Retrieve all the names of the ftr files saved

In [12]:
ftr_articles = iterate_directory("/data/processed/processed_articles",".ftr")
df_articles = pd.DataFrame(ftr_articles)
df_articles.rename({'article_name': 'ftr_name', 'article_path': 'ftr_path', 'article_dir': 'ftr_dir'}, axis=1, inplace=True)

In [13]:
ftr_metadata = iterate_directory("/data/processed/processed_metadata",".ftr")
df_metadata = pd.DataFrame(ftr_metadata)
df_metadata.rename({'article_name': 'ftr_name', 'article_path': 'ftr_path', 'article_dir': 'ftr_dir'}, axis=1, inplace=True)

#### Retrieve all the content of the files into a list format

Read one ftr file as a test

In [15]:
def iterate_ftr(df):
    list_ftr = []
    for index, row in df.iterrows():
        #to delete - just for test
        if index > 1:
            break
        
        else:
            ftr = pd.read_feather(row["ftr_path"])
            list_ftr.append(ftr)
    
    return(ftr)

In [16]:
df_test_articles = iterate_ftr(df_articles)
df_test_metadata = iterate_ftr(df_metadata)

In [17]:
df_test_metadata.drop(["level_0", "date", "index"], axis=1, inplace=True)
df_test_metadata.rename({"filepath": "metadata_filepath"}, axis=1, inplace=True)

In [18]:
df_test_joined = df_test_articles.merge(df_test_metadata, how='left', on='dir')

#### Now we have (one)a merged file

Although this will be needed to be done recursively, for all files present in the database.

So efficiency is key here!!!

### Find synonym(s) for the key search word(s)

In [20]:
import nl_core_news_lg
import spacy
from tqdm import tqdm
import hdbscan

ModuleNotFoundError: No module named 'nl_core_news_lg'

In [18]:
nlp = nl_core_news_lg.load()

In [35]:
ms = nlp.vocab.vectors.most_similar(np.asarray([nlp.vocab.vectors[nlp.vocab.strings['energie']]]), n=10)
list_synonyms = [nlp.vocab.strings[w] for w in ms[0][0]]

#### Create a list with only the text (paragraphs) and not the other variables

Retrieve all the paragraphs into one single file

In [33]:
df_test_joined

Unnamed: 0,article_name,date,index,filepath,dir,title,p_1,p_2,p_3,{urn:mpeg:mpeg21:2002:02-DIDL-NS}Item,...,metadata_title,metadata_filepath,newspaper_title,newspaper_date,newspaper_city,newspaper_publisher,newspaper_source,newspaper_volume,newspaper_issuenumber,newspaper_language
0,DDD_110637387_0004_articletext.xml,1950-04-27,0,../data/1950/04-27/DDD_110637387/DDD_110637387...,../data/1950/04-27/DDD_110637387,,"Buiten Sociëteit „Het Park"" Dansgelegenheid vo...",,,,...,DDD:ddd:110637387:mpeg21.didl.xml.gz.xml,../data/1950/04-27/DDD_110637387/DDD:ddd:11063...,De West : nieuwsblad uit en voor Suriname,1950-04-27,Paramaribo,W. Kraan,Koninklijke Bibliotheek,41,5087,nl
1,DDD_110637387_0039_articletext.xml,1950-04-27,1,../data/1950/04-27/DDD_110637387/DDD_110637387...,../data/1950/04-27/DDD_110637387,Aanvaring,Hongkong Het Chinese 1 communistische persbure...,,,,...,DDD:ddd:110637387:mpeg21.didl.xml.gz.xml,../data/1950/04-27/DDD_110637387/DDD:ddd:11063...,De West : nieuwsblad uit en voor Suriname,1950-04-27,Paramaribo,W. Kraan,Koninklijke Bibliotheek,41,5087,nl
2,DDD_110637387_0043_articletext.xml,1950-04-27,2,../data/1950/04-27/DDD_110637387/DDD_110637387...,../data/1950/04-27/DDD_110637387,Dokter Raatgever vrijgesproken,Hedenmorgen stonden dokter Raatgever en de hee...,,,,...,DDD:ddd:110637387:mpeg21.didl.xml.gz.xml,../data/1950/04-27/DDD_110637387/DDD:ddd:11063...,De West : nieuwsblad uit en voor Suriname,1950-04-27,Paramaribo,W. Kraan,Koninklijke Bibliotheek,41,5087,nl
3,DDD_110637387_0067_articletext.xml,1950-04-27,3,../data/1950/04-27/DDD_110637387/DDD_110637387...,../data/1950/04-27/DDD_110637387,De Avonturen van Kapitein Rob,"Terwijl het ..Levende Eiland"" met grote snelhe...",,,,...,DDD:ddd:110637387:mpeg21.didl.xml.gz.xml,../data/1950/04-27/DDD_110637387/DDD:ddd:11063...,De West : nieuwsblad uit en voor Suriname,1950-04-27,Paramaribo,W. Kraan,Koninklijke Bibliotheek,41,5087,nl
4,DDD_110637387_0020_articletext.xml,1950-04-27,4,../data/1950/04-27/DDD_110637387/DDD_110637387...,../data/1950/04-27/DDD_110637387,,"TRY AMERICA'S mm ït£jëk iï~££'U""% ■.. «■ i.t:P...",,,,...,DDD:ddd:110637387:mpeg21.didl.xml.gz.xml,../data/1950/04-27/DDD_110637387/DDD:ddd:11063...,De West : nieuwsblad uit en voor Suriname,1950-04-27,Paramaribo,W. Kraan,Koninklijke Bibliotheek,41,5087,nl
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29996,DDD_010950576_0024_articletext.xml,1950-10-26,29996,../data/1950/10-26/DDD_010950576/DDD_010950576...,../data/1950/10-26/DDD_010950576,Jean Dulieu. 1377.,"*'■ 1377,""-.""Daar. Pieter;no*t'nooit een' leeu...",,,,...,DDD:ddd:010950576:mpeg21.didl.xml.gz.xml,../data/1950/10-26/DDD_010950576/DDD:ddd:01095...,Het vrĳe volk : democratisch-socialistisch dag...,1950-10-26,Rotterdam,De Arbeiderspers,Gemeentearchief Rotterdam,6,1661,nl
29997,DDD_010950576_0111_articletext.xml,1950-10-26,29997,../data/1950/10-26/DDD_010950576/DDD_010950576...,../data/1950/10-26/DDD_010950576,"Nieuw Lagerhuis, maar met de eeuwenoude tradities","(Van onze Londense correspondent, \',ï\\£ï **•...",10 Mei is niet alleen voor Nederland een belan...,Churchill verzocht het parlement om stricte ge...,,...,DDD:ddd:010950576:mpeg21.didl.xml.gz.xml,../data/1950/10-26/DDD_010950576/DDD:ddd:01095...,Het vrĳe volk : democratisch-socialistisch dag...,1950-10-26,Rotterdam,De Arbeiderspers,Gemeentearchief Rotterdam,6,1661,nl
29998,DDD_010950576_0063_articletext.xml,1950-10-26,29998,../data/1950/10-26/DDD_010950576/DDD_010950576...,../data/1950/10-26/DDD_010950576,,Gedemobiliseerden vragen Uw aandacht WIE HELPT...,,,,...,DDD:ddd:010950576:mpeg21.didl.xml.gz.xml,../data/1950/10-26/DDD_010950576/DDD:ddd:01095...,Het vrĳe volk : democratisch-socialistisch dag...,1950-10-26,Rotterdam,De Arbeiderspers,Gemeentearchief Rotterdam,6,1661,nl
29999,DDD_010950576_0164_articletext.xml,1950-10-26,29999,../data/1950/10-26/DDD_010950576/DDD_010950576...,../data/1950/10-26/DDD_010950576,„De Wereld heeft geen Wachtkamer” Toneelstuk v...,(Van onze kunstredactie) • „De Wereld heeft ge...,"Het verhaal van de jonge geleerde,, die zich v...",,,...,DDD:ddd:010950576:mpeg21.didl.xml.gz.xml,../data/1950/10-26/DDD_010950576/DDD:ddd:01095...,Het vrĳe volk : democratisch-socialistisch dag...,1950-10-26,Rotterdam,De Arbeiderspers,Gemeentearchief Rotterdam,6,1661,nl


In [23]:
def list_paragraphs(df):
    list_p = []

    for index, row in df.iterrows():
        for i in range(1,df.shape[1]):
            p = "p_"+str(i)
            try:
                if row[p] and row[p] is not None:
                    list_p.append(row[p])
            except KeyError as e:
                continue

    return(list_p)

In [28]:
def list_title(df):
    list_titles = []

    for index, row in df.iterrows():
        try:
            if row["title"] and row["title"] is not None:
                list_titles.append(row["title"])
        except KeyError as e:
            continue

    return(list_titles)

In [29]:
#pars = list_paragraphs(df_test_joined)
titles = list_title(df_test_joined)

In [30]:
titles

['Aanvaring',
 'Dokter Raatgever vrijgesproken',
 'De Avonturen van Kapitein Rob',
 'De gestolen Nobelprijs De man, die Carl von Ossietzky bedroog, wordt ontmaskerd (van onze correspondent) Berlijn, April 1950',
 'Waterstand 28 April 1930',
 'Onafhankelijke republiek?',
 'Vitaminen tegen ouderdom',
 'Hof van Justitie De moord aan de Kwattaweg',
 'Rede Lauriers',
 'Mars door de stad',
 'Rede Lachmon',
 'Groep III St. Joris uitvoering',
 'Directeur van N.O. polder verwacht',
 'Rede Karamat Ali',
 'Dollars voor plan Nickerie en Lelydorp',
 'Rede Soemita',
 'GEEN GROTE EMIGRATIEMOGELIJKHEDEN NAAR SURINAME',
 'Rede Wijngaarde',
 'SARAKREEK GOUDVELDEN N.V.',
 'TELEGRAM DOOR HOOFDBESTUUR SSV AAN U IS VERZONDEN ZONDER MEDEWETEN VAN LEDEN S. ROBLES I DEBIE TH BIERMAN',
 'Meer arrestaties in Hongarije',
 'India en Pakistan II',
 'Voetbal op het Mr. Bronsplein',
 'Een valse mensenvriend.',
 'Surinaamse Zeemans Bond',
 'Tenslotte toch ontmaskerd',
 'Vier milioen slachtoffers',
 "Auto's tegen elkaa

In [47]:
indices = [i for i, x in enumerate(list_sentences) if x == "50"]

In [64]:
test = pd.DataFrame(list_sentences).reset_index()
test.rename({0: 'text'}, axis=1, inplace=True)

In [66]:
test1 = test[test["text"].str.contains("energie")]
test2 = test[test["text"].str.contains("energie|Britain", na=False)]
test1

Unnamed: 0,index,text
122,122,"Het is natuurlijk waar, dat de inzet van mense..."
437,437,Botsing:. — Op de hoek BreukerwegTTCarboonstra...
462,462,(AJvertentie) Rheumatiek ondermijnt Uw hele ge...
726,726,\7 OOR de grote meerderheid der sterren zou me...
731,731,"Professor Auguste Piccard, de beroemde stratos..."
...,...,...
16465,16465,"Men mag met. stelligheid aannemen, dat de rege..."
16501,16501,Vitanol Pillen: . geven levensl. en vern. Uw k...
16603,16603,Ongeveer een jaar geleden heeft het werk van d...
16607,16607,"Vanochtend om 5.30 uur bleek, dat «lechts één ..."


In [55]:
# Save to CSV
test.to_csv(path[0]+"/data"+"/list_sentences.csv",
            sep=",",
            quotechar='"',
            header=["index","text"],
            index=False)

#with open(path[0]+"/data"+"/list_sentences.csv", 'w') as myfile:
#    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
#    wr.writerow(list_sentences)

#with open('list_sentences.pkl', "wb") as fOut:
#    pickle.dump(list_sentences, fOut, protocol=pickle.HIGHEST_PROTOCOL)

In [29]:
# Temp list
sen = list_sentences

### Use the multilingual model pre-trained on 10+ languages

### Play around with `SBERT`

The model is the `distiluse-base-multilingual-cased` model. From (sbert)[https://www.sbert.net/docs/pretrained_models.html]

In [23]:
# Create embeddings
model = SentenceTransformer('distiluse-base-multilingual-cased', device=device)

# Load paragraphs
sentences = sen

#Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

#Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

2020-09-01 14:57:56 - Load pretrained SentenceTransformer: distiluse-base-multilingual-cased
2020-09-01 14:57:56 - Did not find a '/' or '\' in the name. Assume to download model from server.
2020-09-01 14:57:56 - Load SentenceTransformer from folder: /Users/leonardovida/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_distiluse-base-multilingual-cased.zip
2020-09-01 14:57:56 - loading configuration file /Users/leonardovida/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_distiluse-base-multilingual-cased.zip/0_DistilBERT/config.json
2020-09-01 14:57:56 - Model config DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_hidden_states": true,
  "output_past": true,


HBox(children=(FloatProgress(value=0.0, description='Batches', max=2.0, style=ProgressStyle(description_width=…


Sentence: t/„.t„;j.'_„ Gisteren heeft Verleiding... Dagwacnt op één van zyn tochten door de stad alle mogelyke moeite gedaan de parkjes te omzeilen. De verleiding was groot om neer te zinken op één van de banken en te genieten van de zalige zomers aandoende zonneschyn. Laat hy u vertellen, dat hij niet bezweken is, maar manmoedig doorstapte naar het uitgestippelde doel. Maar zomer was het. De kinderen, die met meester of juffrouw wandelden, maakten extra hard lawaai, stampten met de schoenen, zongen en sprongen. De kinderwagens met de jonge spruiten erin stonden in de stadsparken wiel aan wiel. De jonge moeders keuvelden en breiden. Opmerkeiyk hoe goed dat samen gaat. Maar laten we nu stoppen. We hebben vandaag nog meer te doen. Het doel van gistermiddag was een gouden bruiloft, die op komst is. Gouden feest ™™£* straat 149 ni woont het echtpaar K. F. de Dood—Christina Johanna Elders. Vandaag, Woensdag 8 Maart, gaan de bruidsdagen in. En de heuglijke dag, waarop zy vflftig jaar gelede

In [34]:
# Sentence mining from sentence-transformers
sentences = sen[1:40]
paraphrases = util.paraphrase_mining(
    model,
    sentences,
    corpus_chunk_size=20, #len(sentences)
    query_chunk_size=20,
    top_k=20,
    max_pairs=5)

for paraphrase in paraphrases[0:10]:
    score, i, j = paraphrase
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences[i], sentences[j], score))

Maar het waren juist deze geheime gegevens, die Louwers in handen wilde krijgen. Want de Tsjechoslowaakse industrie heeft op dit gebied een reusachtige voorsprong. Of nemen wij Philips! Toen de Duitsers ons land binnenvielen, vertrokken uit Eindhoven 30 auto's, volgeladen met geheime papieren: de productiegeheimen van Philips! Die moesten in veiligheid. De rest was niet belangrijk. Geen mens buiten de vesting van Philips komt over deze geheimen iets te weten. En binnen de vesting is een wetenschappelijk systeem uitgewerkt, dat er voor zorgt, dat de lagere ingenieurs en technici slechts gedeelten van een bepaald productieproces leren kennen, waarmee zij niets kunnen beginnen. Verteld wordt, dat eens een delegatie van Japanse belangstellenden Philips kwam bezoeken. De heren werden „overal" rondgeleid, ze kregen een pracht van een diner aangeboden. Maar de Philipsdirectie vermoedde dat sommigen van hen micro-fototoestellen by zich hadden. Fouilleren kon men de „geëerde gasten" niet. Dus w

In [None]:
#with open('paraphrase_test.pkl', "wb") as fOut:
#    pickle.dump(paraphrase, fOut, protocol=pickle.HIGHEST_PROTOCOL)

In [25]:
# Save embeddings
import pickle

with open('embeddings.pkl', "wb") as fOut:
    pickle.dump({'sentences': sentences, 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

    
    
#Load sentences & embeddings from disc
with open('embeddings.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)
    stored_sentences = stored_data['sentences']
    stored_embeddings = stored_data['embeddings']

Maar het waren juist deze geheime gegevens, die Louwers in handen wilde krijgen. Want de Tsjechoslowaakse industrie heeft op dit gebied een reusachtige voorsprong. Of nemen wij Philips! Toen de Duitsers ons land binnenvielen, vertrokken uit Eindhoven 30 auto's, volgeladen met geheime papieren: de productiegeheimen van Philips! Die moesten in veiligheid. De rest was niet belangrijk. Geen mens buiten de vesting van Philips komt over deze geheimen iets te weten. En binnen de vesting is een wetenschappelijk systeem uitgewerkt, dat er voor zorgt, dat de lagere ingenieurs en technici slechts gedeelten van een bepaald productieproces leren kennen, waarmee zij niets kunnen beginnen. Verteld wordt, dat eens een delegatie van Japanse belangstellenden Philips kwam bezoeken. De heren werden „overal" rondgeleid, ze kregen een pracht van een diner aangeboden. Maar de Philipsdirectie vermoedde dat sommigen van hen micro-fototoestellen by zich hadden. Fouilleren kon men de „geëerde gasten" niet. Dus w

### Playing around with BERTje

In [18]:
from transformers import BertTokenizer, BertModel


tokenizer = BertTokenizer.from_pretrained("wietsedv/bert-base-dutch-cased")
model = BertModel.from_pretrained("wietsedv/bert-base-dutch-cased")

2020-09-02 11:23:32 - Lock 5397104528 acquired on /Users/leonardovida/.cache/torch/transformers/75d9be4cc7910048b3bdd477c435ffc46330193705f74eaf9a4f375cd3be28b2.1e00a56207196ed1759c49bdd1fa93c2fb20273d59fabb0c4c8092f7beb773c2.lock
2020-09-02 11:23:32 - https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/vocab.txt not found in cache or force_download set to True, downloading to /Users/leonardovida/.cache/torch/transformers/tmppdbo_bik


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=241440.0, style=ProgressStyle(descripti…

2020-09-02 11:23:33 - storing https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/vocab.txt in cache at /Users/leonardovida/.cache/torch/transformers/75d9be4cc7910048b3bdd477c435ffc46330193705f74eaf9a4f375cd3be28b2.1e00a56207196ed1759c49bdd1fa93c2fb20273d59fabb0c4c8092f7beb773c2
2020-09-02 11:23:33 - creating metadata file for /Users/leonardovida/.cache/torch/transformers/75d9be4cc7910048b3bdd477c435ffc46330193705f74eaf9a4f375cd3be28b2.1e00a56207196ed1759c49bdd1fa93c2fb20273d59fabb0c4c8092f7beb773c2
2020-09-02 11:23:33 - Lock 5397104528 released on /Users/leonardovida/.cache/torch/transformers/75d9be4cc7910048b3bdd477c435ffc46330193705f74eaf9a4f375cd3be28b2.1e00a56207196ed1759c49bdd1fa93c2fb20273d59fabb0c4c8092f7beb773c2.lock
2020-09-02 11:23:33 - loading file https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/vocab.txt from cache at /Users/leonardovida/.cache/torch/transformers/75d9be4cc7910048b3bdd477c435ffc46330193705f74




2020-09-02 11:23:34 - Lock 5357248464 acquired on /Users/leonardovida/.cache/torch/transformers/6702c5c53edb76b65d71f73ff2d9811ba62f16257ea58e36dedceffd71290a6a.1a78bd120fe46d78b55efa59f4ffa1dafcc9242743ab9fd6629d1b56672c9119.lock
2020-09-02 11:23:34 - https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/config.json not found in cache or force_download set to True, downloading to /Users/leonardovida/.cache/torch/transformers/tmpqyn8q3s_


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…

2020-09-02 11:23:34 - storing https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/config.json in cache at /Users/leonardovida/.cache/torch/transformers/6702c5c53edb76b65d71f73ff2d9811ba62f16257ea58e36dedceffd71290a6a.1a78bd120fe46d78b55efa59f4ffa1dafcc9242743ab9fd6629d1b56672c9119
2020-09-02 11:23:34 - creating metadata file for /Users/leonardovida/.cache/torch/transformers/6702c5c53edb76b65d71f73ff2d9811ba62f16257ea58e36dedceffd71290a6a.1a78bd120fe46d78b55efa59f4ffa1dafcc9242743ab9fd6629d1b56672c9119
2020-09-02 11:23:34 - Lock 5357248464 released on /Users/leonardovida/.cache/torch/transformers/6702c5c53edb76b65d71f73ff2d9811ba62f16257ea58e36dedceffd71290a6a.1a78bd120fe46d78b55efa59f4ffa1dafcc9242743ab9fd6629d1b56672c9119.lock
2020-09-02 11:23:34 - loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/config.json from cache at /Users/leonardovida/.cache/torch/transformers/6702c5c53edb76b65d71f73ff2d




2020-09-02 11:23:35 - Lock 5390180048 acquired on /Users/leonardovida/.cache/torch/transformers/e5754f612ca0f16edba5b775fdddba806751f5e4b87c5e7f16cc0c8d8d17df4d.b7c03627733fd0712f078a4d3a31ad964550f50a6113efdf874ecbcf5ddf6b53.lock
2020-09-02 11:23:35 - https://cdn.huggingface.co/wietsedv/bert-base-dutch-cased/pytorch_model.bin not found in cache or force_download set to True, downloading to /Users/leonardovida/.cache/torch/transformers/tmp1wjgs76_


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=438869143.0, style=ProgressStyle(descri…

2020-09-02 11:24:36 - storing https://cdn.huggingface.co/wietsedv/bert-base-dutch-cased/pytorch_model.bin in cache at /Users/leonardovida/.cache/torch/transformers/e5754f612ca0f16edba5b775fdddba806751f5e4b87c5e7f16cc0c8d8d17df4d.b7c03627733fd0712f078a4d3a31ad964550f50a6113efdf874ecbcf5ddf6b53
2020-09-02 11:24:36 - creating metadata file for /Users/leonardovida/.cache/torch/transformers/e5754f612ca0f16edba5b775fdddba806751f5e4b87c5e7f16cc0c8d8d17df4d.b7c03627733fd0712f078a4d3a31ad964550f50a6113efdf874ecbcf5ddf6b53
2020-09-02 11:24:36 - Lock 5390180048 released on /Users/leonardovida/.cache/torch/transformers/e5754f612ca0f16edba5b775fdddba806751f5e4b87c5e7f16cc0c8d8d17df4d.b7c03627733fd0712f078a4d3a31ad964550f50a6113efdf874ecbcf5ddf6b53.lock
2020-09-02 11:24:36 - loading weights file https://cdn.huggingface.co/wietsedv/bert-base-dutch-cased/pytorch_model.bin from cache at /Users/leonardovida/.cache/torch/transformers/e5754f612ca0f16edba5b775fdddba806751f5e4b87c5e7f16cc0c8d8d17df4d.b7c036




2020-09-02 11:24:40 - All model checkpoint weights were used when initializing BertModel.

2020-09-02 11:24:40 - All the weights of BertModel were initialized from the model checkpoint at wietsedv/bert-base-dutch-cased.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use BertModel for predictions without further training.
