## (1) Data extraction

Extract the text contained in the JSON files and store it in a pandas DataFrame for further pre-processing.

In [25]:
%%capture
from tqdm.notebook import tqdm
tqdm.pandas()
import json
import argparse
import pandas as pd
import numpy as np

In [29]:
# Check which keys occur in each JSON file and how many documents it contains
for id_file in tqdm(range(1, 14)):
    file_path = "./../Data/Original_data/" + str(id_file) + ".json"

    # Read with an explicit UTF-8 encoding so the result does not depend on the
    # platform's default locale encoding; the file is closed as soon as it is parsed.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    # Union of the keys of every document in this file
    myset = set()
    for doc in data:
        myset.update(doc.keys())

    print("File {} - Keys: {} - Documents: {}".format(id_file, myset, len(data)))

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

File 1 - Keys: {'text', 'uri'} - Documents: 11699
File 2 - Keys: {'text', 'uri'} - Documents: 32072
File 3 - Keys: {'text', 'uri'} - Documents: 8225
File 4 - Keys: {'text', 'uri'} - Documents: 77258
File 5 - Keys: {'text', 'uri'} - Documents: 46079
File 6 - Keys: {'text', 'uri'} - Documents: 28106
File 7 - Keys: {'text', 'uri'} - Documents: 27391
File 8 - Keys: {'text', 'uri'} - Documents: 24143
File 9 - Keys: {'text', 'uri'} - Documents: 22223
File 10 - Keys: {'text', 'uri'} - Documents: 20979
File 11 - Keys: {'text', 'uri'} - Documents: 57160
File 12 - Keys: {'text', 'uri'} - Documents: 85900
File 13 - Keys: {'text', 'uri'} - Documents: 793



In [30]:
# Collect the first 'text' entry of every document across all JSON files
rows = []
for id_file in tqdm(range(1, 14)):
    file_path = "./../Data/Original_data/" + str(id_file) + ".json"

    print("Loading file {}...".format(id_file))
    # Explicit UTF-8 so decoding does not depend on the platform's default encoding
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)  # data is a list of dict of the form: {'text':['...'], 'uri':['...']}

    # Both placeholders are now present, so the file id is actually reported
    print("Extracting text from {} documents in file {}...".format(len(data), id_file))
    for doc in data:
        text = doc.get('text')  # Get the text of the current doc
        # Skip documents whose 'text' is missing or empty; text[0] would raise
        # IndexError on an empty list.
        if text:
            rows.append({'Text': text[0]})

# Create dataframe (the 'Text' column is declared explicitly so it exists even if no rows were collected)
df = pd.DataFrame(rows, columns=['Text'])
df

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

Loading file 1...
Extracting text from 11699 documents in file...
Loading file 2...
Extracting text from 32072 documents in file...
Loading file 3...
Extracting text from 8225 documents in file...
Loading file 4...
Extracting text from 77258 documents in file...
Loading file 5...
Extracting text from 46079 documents in file...
Loading file 6...
Extracting text from 28106 documents in file...
Loading file 7...
Extracting text from 27391 documents in file...
Loading file 8...
Extracting text from 24143 documents in file...
Loading file 9...
Extracting text from 22223 documents in file...
Loading file 10...
Extracting text from 20979 documents in file...
Loading file 11...
Extracting text from 57160 documents in file...
Loading file 12...
Extracting text from 85900 documents in file...
Loading file 13...
Extracting text from 793 documents in file...



Unnamed: 0,Text
0,"Americas Headquarters: Cisco Systems, Inc., 17..."
1,For the latest version of the Cisco Small Busi...
2,"WebEx Meeting Center User Guide For Hosts, Pr..."
3,78-4019959-01 Rev D Prisma II 1550 nm SuperQA...
4,Release Notes for Cisco RV130/RV130W Firmware ...
...,...
442014,Cisco Unified Communications System for IP Tel...
442015,Cisco Unified Communications System for IP Tel...
442016,Cisco Unified Communications System for IP Tel...
442017,Cisco Unified Communications System for IP Tel...


## (2) Data Processing

In order to use "create_pretraining_data.py" from the BERT repository, the input must be a plain text file with one sentence per line (it is important that these be actual sentences for the "next sentence prediction" task). The BERT authors advise performing sentence segmentation with an off-the-shelf NLP toolkit such as spaCy.

### 2.1. Data Cleaning

In [None]:
# TODO: analyze all special characters in the texts


In [32]:
#df.Text = df.Text.replace('\s+', ' ', regex=True)  # Remove duplicate spaces

### 2.2 Sentence Segmentation

In [None]:
# Sentence segmentation with spacy
import spacy
nlp = spacy.load('en_core_web_sm')
# `contents` is not defined in any visible cell, so bind it explicitly here.
# NOTE(review): the first extracted document is used as the sample text — confirm
# this is the intended input.
contents = df.Text.iloc[0]
doc = nlp(contents)
sentences = list(doc.sents)
# Print each detected sentence followed by a blank line
for sentence in sentences:
    print(str(sentence)+'\n')

In [None]:
# Sentence segmentation with nltk
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')  # tokenizer data fetched for sent_tokenize
# `contents` is not defined in any visible cell, so bind it explicitly here.
# NOTE(review): the first extracted document is used as the sample text — confirm
# this is the intended input.
contents = df.Text.iloc[0]
sentences = sent_tokenize(contents)
# Print each detected sentence followed by a blank line
for sentence in sentences:
    print(sentence+'\n')