# Generating the working dataset from the CLDW

#### **Generating the working dataset from the CLDW**

Clone (download) the `datasets` repo

In [1]:
# Shell command: clone the SpaceTimeNarratives `datasets` repository into ./datasets
!git clone https://github.com/SpaceTimeNarratives/datasets.git

Cloning into 'datasets'...
remote: Enumerating objects: 16, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 16 (delta 2), reused 12 (delta 1), pack-reused 0[K
Unpacking objects: 100% (16/16), 4.70 MiB | 9.79 MiB/s, done.


Change into the `datasets` directory

In [2]:
cd datasets

/content/datasets


Import the necessary libraries and download the required NLTK resources

In [3]:
import os
import re
import string
import nltk
import shutil
import spacy
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Download the NLTK resources this notebook relies on, in a fixed order.
for nltk_resource in ('wordnet', 'punkt', 'stopwords'):
  nltk.download(nltk_resource)

# English stop-word list and the lemmatiser shared by the cells below.
stop_words = stopwords.words('english')
lemma = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Extract the CLDW zipped file 

In [4]:
# Base name of the corpus archive; later cells list files from a folder of this name
# (assumes the zip unpacks into a directory with the same name -- TODO confirm).
data_dir = "LD80 - Full LD Corpus with geoparsing (v5)"
shutil.unpack_archive(filename=f"{data_dir}.zip")

### Setting up the extraction pipeline

In [5]:
pip -q install -r requirements.txt

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m917.6/917.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m769.7/769.7 kB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m89.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m668.8/668.8 kB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.9/51.9 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.5/97.5 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fo

In [6]:
# Execute functions.py and load the names it defines into the notebook namespace.
# NOTE(review): `get_inflections` and `BG_COLOR`, used in later cells, are not defined
# in this notebook and presumably come from this script -- confirm.
%run functions.py

In [7]:
def _read_lines(path, *args, **kwargs):
  """Return every line of `path` with surrounding whitespace stripped.

  Extra positional/keyword arguments are forwarded to `open()`. The file is
  read inside a `with` block so its handle is closed as soon as the lines
  have been collected.
  """
  with open(path, *args, **kwargs) as handle:
    return [line.strip() for line in handle]

# Number of leading lines skipped in each sentiment-lexicon file
# (presumably the header/licence preamble of the word lists -- TODO confirm).
LEXICON_SKIP_LINES = 35

# Get the list of placenames and geonouns
# str.title() also capitalises the possessive ("'S"), so that is reverted to "'s".
place_names = [name.title().replace("'S", "'s") for name in _read_lines('LD_placenames.txt')]
place_names += [name.upper() for name in place_names] # retain the upper case versions
geonouns = get_inflections(_read_lines('geo_feature_nouns.txt'))

# Get the list of positive and negative words from the sentiment lexicon
pos_words = _read_lines('positive-words.txt', 'r', encoding='latin-1')[LEXICON_SKIP_LINES:]
neg_words = _read_lines('negative-words.txt', 'r', encoding='latin-1')[LEXICON_SKIP_LINES:]

# Create a blank spacy English model
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")

# Define the patterns for the EntityRuler: one phrase pattern per place name (PLNAME),
# geonoun (GEONOUN), positive word (+EMOTION) and negative word (-EMOTION).
# Place names are de-duplicated with set() before the patterns are built.
patterns = [{"label": "PLNAME", "pattern": plname} for plname in set(place_names)]
patterns += [{"label": "GEONOUN", "pattern": noun} for noun in geonouns]
patterns += [{"label": "+EMOTION", "pattern": word} for word in pos_words]
patterns += [{"label": "-EMOTION", "pattern": word} for word in neg_words]

ruler.add_patterns(patterns)

## Extract and tag the paragraphs

In [8]:
from spacy import displacy
# Rendering options keyed by 'colors'; BG_COLOR is not defined in this notebook
# (presumably provided by functions.py -- confirm).
options = dict(colors=BG_COLOR)

Define the `get_paragraphs()` function that cleans the text and returns a list of text paragraphs

In [9]:
def get_paragraphs(input_text):
  """Return the cleaned text of every <p> element found in `input_text`.

  Each paragraph is serialised back to markup, has its tags removed with a
  regular expression, gets a few characters normalised and finally has every
  run of whitespace collapsed to a single space. Paragraphs are returned in
  document order as a list of strings.

  NOTE(review): tags are stripped from the serialised markup rather than via
  the parser's text extraction, so character entities may remain escaped in
  the result -- confirm whether that is intended.
  """
  # Non-greedy match for anything that looks like an XML/HTML tag
  tag_re = re.compile(r'<.*?>')

  # Literal substitutions applied, in this order, before whitespace is collapsed
  substitutions = (('\n', ' '), ('\t', ' '), ('∫', 's'), ("\'", "'"))

  def _clean(p_tag):
    # Remove the markup, normalise characters, then squeeze whitespace
    cleaned = tag_re.sub('', str(p_tag))
    for old, new in substitutions:
      cleaned = cleaned.replace(old, new)
    return re.sub(r'\s+', ' ', cleaned)

  soup = BeautifulSoup(input_text, 'html.parser')
  return [_clean(p_tag) for p_tag in soup.find_all('p')]

Build the `paragraphs` list by
1. Reading all `.xml` files in the extracted corpus folder (`data_dir`)
2. Applying the `get_paragraphs()` function to return the paragraphs in each file.
3. For each paragraph, storing the `fileId`, `paraId`, `text`, and `word_count`

In [14]:
# Collect one record per paragraph of every .xml file in `data_dir`.
# fileId is the file's position in the sorted directory listing (entries that are
# not .xml files still advance the counter); paraId is the paragraph's position
# within its file; word_count is the number of whitespace-separated tokens.
paragraphs = []
for fileId, filename in enumerate(sorted(os.listdir(data_dir))):
  if filename.endswith(".xml"):
    # Read inside a `with` block so each file handle is closed before the next file is opened
    with open(f'{data_dir}/{filename}', 'r', encoding='utf8') as xml_file:
      text = xml_file.read()
    for paraId, paragraph in enumerate(get_paragraphs(text)):
      paragraphs.append({'fileId':fileId,'paraId':paraId, 'text':paragraph, 'word_count':len(paragraph.split())})



Convert the list of paragraph records to a pandas DataFrame for viewing

In [None]:
# One row per paragraph record; the dictionary keys become the columns.
data = pd.DataFrame.from_dict(paragraphs, orient='columns')

Update the `data` dataframe:
1. Go through all the paragraphs
2. Extract all PLNAME, GEONOUN and +/-EMOTION entities
3. Store the list of entities for each tag as well as their counts

In [19]:
def pre_process_text(text):
  """Tokenise `text` and return its lemmatised, non-stop-word, non-punctuation tokens.

  Steps, in order: tokenise with `word_tokenize`, drop tokens whose lower-cased
  form is in `stop_words`, lemmatise what is left with `lemma`, and discard any
  token that consists only of punctuation characters.

  The punctuation check is done character by character. A plain
  `token in string.punctuation` is a substring test, so multi-character
  punctuation tokens (for example "``", "''", "..." or "--") would slip
  through and be counted as words.
  """
  punctuation = set(string.punctuation)
  tokens = []
  for word in word_tokenize(text):
    if word.lower() in stop_words:
      continue
    token = lemma.lemmatize(word)
    # Skip tokens made up solely of punctuation characters
    if all(char in punctuation for char in token):
      continue
    tokens.append(token)
  return tokens
# Get the entities carrying a given tag
def get_entities(text, tag):
  """Run `nlp` over `text` and return its entities whose label equals `tag`.

  Each match is a tuple `(entity, start_char, end_char)`; the first item is
  the entity object itself, not its text.
  """
  matches = []
  for ent in nlp(text).ents:
    if ent.label_ == tag:
      matches.append((ent, ent.start_char, ent.end_char))
  return matches
  
def add_entity_count(data_df, tag):
  """Return `(entities, counts)` for the `tag` entities of every text in `data_df['text']`.

  `entities` holds one list of `get_entities()` results per text and `counts`
  holds the length of each of those lists, both in the order of the texts.
  """
  ents = list(map(lambda text: get_entities(text, tag), data_df['text']))
  counts = list(map(len, ents))
  return ents, counts

# For every tag, add a column with the matched entities and a column with their count.
# NOTE(review): each add_entity_count() call runs `nlp` over every paragraph again.
for tag, ents_col, cnt_col in (
    ('PLNAME', 'plnames', 'pn_cnts'),
    ('GEONOUN', 'geonouns', 'gn_cnts'),
    ('+EMOTION', 'pos_emotions', 'pos_cnts'),
    ('-EMOTION', 'neg_emotions', 'neg_cnts'),
):
  data[ents_col], data[cnt_col] = add_entity_count(data, tag)

# Sentiment score: (positive - negative) matches divided by the number of
# tokens left after pre-processing the paragraph text.
data['sentiment_score'] = (
    (data['pos_cnts'] - data['neg_cnts'])
    / data['text'].apply(pre_process_text).apply(len)
)

List all occurrences of entities (place names, geonouns, emotions) with their file and paragraph IDs

In [20]:
# Define the `add_tag` function to attach the tag to each entity from a given list
def add_tag(x_list, tag):
  """Pair every item of `x_list` with `tag`, returning a list of `(item, tag)` tuples."""
  tagged = []
  for x in x_list:
    tagged.append((x, tag))
  return tagged

In [None]:
# Flatten the per-paragraph entity lists into one record per entity occurrence,
# labelled with its tag and the file/paragraph it was found in.
entities_df = data[['plnames','geonouns', 'pos_emotions', 'neg_emotions','fileId', 'paraId']]
ent_list = []
# Walk the six columns in parallel, i.e. one paragraph (row) per iteration
for plns, gns, pos, neg, fId, pId in zip(*(entities_df[col] for col in entities_df.columns)):
  ents = (add_tag(plns,'PLNAME') + add_tag(gns,'GEONOUN')
          + add_tag(pos,'+EMOTION') + add_tag(neg,'-EMOTION'))
  # Each tagged item is ((entity, start_char, end_char), tag)
  ent_list.extend(
      {'entity': entity, 'start_char': start_char, 'end_char': end_char,
       'fileId': fId, 'paraId': pId, 'tag': tag}
      for (entity, start_char, end_char), tag in ents
  )

ent_list_df = pd.DataFrame.from_dict(ent_list)

In [31]:
from collections import defaultdict

# Group the entity occurrences by tag: ent_list[tag] is a list of records, each holding
# the entity, its character span and the file/paragraph it was found in.
entities_df = data[['plnames','geonouns', 'pos_emotions', 'neg_emotions','fileId', 'paraId']]
ent_list = defaultdict(list)
# Walk the six columns in parallel, i.e. one paragraph (row) per iteration
for plns, gns, pos, neg, fId, pId in zip(*(entities_df[col] for col in entities_df.columns)):
  # Tags are visited in a fixed order within each paragraph
  for tag, found in (('PLNAME', plns), ('GEONOUN', gns),
                     ('+EMOTION', pos), ('-EMOTION', neg)):
    for ent in found:
      ent_list[tag].append({'entity': ent[0], 'start_char':ent[1], 'end_char':ent[2],
                            'fileId':fId, 'paraId':pId})

# Generate dataframes for all entities, as (tag, dataframe) pairs
ent_dfs = [(tag, pd.DataFrame.from_dict(records)) for tag, records in ent_list.items()]

Export to Excel and JSON

In [40]:
# Write the paragraph table and one sheet per tag to a single workbook;
# each tag's table is also saved as a JSON file named after the lower-cased tag.
with pd.ExcelWriter('paragraph_counts_v1.xlsx') as writer:
  data.to_excel(writer, sheet_name='paragraphs')
  for tag, ent_df in ent_dfs:
    ent_df.to_excel(writer, sheet_name=tag)
    json_path = f"{tag.lower()}.json"
    # default_handler=str serialises values that are not natively JSON-serialisable
    ent_df.to_json(json_path, default_handler=str, orient="records")