In [1]:
from src.stanza_spacy_compare import *
from nltk.probability import FreqDist
from functools import reduce
from tqdm import tqdm
import pandas as pd
import pickle
import stanza
import spacy
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path used to store csv files requested for the project.
CSV_TEXT_PATH = 'data/part2/texts.csv'
CSV_SHARED_SENTENCE_PATH = 'data/part2/shared_sentences.csv'

## Data collection from scratch

This part of the notebook loading text files using the requested function (create_textset).

It also apply spacy and stanza on each text.

We advise you to stay with a small N value because stanza processing is very slow.

Further in the notebook there is a section wich load precomputed text from "sm" sample

In [3]:
# Which folder to use to load text from files.
TEXT_FOLDER_PATH = 'data/sm/pages'
# Number of file/text to load.
N = 10
# Name of spacy model to use (https://spacy.io/models)
SPACY_MODEL_NAME = 'en_core_web_sm'
# Name of stanza model to use (https://stanfordnlp.github.io/stanza/available_models.html)
STANZA_MODEL_NAME = 'en' 
# As stanza model is really slow we reduce the number of component.
STANZA_MODEL_PROCESSORS = 'tokenize,pos,lemma,depparse'

### Load spacy and stanza pipelines

In [4]:
sp = spacy.load(SPACY_MODEL_NAME)

In [5]:
st = stanza.Pipeline(STANZA_MODEL_NAME, processors=STANZA_MODEL_PROCESSORS)

2023-05-16 21:48:21 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 54.0MB/s]                    
2023-05-16 21:48:22 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2023-05-16 21:48:22 INFO: Using device: cpu
2023-05-16 21:48:22 INFO: Loading: tokenize
2023-05-16 21:48:22 INFO: Loading: pos
2023-05-16 21:48:22 INFO: Loading: lemma
2023-05-16 21:48:22 INFO: Loading: depparse
2023-05-16 21:48:23 INFO: Done loading processors!


### Load texts

In [6]:
# Use create_textset to create a dataframe with text from files
text_df = pd.DataFrame(create_textset(TEXT_FOLDER_PATH, N), columns=['text'])
text_df.head()

Unnamed: 0,text
0,Arthur Hamilton Gibbs (9 March 1888 – 24 May 1...
1,Akkamappettai Paramasivan Nagarajan (24 Februa...
2,Aasif Sheikh (Nepali: आसिफ शेख; born 22 Januar...
3,Adam Morris (also known as Wayne Morris) is a ...
4,"Adriana Farmiga ( far-MEE-gə; born July 17, 19..."


### Apply Spacy

In [7]:
text_df['spacy'] = text_df['text'].progress_apply(sp)

100%|██████████| 10/10 [00:01<00:00,  7.47it/s]


### Apply Stanza

In [8]:
text_df['stanza'] = text_df['text'].progress_apply(st)

100%|██████████| 10/10 [01:29<00:00,  8.92s/it]


### Save text DataFrame in csv file

In [9]:
text_df.to_csv(CSV_TEXT_PATH, index=False)
pd.read_csv(CSV_TEXT_PATH).head()

Unnamed: 0,text,spacy,stanza
0,Arthur Hamilton Gibbs (9 March 1888 – 24 May 1...,Arthur Hamilton Gibbs (9 March 1888 – 24 May 1...,"[\n [\n {\n ""id"": 1,\n ""text"": ""..."
1,Akkamappettai Paramasivan Nagarajan (24 Februa...,Akkamappettai Paramasivan Nagarajan (24 Februa...,"[\n [\n {\n ""id"": 1,\n ""text"": ""..."
2,Aasif Sheikh (Nepali: आसिफ शेख; born 22 Januar...,Aasif Sheikh (Nepali: आसिफ शेख; born 22 Januar...,"[\n [\n {\n ""id"": 1,\n ""text"": ""..."
3,Adam Morris (also known as Wayne Morris) is a ...,Adam Morris (also known as Wayne Morris) is a ...,"[\n [\n {\n ""id"": 1,\n ""text"": ""..."
4,"Adriana Farmiga ( far-MEE-gə; born July 17, 19...","Adriana Farmiga ( far-MEE-gə; born July 17, 19...","[\n [\n {\n ""id"": 1,\n ""text"": ""..."


## Data collection from sm

In [10]:
# Which sample to loads.
SAMPLE_NAME = 'sm'
# Which type of text to use. "abstract" or "page"
TEXT_KEY = 'page'

In [11]:
# Use create_textset to create a dataframe with text from files
text_df = pd.DataFrame(
    [(doc[TEXT_KEY].text, doc[TEXT_KEY]) for doc in pickle.loads(Path(f'data/{SAMPLE_NAME}/spacy.pkl').read_bytes())], 
    columns=['text', 'spacy']
)
text_df.head()

Unnamed: 0,text,spacy
0,Jason Edward Feddy (born 22 February 1966) is ...,"(Jason, Edward, Feddy, (, born, 22, February, ..."
1,Robert Scarlett (born 14 January 1979) is a Ja...,"(Robert, Scarlett, (, born, 14, January, 1979,..."
2,"Bahram Sadeghi (Persian: بهرام صادقی, romanize...","(Bahram, Sadeghi, (, Persian, :, بهرام, صادقی,..."
3,Nadav Asher Eyal (Hebrew: נדב אשר איל; born M...,"(Nadav, Asher, Eyal, , (, Hebrew, :, נדב, אשר..."
4,David Nthubu Koloane (5 June 1938 – 30 June 20...,"(David, Nthubu, Koloane, (, 5, June, 1938, –, ..."


In [12]:
text_df['stanza'] = [doc[TEXT_KEY] for doc in pickle.loads(Path(f'data/{SAMPLE_NAME}/stanza.pkl').read_bytes())]

## Sentence segmentation

### Compute len of sentences for both library

In [13]:
# Apply compute_sent_len and store results in text_df new columns
text_df[['sp_sent_len', 'st_sent_len']] = text_df.progress_apply(lambda x: compute_sent_len(x[1], x[2]), axis=1).tolist()

100%|██████████| 1000/1000 [00:00<00:00, 25001.37it/s]


In [14]:
text_df.describe()

Unnamed: 0,sp_sent_len,st_sent_len
count,1000.0,1000.0
mean,27.694,29.993
std,40.831747,42.627504
min,1.0,1.0
25%,8.0,9.0
50%,16.0,19.0
75%,32.0,34.0
max,591.0,592.0


### Compute shared sentences for both library

In [15]:
# Apply compute_shared_sentences, reduce results and store it in a Dataframe
shared_sentences = text_df.progress_apply(lambda x: compute_shared_sentences(x[1], x[2]), axis=1)
shared_sentence_df = pd.DataFrame(reduce(lambda a, b: a + b, shared_sentences, []), columns=['shared_sentence'])
shared_sentence_df.head()

100%|██████████| 1000/1000 [00:00<00:00, 2147.89it/s]


Unnamed: 0,shared_sentence
0,"""By the time Jason Feddy starts, [its] not jus..."
1,A review by the New York Times described Feddy...
2,"An affable sort of a guy with a soft, reedy sp..."
3,"The band opened to rave reviews: ""We included ..."
4,His mother was (and still is) an active member...


In [16]:
# Check how many document doesn't share sentence
text_df['shared_sentences_len'] = pd.Series(shared_sentences).apply(len)
text_df.describe()

Unnamed: 0,sp_sent_len,st_sent_len,shared_sentences_len
count,1000.0,1000.0,1000.0
mean,27.694,29.993,16.068
std,40.831747,42.627504,25.736389
min,1.0,1.0,0.0
25%,8.0,9.0,3.0
50%,16.0,19.0,9.0
75%,32.0,34.0,20.0
max,591.0,592.0,352.0


In [17]:
text_df[text_df['shared_sentences_len'] == text_df['sp_sent_len']]

Unnamed: 0,text,spacy,stanza,sp_sent_len,st_sent_len,shared_sentences_len
55,"Éric Lada (born October 14, 1965 in Chartres) ...","(Éric, Lada, (, born, October, 14, ,, 1965, in...","[\n [\n {\n ""id"": 1,\n ""text"": ""...",1,1,1


In [18]:
shared_sentence_df.to_csv(CSV_SHARED_SENTENCE_PATH, index=False)
pd.read_csv(CSV_SHARED_SENTENCE_PATH).head()

Unnamed: 0,shared_sentence
0,"""By the time Jason Feddy starts, [its] not jus..."
1,A review by the New York Times described Feddy...
2,"An affable sort of a guy with a soft, reedy sp..."
3,"The band opened to rave reviews: ""We included ..."
4,His mother was (and still is) an active member...


## Tokenization

### Vocabulary recognised by each library

In [19]:
spacy_vocab = compute_spacy_vocabulary(text_df['spacy'])
# Increase or remove slice to display more tokens
print(len(spacy_vocab))
list(spacy_vocab)[:25]

50149


['Fluoreszenz',
 'Invader',
 '"Gaines',
 'Dislocations',
 '156795066',
 'Korus',
 'Elder',
 'Henryk',
 'Volume',
 'Prosecutors',
 'Rajaye',
 'Jerome',
 'instant',
 '3.89',
 'Blacks',
 '1994,has',
 'Braque',
 'invocations',
 'abstraits',
 'Madeira',
 'Shadowmaker',
 'TheReggaeboyz',
 'Lagow',
 'Stray',
 'Kamwada']

In [20]:
# Increase or remove slice to display more tokens 
stanza_vocab = compute_stanza_vocabulary(text_df['stanza'])
print(len(stanza_vocab))
list(stanza_vocab)[:25]

49762


['Fluoreszenz',
 'Invader',
 'Dislocations',
 '156795066',
 '[...]',
 'Korus',
 'Elder',
 'Henryk',
 'Volume',
 'Prosecutors',
 'Rajaye',
 'Jerome',
 'instant',
 'treasury',
 '3.89',
 'Blacks',
 'Braque',
 'invocations',
 'abstraits',
 'Madeira',
 'Shadowmaker',
 'TheReggaeboyz',
 'Lagow',
 'Stray',
 'Kamwada']

In [21]:
shared_vocab = spacy_vocab & stanza_vocab
print(len(shared_vocab))
# Increase or remove slice to display more tokens
list(shared_vocab)[:25]

47882


['Fluoreszenz',
 'Invader',
 'Dislocations',
 '156795066',
 'Korus',
 'Elder',
 'Henryk',
 'Volume',
 'Prosecutors',
 'Rajaye',
 'Jerome',
 'instant',
 '3.89',
 'Blacks',
 'Braque',
 'invocations',
 'abstraits',
 'Madeira',
 'Shadowmaker',
 'TheReggaeboyz',
 'Lagow',
 'Stray',
 'Kamwada',
 '20.25',
 'concurring']

### The set of tokens that is specific to spacy

In [22]:
spacy_only_vocab = spacy_vocab - stanza_vocab
print(len(spacy_only_vocab))
# Increase or remove slice to display more tokens
list(spacy_only_vocab)[:25]

2267


['2⁄3',
 '"Gaines',
 'G-20',
 '30.Law',
 '"During',
 '42–35',
 '1994,has',
 '1902.As',
 'year—40,000',
 '65–51',
 '1956–1994',
 '1814–1867',
 'ordinator',
 '47.Palaszewski',
 '19–7',
 'for-3',
 'CLOUD',
 '\n \n\n\n',
 '7011',
 '1959Silver',
 'Import',
 'Opta',
 '67–67',
 '4230',
 '1925–1940']

### The set of tokens that is specific to stanza

In [23]:
stanza_only_vocab = stanza_vocab - spacy_vocab
print(len(stanza_only_vocab))
# Increase or remove slice to display more tokens
list(spacy_only_vocab)[:25]

1880


['2⁄3',
 '"Gaines',
 'G-20',
 '30.Law',
 '"During',
 '42–35',
 '1994,has',
 '1902.As',
 'year—40,000',
 '65–51',
 '1956–1994',
 '1814–1867',
 'ordinator',
 '47.Palaszewski',
 '19–7',
 'for-3',
 'CLOUD',
 '\n \n\n\n',
 '7011',
 '1959Silver',
 'Import',
 'Opta',
 '67–67',
 '4230',
 '1925–1940']

### The set of shared token occurrences without sentence segmentation

In [24]:
shared_token_by_docs = compute_token_by_doc(text_df['spacy'], text_df['stanza'])
# Increase most_common argument to display more result
FreqDist([t.text for t, _ in shared_token_by_docs]).most_common(25)

[(',', 3391),
 ('the', 2596),
 ('.', 2209),
 ('in', 1397),
 ('of', 1396),
 ('(', 1328),
 ('and', 1257),
 (')', 1254),
 ('a', 1219),
 ('is', 762),
 ('was', 734),
 ('born', 606),
 ('for', 506),
 ('to', 466),
 ('as', 466),
 ('He', 460),
 ('-', 404),
 ('an', 388),
 ('"', 372),
 ('–', 327),
 ('who', 272),
 ('his', 266),
 ('he', 231),
 (':', 230),
 ('from', 217)]

### The set of shared token occurrences with sentence segmentation

In [25]:
shared_token_by_sentences = compute_token_by_sentence(text_df['spacy'], text_df['stanza'])
# Increase most_common argument to display more result
FreqDist([t.text for t, _ in shared_token_by_sentences]).most_common(25)

[(',', 11251),
 ('the', 10320),
 ('.', 8373),
 ('in', 5685),
 ('of', 5590),
 ('and', 5034),
 ('a', 4028),
 ('was', 3152),
 ('to', 3145),
 ('He', 2445),
 ('he', 2088),
 ('for', 1942),
 ('(', 1887),
 ('his', 1767),
 (')', 1740),
 ('as', 1621),
 ('-', 1426),
 ('at', 1281),
 ('In', 1267),
 ('on', 1185),
 ('is', 1158),
 ('with', 1153),
 ('"', 1107),
 ("'s", 992),
 ('from', 962)]

## POS tagging

In [26]:
valid_pos_count, valid_pos_ratio, spacy_pos, stanza_pos = compute_valid_pos(shared_token_by_sentences)

In [27]:
# Number of equivalent pos, ratio of equivalent pos
valid_pos_count, valid_pos_ratio

(192555.0, 95.37)

In [28]:
# Stanza pos frequancy for spacy ADJ
spacy_pos['ADJ']

{'ADJ': 91.48,
 'ADP': 0.08,
 'ADV': 0.55,
 'AUX': 0.0,
 'CCONJ': 0.0,
 'DET': 0.01,
 'INTJ': 0.01,
 'NOUN': 3.81,
 'NUM': 0.02,
 'PART': 0.0,
 'PRON': 0.03,
 'PROPN': 1.57,
 'PUNCT': 0.24,
 'SCONJ': 0.0,
 'SYM': 0.0,
 'VERB': 2.03,
 'X': 0.16}

In [29]:
# Spacy pos frequancy for stanza ADJ
stanza_pos['ADJ']

{'ADJ': 72.55,
 'ADP': 0.06,
 'ADV': 0.73,
 'AUX': 0.0,
 'CCONJ': 0.0,
 'DET': 0.0,
 'INTJ': 0.0,
 'NOUN': 1.55,
 'NUM': 0.03,
 'PART': 0.0,
 'PRON': 0.0,
 'PROPN': 24.33,
 'PUNCT': 0.0,
 'SCONJ': 0.0,
 'SYM': 0.0,
 'VERB': 0.75,
 'X': 0.02}

In [30]:
# Display same tag frequency for each tags
# Tag, Spacy, Stanza
for tag in POS_TAGS:
    print(tag, spacy_pos[tag][tag], stanza_pos[tag][tag])

ADJ 91.48 72.55
ADP 96.87 98.73
ADV 94.31 82.2
AUX 99.07 99.85
CCONJ 99.78 99.37
DET 99.78 99.09
INTJ 23.53 30.77
NOUN 95.74 95.15
NUM 99.49 98.47
PART 97.17 98.44
PRON 98.52 99.52
PROPN 89.14 97.22
PUNCT 98.73 99.51
SCONJ 50.13 53.48
SYM 90.48 26.55
VERB 97.06 96.44
X 27.87 7.19


In [31]:
# Go deeper into X tag
print(spacy_pos['X'])
print(stanza_pos['X'])

{'ADJ': 1.64, 'ADP': 0.82, 'ADV': 0.82, 'AUX': 0.0, 'CCONJ': 0.0, 'DET': 0.0, 'INTJ': 0.0, 'NOUN': 4.1, 'NUM': 1.64, 'PART': 0.0, 'PRON': 0.0, 'PROPN': 35.25, 'PUNCT': 27.87, 'SCONJ': 0.0, 'SYM': 0.0, 'VERB': 0.0, 'X': 27.87}
{'ADJ': 3.17, 'ADP': 1.27, 'ADV': 1.06, 'AUX': 0.0, 'CCONJ': 0.0, 'DET': 0.63, 'INTJ': 0.21, 'NOUN': 21.99, 'NUM': 0.0, 'PART': 0.0, 'PRON': 0.42, 'PROPN': 59.41, 'PUNCT': 0.85, 'SCONJ': 0.0, 'SYM': 0.0, 'VERB': 3.81, 'X': 7.19}
