In [19]:
import spacy
import chardet
from spacy.matcher import Matcher


# Part A

In [20]:
nlp = spacy.load('en_core_web_md')

# Read the text with the correct encoding because there were some errors with some of the characters in ai_forecast1.txt and ai_forecast2.txt.

In [21]:
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
    return result['encoding']


# Detect encoding for each file
encoding_moby_dick = detect_encoding("Texts/mobyDick.txt")
encoding_ai_forecast1 = detect_encoding("Texts/ai_forecast1.txt")
encoding_ai_forecast2 = detect_encoding("Texts/ai_forecast2.txt")

# Read files with detected encoding
moby_dick_text = open("Texts/mobyDick.txt", encoding=encoding_moby_dick).read()
ai_forecast1_text = open("Texts/ai_forecast1.txt", encoding=encoding_ai_forecast1).read()
ai_forecast2_text = open("Texts/ai_forecast2.txt", encoding=encoding_ai_forecast2).read()


In [22]:
# Create SpaCy documents
moby_dick_doc = nlp(moby_dick_text)
ai_forecast1_doc = nlp(ai_forecast1_text)
ai_forecast2_doc = nlp(ai_forecast2_text)


In [23]:
def process_doc(doc):
    print("Tokens and POS:")
    for token in doc:
        print(f"{token.text}: {token.pos_}")

    print("\nSentences:")
    for i, sent in enumerate(doc.sents):
        print(f"Sentence {i + 1}: {sent.text}")

    print("\nNamed Entities:")
    for ent in doc.ents:
        print(f"{ent.text}: {ent.label_}")

    print("\nExplain POS and grammar:")
    for token in doc[:10]:
        print("\nToken:", token.text)
        print("POS:", token.pos_)
        print("Dependency:", token.dep_)
        print("Explanation:", spacy.explain(token.pos_))


def pattern_matching_spacy_docs(docs, patterns):
    matcher = Matcher(nlp.vocab)

    # Add patterns to the matcher with their string representation as IDs
    for pattern in patterns:
        pattern_id = str(pattern)
        matcher.add(pattern_id, [pattern])

    # Initialize a dictionary to store matches
    matches_by_pattern = {}

    # Process each SpaCy document
    for doc_number, doc in enumerate(docs, start=1):
        # Find matches in the document
        matches = matcher(doc)

        # Organize matches by pattern
        for match_id, start, end in matches:
            rule_id = nlp.vocab.strings[match_id]
            span = doc[start:end]

            # Find the sentence containing the match
            sentence = next(sent.text for sent in doc.sents if span.start >= sent.start and span.end <= sent.end)

            match_info = (span.text, doc_number, sentence)
            matches_by_pattern.setdefault(rule_id, []).append(match_info)

    # Display matches for each pattern with document numbers and sentences
    for pattern_id, matched_texts in matches_by_pattern.items():
        print(f"\nMatches for pattern {pattern_id}:")
        for text, doc_number, sentence in matched_texts:
            print(f"- '{text}' found in document {doc_number} in sentence: '{sentence}'\n")


In [24]:
process_doc(moby_dick_doc)

Tokens and POS:
It: PRON
will: AUX
be: AUX
seen: VERB
that: SCONJ
this: DET
mere: ADJ
painstaking: NOUN
burrower: NOUN
and: CCONJ
grub: NOUN
-: PUNCT
worm: NOUN
of: ADP

  : SPACE
a: DET
poor: ADJ
devil: NOUN
of: ADP
a: DET
Sub: PROPN
-: PUNCT
Sub: PROPN
appears: VERB
to: PART
have: AUX
gone: VERB
through: ADP
the: DET
long: ADJ

  : SPACE
Vaticans: PROPN
and: CCONJ
street: NOUN
-: PUNCT
stalls: NOUN
of: ADP
the: DET
earth: NOUN
,: PUNCT
picking: VERB
up: ADP
whatever: DET
random: ADJ

  : SPACE
allusions: NOUN
to: ADP
whales: NOUN
he: PRON
could: AUX
anyways: ADV
find: VERB
in: ADP
any: DET
book: NOUN
whatsoever: ADV
,: PUNCT

  : SPACE
sacred: ADJ
or: CCONJ
profane: ADJ
.: PUNCT
Therefore: ADV
you: PRON
must: AUX
not: PART
,: PUNCT
in: ADP
every: DET
case: NOUN
at: ADP
least: ADJ
,: PUNCT

  : SPACE
take: VERB
the: DET
higgledy: ADJ
-: PUNCT
piggledy: ADJ
whale: NOUN
statements: NOUN
,: PUNCT
however: ADV
authentic: ADJ
,: PUNCT
in: ADP

  : SPACE
these: DET
extracts: NOUN
,: PUNCT
f

In [25]:
process_doc(ai_forecast1_doc)

Tokens and POS:
Pune: PROPN
,: PUNCT
India: PROPN
,: PUNCT
Sept.: PROPN
13: NUM
,: PUNCT
2022: NUM
(: PUNCT
GLOBE: PROPN
NEWSWIRE: PROPN
): PUNCT
--: PUNCT
The: DET
global: ADJ
AI: PROPN
market: NOUN
size: NOUN
is: AUX
projected: VERB
to: PART
grow: VERB
from: ADP
USD: NOUN
387.45: NUM
billion: NUM
in: ADP
2022: NUM
to: ADP
USD: NOUN
1394.30: NUM
billion: NUM
in: ADP
2029: NUM
at: ADP
a: DET
CAGR: NOUN
of: ADP
20.1: NUM
%: NOUN
in: ADP
the: DET
forecast: NOUN
period: NOUN
.: PUNCT
Growing: VERB
investment: NOUN
in: ADP
AI: PROPN
technology: NOUN
by: ADP
enterprises: NOUN
of: ADP
all: DET
sizes: NOUN
across: ADP
industries: NOUN
to: PART
garner: VERB
momentum: NOUN
in: ADP
the: DET
next: ADJ
several: ADJ
years: NOUN
.: PUNCT
Fortune: PROPN
Business: PROPN
Insights: PROPN
™: NOUN
published: VERB
this: DET
information: NOUN
in: ADP
its: PRON
recent: ADJ
report: NOUN
,: PUNCT
titled: VERB
“: PUNCT
Artificial: PROPN
Intelligence: PROPN
Market: PROPN
Forecast: PROPN
,: PUNCT
2022: NUM
-: SYM

In [26]:
process_doc(ai_forecast2_doc)

Tokens and POS:
The: DET
global: ADJ
artificial: ADJ
intelligence: NOUN
market: NOUN
size: NOUN
was: AUX
$: SYM
93.5: NUM
billion: NUM
in: ADP
2021: NUM
.: PUNCT
And: CCONJ
according: VERB
to: ADP
Grand: PROPN
View: PROPN
Research: PROPN
,: PUNCT
Inc.: PROPN
,: PUNCT
it: PRON
is: AUX
projected: VERB
to: PART
expand: VERB
at: ADP
a: DET
compound: ADJ
annual: ADJ
growth: NOUN
rate: NOUN
(: PUNCT
CAGR: PROPN
): PUNCT
of: ADP
38.1: NUM
%: NOUN
from: ADP
2022: NUM
to: ADP
2030: NUM
.: PUNCT
The: DET
market: NOUN
is: AUX
driven: VERB
by: ADP
the: DET
increasing: VERB
number: NOUN
of: ADP
connected: ADJ
devices: NOUN
,: PUNCT
growing: VERB
demand: NOUN
for: ADP
personalized: ADJ
services: NOUN
,: PUNCT
and: CCONJ
a: DET
need: NOUN
for: ADP
real: ADJ
-: PUNCT
time: NOUN
solutions: NOUN
.: PUNCT
Additionally: ADV
,: PUNCT
advancements: NOUN
in: ADP
cloud: PROPN
computing: NOUN
technology: PROPN
,: PUNCT
along: ADP
with: ADP
developments: NOUN
in: ADP
deep: ADJ
learning: NOUN
algorithms: NOUN
,:

In [27]:
patterns = [
    # Pattern for "Artificial Intelligence"
    [{"LOWER": "artificial"}, {"LOWER": "intelligence"}],

    # Pattern for "AI" followed by a verb
    [{"LOWER": "ai"}, {"POS": "VERB"}],

    # Pattern for numbers followed by %
    [{"LIKE_NUM": True}, {"TEXT": "%"}],

    # Pattern for company names (heuristic)
    [{"ENT_TYPE": "ORG"}]
]

In [28]:
pattern_matching_spacy_docs([ai_forecast1_doc, ai_forecast2_doc], patterns)


Matches for pattern [{'ENT_TYPE': 'ORG'}]:
- 'GLOBE' found in document 1 in sentence: 'Pune, India, Sept. 13, 2022 (GLOBE NEWSWIRE) --'

- 'NEWSWIRE' found in document 1 in sentence: 'Pune, India, Sept. 13, 2022 (GLOBE NEWSWIRE) --'

- 'Microsoft' found in document 1 in sentence: 'Request a Sample Copy of the Research Report:
https://www.fortunebusinessinsights.com/enquiry/request-sample-pdf/artificial-intelligence-market-100114

Microsoft accelerates industry cloud strategy for healthcare with the acquisition of Nuance

April 12, 2021 | Microsoft News Center	

Share on Facebook (opens new window)
'

- 'Microsoft' found in document 1 in sentence: 'Request a Sample Copy of the Research Report:
https://www.fortunebusinessinsights.com/enquiry/request-sample-pdf/artificial-intelligence-market-100114

Microsoft accelerates industry cloud strategy for healthcare with the acquisition of Nuance

April 12, 2021 | Microsoft News Center	

Share on Facebook (opens new window)
'

- 'News' found in

# Part B

In [29]:
# Detect encoding for each file
encoding_balzac = detect_encoding("documents/HonoreDeBalzac.txt")
encoding_alice = detect_encoding("documents/AliceBrown.txt")
encoding_chesterton = detect_encoding("documents/Chesterton.txt")

balzac_text = open("documents/HonoreDeBalzac.txt", encoding=encoding_balzac).read()
alice_text = open("documents/AliceBrown.txt", encoding=encoding_alice).read()
chesterton_text = open("documents/Chesterton.txt", encoding=encoding_chesterton).read()

In [30]:
# Create SpaCy documents
balzac_doc = nlp(balzac_text)
alice_doc = nlp(alice_text)
chesterton_doc = nlp(chesterton_text)

In [31]:
def print_word_count(*docs):
    for i, doc in enumerate(docs, start=1):
        word_count = len([token for token in doc if not token.is_space])
        print(f"Document {i}: Word Count - {word_count} words")

# Assuming nlp is the SpaCy model loaded earlier
print_word_count(balzac_doc, alice_doc, chesterton_doc)

Document 1: Word Count - 7127 words
Document 2: Word Count - 5851 words
Document 3: Word Count - 6416 words


In [32]:
def compare_similarity_of_first_100_tokens(*doc_name_pairs):
    for i, (doc_name1, doc1) in enumerate(doc_name_pairs):
        for _, (doc_name2, doc2) in enumerate(doc_name_pairs[i+1:]):
            # Extract the first 100 tokens from each document
            tokens1 = [token.text for token in doc1[:100]]
            tokens2 = [token.text for token in doc2[:100]]

            # Convert token lists to strings for similarity comparison
            text1 = " ".join(tokens1)
            text2 = " ".join(tokens2)

            # Calculate and print the similarity between the first 100 tokens
            similarity = nlp(text1).similarity(nlp(text2))
            print(f"Similarity between {doc_name1} and {doc_name2} (First 100 Tokens): {similarity:.4f}")

docs = [("HonoreDeBalzac", balzac_doc), ("AliceBrown", alice_doc), ("Chesterton", chesterton_doc)]
compare_similarity_of_first_100_tokens(*docs)



Similarity between HonoreDeBalzac and AliceBrown (First 100 Tokens): 0.8915
Similarity between HonoreDeBalzac and Chesterton (First 100 Tokens): 0.8936
Similarity between AliceBrown and Chesterton (First 100 Tokens): 0.9286


In [33]:
def add_custom_entity(text, entity_text, label, model):
    # Process the text with the existing model
    doc = model(text)

    # Find the start and end positions of the entity text
    start = None
    end = None
    for token in doc:
        if token.text == entity_text:
            start = token.idx
            end = token.idx + len(token.text)
            break

    if start is not None and end is not None:
        # Add a custom entity to the model
        doc.ents = list(doc.ents) + [(start, end, label)]

        # Process the text with the updated model
        doc = model(text)

    return doc

sample_text = "Hello, unfortunately Nikola Velikov is not available at this moment. Please leave a message after the tone."

entity_text = "Nikola Velikov"

# Add the custom entity to the model
custom_entity_doc = add_custom_entity(sample_text, entity_text, "PERSON", nlp)

print("Entities in the document:")
for ent in custom_entity_doc.ents:
    print(f"{ent.text}: {ent.label_}")


Entities in the document:
Nikola Velikov: PERSON


# Part C

In [34]:
print("Pipeline components:", nlp.pipe_names)

Pipeline components: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [35]:
@spacy.Language.component("print_longest_token")
def print_longest_token(doc):
    longest_token = max(doc, key=lambda token: len(token.text))
    print("Longest token:", longest_token.text)
    return doc

# Uncomment for the first time
nlp.add_pipe("print_longest_token", last=True)

print("Pipeline components:", nlp.pipe_names)

# Example text
text = "This is me being creative and smart using some long words like elephantiasis(i have no idea what it means, saw it on tiktok) and antidisestablishmentarianism."

# Process the text with the updated pipeline
doc = nlp(text)

Pipeline components: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner', 'print_longest_token']
Longest token: antidisestablishmentarianism
