<a href="https://colab.research.google.com/github/UniVR-DH/ADHLab/blob/main/lecture06.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Testing Named Entity Recognition

We will use the Spacy Library:
https://spacy.io/usage/spacy-101


<img src="https://drive.google.com/uc?export=view&id=1m_EMdnI5C826kgqK7r5vB4TXnB0-Wq7W" alt="Intestazione con loghi istituzionali" width="525"/>

| Docente      | Insegnamento | Anno Accademico    |
| :---        |    :----   |          ---: |
| Matteo Lissandrini      | Laboratorio Avanzato di Informatica Umanistica       | 2023/2024   |

## Usual install and basic imports

In [1]:
%pip install wikipedia-api
%pip install spacy==3.7.0

Collecting wikipedia-api
  Downloading Wikipedia_API-0.6.0-py3-none-any.whl (14 kB)
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.6.0
Collecting spacy==3.7.0
  Downloading spacy-3.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
Collecting weasel<0.4.0,>=0.1.0 (from spacy==3.7.0)
  Downloading weasel-0.3.4-py3-none-any.whl (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.1/50.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting cloudpathlib<0.17.0,>=0.7.0 (from weasel<0.4.0,>=0.1.0->spacy==3.7.0)
  Downloading cloudpathlib-0.16.0-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.0/45.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cloudpathlib, weasel, spacy
  Attempting uninstall: spacy
    Found existing

In [2]:
import wikipediaapi
import re

# Step 1: Import necessary libraries
#wiki_wiki = wikipediaapi.Wikipedia('en', user_agent='YourUserAgent/1.0')  # WRONG
# wiki_wiki = wikipediaapi.Wikipedia('en', extract_format=wikipediaapi.ExtractFormat.WIKI, user_agent='YourUserAgent/1.0') # WRONG
wiki_wiki = wikipediaapi.Wikipedia('MyTestProjectName (my.name@univr.it)',
                                   'en',
                                   extract_format=wikipediaapi.ExtractFormat.WIKI)

# Step 2: Prepare a list of titles of Wikipedia pages
fantasy_literature_titles = [
    "Fantasy literature",
    "The Lord of the Rings",
    "Harry Potter",
    "A Song of Ice and Fire",
    "The Chronicles of Narnia",
    "The Hobbit",
    "Alice's Adventures in Wonderland",
    "The Wizard of Oz",
    "The Silmarillion",
    "Discworld",
    "His Dark Materials",
    "The Wheel of Time",
    "Earthsea",
    "The Once and Future King",
    "The Princess Bride",
    "The Name of the Wind",
    "Mistborn",
    "The Malazan Book of the Fallen",
    "The Kingkiller Chronicle",
    "The Inheritance Cycle"
]

# Step 3: Write code to download main content in plain text and create a collection of cleaned text
cleaned_texts = []

for title in fantasy_literature_titles:
    page_py = wiki_wiki.page(title)

    if page_py.exists():
        # Keep the original URL of the page
        original_url = page_py.fullurl

        # Remove brackets and their contents from the text
        content = re.sub(r'\[[^\]]*\]', '', page_py.text)

        # Remove newline characters and extra spaces
        clean_text = ' '.join(content.split())

        # Store the cleaned text along with the original URL
        cleaned_texts.append({'title': title, 'content': clean_text, 'url': original_url})
    else:
        print(f"Page '{title}' does not exist on Wikipedia.")

# Display the cleaned texts and their original URLs (optional)
# for idx, entry in enumerate(cleaned_texts, 1):
#     print(f"Text {idx} - Title: {entry['title']}\nURL: {entry['url']}\nContent:\n{entry['content']}\n{'='*50}\n")


In [5]:
from collections import defaultdict
import re

# Step 4: Build an inverted index of lowercase ngrams ignoring non-alphanumeric symbols
inverted_index = defaultdict(list)

NGRAM_SIZE = 5

for entry in cleaned_texts:
    title = entry['title']
    content = entry['content'].lower()
    url = entry['url']

    # Remove non-alphanumeric symbols
    content = re.sub(r'[^a-z0-9 ]', '', content)

    # Generate ngrams of size NGRAM_SIZE
    ngrams = [content[i:i+NGRAM_SIZE] for i in range(len(content)-(NGRAM_SIZE-1))]

    # Build inverted index
    for trigram in set(ngrams):  # Using set to remove duplicates
        inverted_index[trigram].append({'title': title, 'url': url})

# Display the inverted index (optional)
# for ngram, entries in inverted_index.items():
#     print(f"N-gram: {ngram}")
#     for entry in entries:
#         print(f"  Title: {entry['title']}, URL: {entry['url']}")


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

def compute_tfidf(query, inverted_index, cleaned_texts):
    # Step 5: Given an input query, compute the TF-IDF for each page that matches at least one ngram
    query = query.lower()

    # Remove non-alphanumeric symbols
    query = re.sub(r'[^a-z0-9 ]', '', query)

    # Generate ngrams
    query_ngrams = [query[i:i+NGRAM_SIZE] for i in range(len(query)-(NGRAM_SIZE-1))]

    # Find matching trigrams in the inverted index
    matching_ngrams = set(query_ngrams).intersection(inverted_index.keys())

    # Collect titles and URLs of pages matching the trigrams
    matching_pages = []
    for ngram in matching_ngrams:
        matching_pages.extend(inverted_index[ngram])

    # Get unique titles and URLs
    unique_titles = list({page['title'] for page in matching_pages})
    unique_urls = list({page['url'] for page in matching_pages})

    # Extract content for the matching pages
    matching_contents = [entry['content'] for entry in cleaned_texts if entry['title'] in unique_titles]

    # Compute TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(matching_contents)

    # Get feature names (words) from the vectorizer
    feature_names = vectorizer.get_feature_names_out()

    # Prepare a list of pairs with URL and TF-IDF score for each matching page
    result_list = []
    for i, title in enumerate(unique_titles):
        tfidf_values = tfidf_matrix[i].toarray()[0]
        page_tfidf = {feature_names[j]: tfidf_values[j] for j in range(len(feature_names))}
        result_list.append({'url': unique_urls[i], 'tfidf_score': page_tfidf})

    return result_list

# Example usage:
query = "fantasy adventure"
result = compute_tfidf(query, inverted_index, cleaned_texts)
print(len(result))


20


In [8]:
# Function to print the top-k URLs and their TF-IDF scores
def print_top_k_urls(result_list, k):
    # Sort the result list based on the sum of TF-IDF scores
    sorted_results = sorted(result_list, key=lambda x: sum(x['tfidf_score'].values()), reverse=True)

    # Print the top-k URLs and their TF-IDF scores
    for i in range(min(k, len(sorted_results))):
        url = sorted_results[i]['url']
        tfidf_value = sum(sorted_results[i]['tfidf_score'].values())
        print(f"Top {i+1} URL: {url}, TF-IDF Value: {tfidf_value}")

# Example usage:
query = "fantasy adventure"
k = 5  # Change k to the desired number of top URLs
result = compute_tfidf(query, inverted_index, cleaned_texts)
print_top_k_urls(result, k)


Top 1 URL: https://en.wikipedia.org/wiki/Earthsea, TF-IDF Value: 15.266600695829172
Top 2 URL: https://en.wikipedia.org/wiki/The_Name_of_the_Wind, TF-IDF Value: 14.107094592697385
Top 3 URL: https://en.wikipedia.org/wiki/The_Once_and_Future_King, TF-IDF Value: 13.80408119729509
Top 4 URL: https://en.wikipedia.org/wiki/The_Silmarillion, TF-IDF Value: 13.661325773324807
Top 5 URL: https://en.wikipedia.org/wiki/The_Inheritance_Cycle, TF-IDF Value: 13.65790134661919
