In [1]:
import json
with open('../data/cleaned_wiki.json', 'r') as f:
    data = json.load(f)

In [2]:
#fonction to get the title of the paragraph
def get_title_by_id(paragraph_id, data):
    for section in data:
        for content_item in section["content"]:
            if content_item["type"].startswith("h") and content_item["ids"]:
                if paragraph_id in content_item["ids"]:
                    return content_item["title"]
    return None



#test 
paragraph_id_to_find = 41  
title = get_title_by_id(paragraph_id_to_find, data)

if title:
    print(f"The title for paragraph ID {paragraph_id_to_find} is: {title}")
else:
    print(f"No title found for paragraph ID {paragraph_id_to_find}")

The title for paragraph ID 41 is: program exampl


In [3]:
#fonction to get all the title related to the paragraph

def get_titles_by_id(paragraph_id, data):

    #set values
    page_title = data[0]["title"]  #the title of the page
    previous_h2_title = None #the title of the division containing subtitles
    paragraph_title = None #the title of the paragraph

    for section in data:
        for content_item in section["content"]:
            #verify the type of the title to check if it is a sub title
            if content_item["type"] == "h2":
                previous_h2_title = content_item["title"]
            elif content_item["type"] == "h3" and content_item["ids"] and paragraph_id in content_item["ids"]:
                paragraph_title = content_item["title"]
                return page_title, paragraph_title, previous_h2_title

    return page_title, paragraph_title, previous_h2_title

# test
paragraph_id_to_find = 41  
page_title, paragraph_title, previous_h2_title = get_titles_by_id(paragraph_id_to_find, data)

if paragraph_title:
    print(f"Page Title: {page_title}")
    print(f"Paragraph Title: {paragraph_title}")
    if previous_h2_title:
        print(f"Previous H2 Title: {previous_h2_title}")


In [4]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import json
from nltk.stem import PorterStemmer

STOPWORDS = set(stopwords.words('english'))

def clean_text(text: str) -> str:
    # convert to lowercase
    lower_cased = text.lower()
    # remove references in square brackets
    no_references = re.sub(r'\[.*?\]', '', lower_cased)
    # keep only alphanumeric characters
    alphanumeric = re.sub(r'\W+', ' ', no_references)
    # remove strange unicode characters
    alphanumeric = re.sub(r'[^\x00-\x7F]+', '', alphanumeric)
    # remove stopwords
    no_stopwords = ' '.join([word for word in alphanumeric.split() if word not in STOPWORDS])
    # stemmize
    stemmed = ' '.join([PorterStemmer().stem(word) for word in no_stopwords.split()])
    return stemmed

def get_n_gram(n, text):
    # get word n_gram
    words = text.split()
    n_gram = []
    for i in range(len(words) - n + 1):
        n_gram.append(' '.join(words[i:i+n]))
    return n_gram

query = 'What is the capital of France?'

cleaned_query = clean_text(query)
print(cleaned_query)

capit franc


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jules\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def score_paragraph_by_query(query, paragraph_id, json_data):
    page_title, paragraph_title, previous_h2_title = get_titles_by_id(paragraph_id, json_data)

    if paragraph_title:

        score = 0

        # Check if any word from the query is present in the titles

        #split the query
        query_words = query.lower().split()

        for word in query_words:
            if word in page_title.lower():
                score += 1  
            elif word in paragraph_title.lower():
                score += 100 # Add higher score for matching in page title
            elif previous_h2_title and word in previous_h2_title.lower():
                score += 10

        return page_title, paragraph_title, previous_h2_title, score

    return None

# Test
query_to_check = "Python programming language" 
paragraph_id_to_find = 10  
result = score_paragraph_by_query(query_to_check, paragraph_id_to_find, data)


if result:
    page_title, paragraph_title, previous_h2_title, score = result
    print(f"Page Title: {page_title}")
    print(f"Paragraph Title: {paragraph_title}")
    if previous_h2_title:
        print(f"Previous H2 Title: {previous_h2_title}")
    print(f"Score: {score}")


In [6]:
from nltk.tokenize import word_tokenize

def calculate_score(query, title):
    query_words = set(word_tokenize(clean_text(query)))
    title_words = set(word_tokenize(clean_text(title)))
    overlap = len(query_words.intersection(title_words))
    return overlap

#example
# Query tokens: ['capit', 'franc']
# Title tokens: ['capit', 'citi', 'europ', 'explor', 'pari']
# Intersection: {'capit'}
# Overlap score: 1

def rank_paragraphs_by_similarity(query, json_data):
    cleaned_query = clean_text(query)
    page_titles = []
    paragraph_titles = []
    previous_h2_titles = []
    paragraph_ids = []

    # Extract titles and IDs from JSON
    for section in json_data:
        for content_item in section["content"]:
            if content_item["type"] == "h2":
                previous_h2_titles.append(content_item["title"])
            elif content_item["type"] == "h3" and content_item["ids"]:
                paragraph_titles.append(content_item["title"])
                paragraph_ids.append(content_item["ids"])

    # Calculate word scores
    scores = {}
    for i, title in enumerate(page_titles + previous_h2_titles + paragraph_titles):
        score = calculate_score(cleaned_query, title)
        scores[paragraph_ids[i]] = score

    # Sort the dictionary by scores in descending order
    sorted_scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

    return sorted_scores

# Example usage:
query_to_rank = 'What is the capital of France?'
ranked_paragraph_scores = rank_paragraphs_by_similarity(query_to_rank, data)

print(f"Ranked Paragraph Scores: {ranked_paragraph_scores}")


TypeError: unhashable type: 'list'