# Install Packages

In [None]:
# pip install --upgrade ir_datasets

In [None]:
# pip install python-terrier

In [None]:
# pip install ir-measures

In [None]:
# !pip install pyspellchecker

In [None]:
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('averaged_perceptron_tagger')

In [None]:
# !pip install gdown

In [None]:
# pip install datefinder

In [None]:
# pip install nltk sentence-transformers

# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import ir_datasets
import nltk
import re
import string
from nltk.tokenize import word_tokenize , sent_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.corpus import wordnet
# from spellchecker import SpellChecker
from typing import List
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
# import gdown
# from google.colab import files
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gensim
import gensim.downloader as api
from gensim.models import Word2Vec
from gensim.models import FastText

# Downloads Datasets

### Load Recreation Dataset && Understand dataset

In [2]:
dataset = ir_datasets.load("lotte/recreation/dev/forum")
for doc in dataset.docs_iter():
    doc # namedtuple<doc_id, text>

In [10]:
dataset.docs_count()

263025

In [11]:
docs = pd.DataFrame(dataset.docs_iter())

In [12]:
docs.head()

Unnamed: 0,doc_id,text
0,0,"Multiclassing no longer takes an XP hit, and y..."
1,1,There's a fairly large smattering of stuff. He...
2,2,"In 3rd edition this was not an official rule, ..."
3,3,The only official ruling I can recall in any c...
4,4,"I like this one, that I use on occasion: ‎Jrey..."


In [9]:
docs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263025 entries, 0 to 263024
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   doc_id  263025 non-null  object
 1   text    263025 non-null  object
dtypes: object(2)
memory usage: 4.0+ MB


In [4]:
queries = pd.DataFrame(dataset.queries_iter())

In [5]:
queries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2002 entries, 0 to 2001
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   query_id  2002 non-null   object
 1   text      2002 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [6]:
qrels = pd.DataFrame(dataset.qrels_iter())

In [7]:
qrels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12752 entries, 0 to 12751
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   query_id   12752 non-null  object
 1   doc_id     12752 non-null  object
 2   relevance  12752 non-null  int64 
 3   iteration  12752 non-null  object
dtypes: int64(1), object(3)
memory usage: 398.6+ KB


In [14]:
for doc in dataset.docs_iter()[:2]:
  print(doc)

GenericDoc(doc_id='0', text='Multiclassing no longer takes an XP hit, and your favored class gives you one of two bonuses at every level: +1 hp +1 skill point Advanced Players Guide add other options for specific Race/Class combos')
GenericDoc(doc_id='1', text='There\'s a fairly large smattering of stuff. Here\'s a taste: Races have all been powered up; +2 to two stats, -2 to one stat (including half-orcs!), with some different abilities for some. In 3.5 terms, they\'re roughly an ECL of +1 (as opposed to 0 in 3e). No one has a favored class decided by their race now - you choose your favored class, typically the one you level at 1st level, but you don\'t get an XP penalty for multiclassing too high above that one - you just gain an extra skill point or hit point for leveling your favored class. Many of the classes have had options added, often to fill out "dead levels." Barbarians gain extra "rage powers," sorcerers have bloodlines that give them granted powers, wizards have varying e

In [15]:
# check for duplicated data
print(len(docs[docs.duplicated()]))

0


In [16]:
# check for nullable value
docs.isna().sum()

doc_id    0
text      0
dtype: int64

# Data Representation - Indexing

In [14]:
class DataPreProcessing:
    custom_words = {
    'isnt', 'arent', 'im', 'id', 'ie', 'eg', 'ive', 'whatev', 'wed', 'somehow',
    'going', 'get', 'yes', 'no', 'couldnt', 'didnt', 'dont', 'doesnt', 'would',
    'could', 'should', 'cant', 'wont', 'hasnt', 'hadnt', 'havent', 'mightnt',
    'mustnt', 'neednt', 'shall', 'shant', 'werent', 'wouldnt', 'ought', 'oughtnt',
    'aint', 'gonna', 'wanna', 'whatcha', 'yall', 'ya', 'gotta', 'coulda', 'shoulda',
    'woulda', 'lotta', 'lemme', 'kinda', 'sorta', 'hafta', 'dunno', 'outta', 'alot',
    'yup', 'nope', 'nah', 'yeah', 'uh', 'um', 'uhm', 'okay', 'ok', 'yep', 'hmm',
    'mmm', 'oh', 'hey', 'hi', 'hello', 'bye', 'goodbye', 'please', 'thanks', 'thank',
    'welcome', 'etc', 'alright', 'okay', 'ok', 'gonna', 'gotta', 'wanna', 'kinda',
    'sorta', 'lemme', 'coulda', 'shoulda', 'woulda', 'whereby', 'many', 'much', 'want',
    'always'
    }
    stop_words = set(stopwords.words('english')).union(custom_words)
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    # Abbreviation dictionary
    abbreviations = {
        'RPG': 'Role-Playing Game',
        'SF': 'Science Fiction',
        "AFAIK": "As Far As I Know",
        "AFK": "Away From Keyboard",
        "ASAP": "As Soon As Possible",
        "ATK": "At The Keyboard",
        "ATM": "At The Moment",
        "A3": "Anytime, Anywhere, Anyplace",
        "BAK": "Back At Keyboard",
        "BBL": "Be Back Later",
        "BBS": "Be Back Soon",
        "BFN": "Bye For Now",
        "B4N": "Bye For Now",
        "BRB": "Be Right Back",
        "BRT": "Be Right There",
        "BTW": "By The Way",
        "B4": "Before",
        "B4N": "Bye For Now",
        "CU": "See You",
        "CUL8R": "See You Later",
        "CYA": "See You",
        "FAQ": "Frequently Asked Questions",
        "FC": "Fingers Crossed",
        "FWIW": "For What It's Worth",
        "FYI": "For Your Information",
        "GAL": "Get A Life",
        "GG": "Good Game",
        "GN": "Good Night",
        "GMTA": "Great Minds Think Alike",
        "GR8": "Great!",
        "G9": "Genius",
        "IC": "I See",
        "ICQ": "I Seek you (also a chat program)",
        "ILU": "ILU: I Love You",
        "IMHO": "In My Honest/Humble Opinion",
        "IMO": "In My Opinion",
        "IOW": "In Other Words",
        "IRL": "In Real Life",
        "KISS": "Keep It Simple, Stupid",
        "LDR": "Long Distance Relationship",
        "LMAO": "Laugh My A.. Off",
        "LOL": "Laughing Out Loud",
        "LTNS": "Long Time No See",
        "L8R": "Later",
        "MTE": "My Thoughts Exactly",
        "M8": "Mate",
        "NRN": "No Reply Necessary",
        "OIC": "Oh I See",
        "PITA": "Pain In The A..",
        "PRT": "Party",
        "PRW": "Parents Are Watching",
        "QPSA?": "Que Pasa?",
        "ROFL": "Rolling On The Floor Laughing",
        "ROFLOL": "Rolling On The Floor Laughing Out Loud",
        "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
        "SK8": "Skate",
        "STATS": "Your sex and age",
        "ASL": "Age, Sex, Location",
        "THX": "Thank You",
        "TTFN": "Ta-Ta For Now!",
        "TTYL": "Talk To You Later",
        "U": "You",
        "U2": "You Too",
        "U4E": "Yours For Ever",
        "WB": "Welcome Back",
        "WTF": "What The F...",
        "WTG": "Way To Go!",
        "WUF": "Where Are You From?",
        "W8": "Wait...",
        "7K": "Sick:-D Laugher",
        "TFW": "That feeling when",
        "MFW": "My face when",
        "MRW": "My reaction when",
        "IFYP": "I feel your pain",
        "TNTL": "Trying not to laugh",
        "JK": "Just kidding",
        "IDC": "I don't care",
        "ILY": "I love you",
        "IMU": "I miss you",
        "ADIH": "Another day in hell",
        "ZZZ": "Sleeping, bored, tired",
        "WYWH": "Wish you were here",
        "TIME": "Tears in my eyes",
        "BAE": "Before anyone else",
        "FIMH": "Forever in my heart",
        "BSAAW": "Big smile and a wink",
        "BWL": "Bursting with laughter",
        "BFF": "Best friends forever",
        "CSL": "Can't stop laughing",
        'TVS': 'television',
        'TV': 'television',
        # Anime
        'OVA': 'Original Video Animation',
        'OP': 'Opening (Theme Song)',
        'ED': 'Ending (Theme Song)',
        'AMV': 'Anime Music Video',
        'CG': 'Computer Graphics',
        'MC': 'Main Character',
        'OTP': 'One True Pairing',
        'PV': 'Promotional Video',
        'BD': 'Blu-ray Disc',
        'LN': 'Light Novel',
        # Boardgames
        'BGG': 'BoardGameGeek',
        'TCG': 'Trading Card Game',
        'CCG': 'Collectible Card Game',
        'LFG': 'Looking For Group',
        'GMT': 'Greenwich Mean Time',
        'AP': 'Analysis Paralysis',
        'RPG': 'Role-Playing Game',
        'GM': 'Game Master',
        'XP': 'Experience Points',
        'VP': 'Victory Points',
        'AOE': 'Area of Effect',
        # Gaming
        'FPS': 'First-Person Shooter',
        'MMORPG': 'Massively Multiplayer Online Role-Playing Game',
        'DLC': 'Downloadable Content',
        'NPC': 'Non-Player Character',
        'PvP': 'Player versus Player',
        'PvE': 'Player versus Environment',
        'AFK': 'Away From Keyboard',
        'RTS': 'Real-Time Strategy',
        'HUD': 'Heads-Up Display',
        'RNG': 'Random Number Generator',
        'DD': 'Dungeons Dragons',
        'DMG':'Dungeon Master Guide',
        'NPCS':'Non Player Characters',
        'NPC':'Non Player Characters',
        'E': 'edition',
        '1E': '1th edition',
        '2E': '2th edition',
        '3E': '3th edition',
        '4E': '4th edition',
        '5E': '5th edition',
        '6E': '6th edition',
        '7E': '7th edition',
        '8E': '8th edition',
        'OOC':'out of character',
        'IRCD':'Internet Relay Chat daemon',
        'IRC':'Internet Relay Chat',
        'WRM':'Warrior Rogue Mage',
        'MMORPGS': 'massively multiplayer online role-playing game',
        'MMORPG': 'massively multiplayer online role-playing game',
        'FFG':'Fantasy Flight Games',
        'DDI': 'Data Design Interactive',
        'BBEG': 'Big Bad Evil End Guy Gal',
        'MM1':'Murder Mystery part 1',
        'MM2':'Murder Mystery part 2',
        'MM3':'Murder Mystery part 3',
        # Movies
        'CGI': 'Computer-Generated Imagery',
        'VFX': 'Visual Effects',
        'SFX': 'Special Effects',
        'BTS': 'Behind The Scenes',
        'POV': 'Point Of View',
        '3D': 'Three-Dimensional',
        'IMAX': 'Image Maximum',
        'DVD': 'Digital Versatile Disc',
        'OS': 'Original Soundtrack',
        'ADR': 'Automated Dialogue Replacement',
        'II': 'part ii',
        'III': 'part iii',
        'SW': 'Star Wars',
        # Photography
        'DSLR': 'Digital Single-Lens Reflex',
        'HDR': 'High Dynamic Range',
        'ISO': 'International Organization for Standardization',
        'RAW': 'Raw Image Format',
        'DOF': 'Depth Of Field',
        'AF': 'Autofocus',
        'WB': 'White Balance',
        'FPS': 'Frames Per Second',
        'EV': 'Exposure Value',
        'TTL': 'Through The Lens',
        # RPG
        'HP': 'Hit Points',
        'XP': 'Experience Points',
        'DM': 'Dungeon Master',
        'NPC': 'Non-Player Character',
        'PC': 'Player Character',
        'AC': 'Armor Class',
        'GM': 'Game Master',
        'TPK': 'Total Party Kill',
        'DC': 'Difficulty Class',

        # Sci-Fi
        'AI': 'Artificial Intelligence',
        'UFO': 'Unidentified Flying Object',
        'FTL': 'Faster Than Light',
        'VR': 'Virtual Reality',
        'AR': 'Augmented Reality',
        'SF': 'Science Fiction',
        'ET': 'Extraterrestrial',
        'BEM': 'Bug-Eyed Monster',
        'EMP': 'Electromagnetic Pulse',
        'HCI': 'Human-Computer Interaction'
        # Add more abbreviations specific to your dataset
    }

    @staticmethod
    def remove_urls(text):
        return re.sub(r'http\S+|www.\S+', '', text)

    @staticmethod
    def remove_non_english_chars(text):
        allowed_chars = string.ascii_letters + string.digits + string.punctuation + " "
        filtered_text = ''.join(char if char in allowed_chars else '' for char in text)
        return filtered_text

    @staticmethod
    def remove_punctuation(tokens):
        translator = str.maketrans({char: ' ' if char != '&' else '' for char in string.punctuation})
        # Process each token individually
        cleaned_tokens = []
        for token in tokens:
            no_punct = token.translate(translator)
            clean_token = re.sub(r'\s+', ' ', no_punct).strip()
            if clean_token:  # Avoid adding empty strings
                cleaned_tokens.append(clean_token)
        return cleaned_tokens

    @staticmethod
    def replace_abbreviation(tokens):
        new_text = []
        for i in tokens:
            if i.upper() in DataPreProcessing.abbreviations:
                new_text.append(DataPreProcessing.abbreviations[i.upper()])
            else:
                new_text.append(i)
        return new_text

    @staticmethod
    def toLowercase(text):
        return text.lower()

    @staticmethod
    def fix_repeated_chars(text):
        pattern = r'(\w)(\1{2,})'
        fixed_text = re.sub(pattern, r'\1\1', text)
        return fixed_text

    @staticmethod
    def remove_time(text):
        time_pattern = r'\b\d{1,2}:\d{2}(?::\d{2})?\b'
        text_without_time = re.sub(time_pattern, '', text)
        clean_text = re.sub(r'\s+', ' ', text_without_time).strip()
        return clean_text

    @staticmethod
    def remove_null_values(docs):
        return docs.dropna()

    @staticmethod
    def tokenize_text(text):
        return word_tokenize(text)

    @staticmethod
    def remove_stopwords(tokens):
        return [word for word in tokens if word not in DataPreProcessing.stop_words]

    @staticmethod
    def remove_duplicated_chars(tokens):
        def has_duplicated_chars(word):
            return len(set(word)) == 1

        filtered_words = [word for word in tokens if len(word) != 2 and not has_duplicated_chars(word)]
        return filtered_words

    @staticmethod
    def remove_words_start_with_duplicate_chars(text):
        # Define a regular expression pattern to match words starting with duplicate alphabetical characters
        pattern = r'\b([a-zA-Z])\1\w*\b'
        # Use re.sub to replace matched words with an empty string
        result = re.sub(pattern, '', text)
        return result

    @staticmethod
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    @staticmethod
    def lemmatize_tokens(tokens):
        pos_tagged_tokens = pos_tag(tokens)
        lemmatized_tokens = [
            DataPreProcessing.lemmatizer.lemmatize(token, DataPreProcessing.get_wordnet_pos(pos_tag))
            for token, pos_tag in pos_tagged_tokens
        ]
        return lemmatized_tokens

    @staticmethod
    def stem_tokens(tokens):
        return [DataPreProcessing.stemmer.stem(token) for token in tokens]

    @staticmethod
    def process_text(text):
        text = DataPreProcessing.remove_urls(text)
        text = DataPreProcessing.remove_non_english_chars(text)
        text = DataPreProcessing.remove_time(text)
        text = DataPreProcessing.toLowercase(text)
        tokens = DataPreProcessing.tokenize_text(text)
        tokens = DataPreProcessing.remove_punctuation(tokens)
        tokens = DataPreProcessing.replace_abbreviation(tokens)
        tokens = DataPreProcessing.remove_stopwords(tokens)
        tokens = DataPreProcessing.stem_tokens(tokens)
        return ' '.join(tokens)

    @staticmethod
    def process_text_embedding(text):
        text = DataPreProcessing.remove_urls(text)
        text = DataPreProcessing.remove_non_english_chars(text)
        text = DataPreProcessing.remove_time(text)
        text = DataPreProcessing.toLowercase(text)
        tokens = DataPreProcessing.tokenize_text(text)
        tokens = DataPreProcessing.remove_punctuation(tokens)
        tokens = DataPreProcessing.replace_abbreviation(tokens)
        tokens = DataPreProcessing.remove_stopwords(tokens)
        tokens = DataPreProcessing.stem_tokens(tokens)
        return tokens

# Word Embeddings

## Load & Train Model | Make Inverted *Index*

In [8]:
# Download Word2Vec model
word2vec_model = api.load('word2vec-google-news-300')

# Save Model
# model_path = 'word2vec-google-news-300.model'
# word2vec_model.save(model_path)

# Load the model from disk
# model_path = 'word2vec-google-news-300.model'
# from gensim.models import KeyedVectors
# word2vec_model = KeyedVectors.load(model_path)

In [9]:
docs['tokens'] = docs['text'].apply(DataPreProcessing.process_text_embedding)
queries['tokens'] = queries['text'].apply(DataPreProcessing.process_text_embedding)

In [None]:
docs = pd.read_csv("docs.csv")
docs.iloc[0]

In [11]:
import ast
# Convert string representations of lists back to lists
docs['tokens'] = docs['tokens'].apply(ast.literal_eval)
# queries['tokens'] = queries['tokens'].apply(ast.literal_eval)

# Ensure tokens are lists
assert all(isinstance(tokens, list) for tokens in docs['tokens'])
assert all(isinstance(tokens, list) for tokens in queries['tokens'])

In [12]:
# Train Word2Vec model
word_embedding_model = Word2Vec(
    sentences=docs['tokens'].tolist() + queries['tokens'].tolist(),
    vector_size=100,
    window=5,
    min_count=1,
    workers=4
)

In [13]:
# Function to convert document to vector
def document_to_vector(tokens, model):
    word_vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

In [14]:
# Allocate space for document and query vectors
docs_matrix = np.zeros((len(docs), word_embedding_model.vector_size))
queries_matrix = np.zeros((len(queries), word_embedding_model.vector_size))

# Convert documents to vectors
for idx, tokens in enumerate(docs['tokens']):
    docs_matrix[idx] = document_to_vector(tokens, word_embedding_model)

# Convert queries to vectors
for idx, tokens in enumerate(queries['tokens']):
    queries_matrix[idx] = document_to_vector(tokens, word_embedding_model)

## Matching & Ranking | Word2Vec

### Consine Similarity | Word Embeddings

In [15]:
# Compute cosine similarity
similarity_embedding = cosine_similarity(queries_matrix, docs_matrix)
df_similarity = pd.DataFrame(similarity_embedding)

In [16]:
# create inverted index
inverted_index = {docs['doc_id'].iloc[idx]: vector for idx, vector in enumerate(docs_matrix)}


In [17]:
# Print some values in the inverted index
for i, (doc_id, vector) in enumerate(inverted_index.items()):
    if i < 5:  # Limit to first 5 entries for brevity
        print(f"doc_id: {doc_id}")
        print(f"vector: {vector}\n")
    else:
        break

doc_id: 0
vector: [-0.26137078 -0.69748771 -0.68483663  0.61565238  0.36146051 -0.42348596
  1.56465709  0.3188625   0.72279418  0.52409828 -0.70906574 -0.9220174
  0.81480372 -0.72847378 -0.24936849 -0.10292105  0.57334906 -0.873362
 -0.11321107  0.32468319  1.00031483  0.64244837 -0.60596204 -1.14809585
 -1.25782192  0.20092385  0.29802105  0.075483    0.52591258 -0.26147965
  0.15736106 -0.00952683 -0.54743779  0.25573874  0.67376667 -1.2593168
 -0.07236543  0.82514381  0.61321944  0.90469331  1.00444269  0.20772038
  0.35586116 -0.25942275  0.142946    0.2744202   0.32818496 -0.93333113
  0.03023459  0.01003836  0.33311313  0.21136914  0.5043816  -0.05756157
  1.28194487 -0.23078762 -0.52550155  1.19302917  1.23045754 -0.23840393
 -0.77722639  0.65894085  0.36481604  0.59284532 -1.1120162  -0.30684897
 -0.36647782 -0.0043644   0.36001    -0.54315263 -0.33327273  1.03710115
  1.0733279   1.60768199  0.04802155 -1.73614204 -0.52946776  0.06494557
 -0.42969003  0.21345544 -0.3029505  

### Calculate Top 10 Similarity

In [18]:
# Function to get the top 10 similarities for each query
def get_top_n_similarities(siml_df, n=10):
    top_n_df = pd.DataFrame()
    for query_index in siml_df.index:
        top_n_similarities = siml_df.loc[query_index].nlargest(n)
        top_n_df_query = pd.DataFrame({
            'query_id': query_index,
            'doc_id': top_n_similarities.index,
            'similarity': top_n_similarities.values
        })
        top_n_df = pd.concat([top_n_df, top_n_df_query], ignore_index=True)
    return top_n_df

top_10_similarities = get_top_n_similarities(df_similarity, n=10)

In [19]:
top_10_similarities.iloc[:30]

Unnamed: 0,query_id,doc_id,similarity
0,0,123523,0.921746
1,0,129501,0.921688
2,0,147234,0.914587
3,0,135460,0.908288
4,0,111474,0.90781
5,0,111246,0.90765
6,0,136202,0.893896
7,0,147805,0.892568
8,0,198553,0.891644
9,0,183494,0.891386


In [20]:
qrels.iloc[0:10]

Unnamed: 0,query_id,doc_id,relevance,iteration
0,0,130975,1,0
1,0,130977,1,0
2,0,130982,1,0
3,0,130991,1,0
4,0,130997,1,0
5,0,130998,1,0
6,0,131019,1,0
7,0,131021,1,0
8,0,131026,1,0
9,0,135480,1,0


In [21]:
# Function to get top N similar documents for a given query
def get_top_n_similar_docs(query_id, top_n=10):
    query_tokens = queries[queries['query_id'] == query_id]['tokens'].values[0]
    query_vector = document_to_vector(query_tokens, word_embedding_model)
    similarities = cosine_similarity([query_vector], docs_matrix)[0]
    top_n_indices = similarities.argsort()[-top_n:][::-1]
    return docs.iloc[top_n_indices]

# Example usage
query_id = '0'  # Replace with an actual query_id from your data
top_docs = get_top_n_similar_docs(query_id)
top_docs

Unnamed: 0,doc_id,text,tokens
123523,123523,Sauron knew that Gollum would pursue the ring ...,"[sauron, knew, gollum, pursue, ring, relentles..."
129501,129501,The One Ring was created with the intent to co...,"[one, ring, created, intent, control, everyone..."
147234,147234,Saruman was definitely planning to use the Rin...,"[saruman, definitely, planning, use, ring, dou..."
135460,135460,"Sauron was the one closest to the One Ring, si...","[sauron, one, closest, one, ring, since, put, ..."
111474,111474,I'll reference a few of the same passages as a...,"[reference, passages, another, answer, draw, d..."
111246,111246,Clearly Saruman considered himself equal to Sa...,"[clearly, saruman, considered, equal, sauron, ..."
136202,136202,Letter 246 describes a hypothetical confrontat...,"[letter, 246, describes, hypothetical, confron..."
147805,147805,"As far as we know, Sauron cannot detect the pr...","[far, know, sauron, detect, presence, ring, ev..."
198553,198553,It is likely that Sauron's purpose in creating...,"[likely, sauron, purpose, creating, ruling, ri..."
183494,183494,Because Sauron crafted the One Ring to exert p...,"[sauron, crafted, one, ring, exert, power, rin..."


### Calculate Precision & Recall @ 10

In [22]:
def precision_recall_at_10(relevant_docs, retrieved_docs):
    k = 10
    # Ensure we do not exceed the length of the retrieved_docs list
    retrieved_k = retrieved_docs[:k]
    # Calculate the number of relevant and retrieved documents
    relevant_and_retrieved = len(set(retrieved_k) & set(relevant_docs))
    # Precision: proportion of retrieved documents that are relevant
    precision = relevant_and_retrieved / k
    # Recall: proportion of relevant documents that are retrieved
    recall = relevant_and_retrieved / len(relevant_docs)
    return precision, recall

In [23]:
# Ensure the data types of query_id columns are consistent
top_10_similarities['query_id'] = top_10_similarities['query_id'].astype(int)
qrels['query_id'] = qrels['query_id'].astype(int)
# Ensure the data types of doc_id columns are consistent
top_10_similarities['doc_id'] = top_10_similarities['doc_id'].astype(int)
qrels['doc_id'] = qrels['doc_id'].astype(int)

In [24]:
query_ids = top_10_similarities['query_id'].unique()

# Iterate over each query_id and calculate precision and recall at 10
for query_id in query_ids:
    # Get the top 10 retrieved docs for this query
    retrieved_docs = top_10_similarities[top_10_similarities['query_id'] == query_id]['doc_id'].tolist()

    # Get the relevant docs for this query
    relevant_docs = qrels[(qrels['query_id'] == query_id) & (qrels['relevance'] == 1)]['doc_id'].tolist()

    # Debugging: Print the retrieved_docs and relevant_docs lists to ensure they are correct
    if(query_id < 20):
      print(f"Query ID: {query_id}")
      print(f"Retrieved Docs: {retrieved_docs}")
      print(f"Relevant Docs: {relevant_docs}")

    # Check if relevant_docs is empty
    if not relevant_docs:
        print(f"No relevant documents found for query_id {query_id}")
        continue

    # Calculate precision and recall at 10
    precision, recall = precision_recall_at_10(relevant_docs, retrieved_docs)

    # Print the results
    if(query_id < 20):
      print(f"Precision at 10: {precision}")
      print(f"Recall at 10: {recall}")
      print("--------------------------------------------------------------------------")

Query ID: 0
Retrieved Docs: [123523, 129501, 147234, 135460, 111474, 111246, 136202, 147805, 198553, 183494]
Relevant Docs: [130975, 130977, 130982, 130991, 130997, 130998, 131019, 131021, 131026, 135480, 139194, 143870, 145486, 182816]
Precision at 10: 0.0
Recall at 10: 0.0
--------------------------------------------------------------------------
Query ID: 1
Retrieved Docs: [118503, 151679, 143715, 6591, 121537, 170482, 168754, 118177, 155881, 12281]
Relevant Docs: [152301, 152331, 152343, 152442, 169727]
Precision at 10: 0.0
Recall at 10: 0.0
--------------------------------------------------------------------------
Query ID: 2
Retrieved Docs: [164706, 140785, 155282, 129872, 109890, 122360, 210636, 115822, 152880, 176593]
Relevant Docs: [109883, 109890, 111423, 111459, 123072, 123190, 123866, 126082, 130382, 136722, 143233, 143357, 147092, 164706, 176593]
Precision at 10: 0.3
Recall at 10: 0.2
--------------------------------------------------------------------------
Query ID: 3
Re

In [25]:
def calculate_map_at_k(top_10_similarities: pd.DataFrame, qrels: pd.DataFrame, k: int = 10) -> float:
    # Ensure correct data types
    top_10_similarities = top_10_similarities.astype({"query_id": int, "doc_id": int, "similarity": float})
    qrels = qrels.astype({"query_id": int, "doc_id": int, "relevance": int})

    # Parse qrels to create a dictionary of relevant documents for each query
    qrels_dict = {}
    for entry in qrels.itertuples(index=False):
        query_id = entry.query_id
        doc_id = entry.doc_id
        if query_id not in qrels_dict:
            qrels_dict[query_id] = []
        qrels_dict[query_id].append(doc_id)

    # Parse top_similarity to create a list of predicted documents for each query
    predicted_dict = {}
    for entry in top_10_similarities.itertuples(index=False):
        query_id = entry.query_id
        doc_id = entry.doc_id
        if query_id not in predicted_dict:
            predicted_dict[query_id] = []
        predicted_dict[query_id].append(doc_id)

    # Initialize variables
    Q = len(qrels_dict)  # number of queries
    ap = []

    # Calculate AP for each query
    for q in qrels_dict:
        actual = qrels_dict[q]
        predicted = predicted_dict.get(q, [])
        ap_num = 0
        rel_count = 0

        for x in range(min(k, len(predicted))):
            if predicted[x] in actual:
                rel_count += 1
                precision_at_k = rel_count / (x + 1)
                ap_num += precision_at_k

        if len(actual) > 0:
            ap_q = ap_num / len(actual)
            ap.append(ap_q)

    # Calculate MAP
    map_at_k = sum(ap) / Q
    return round(map_at_k, 4)
    # return map_at_k

# Example usage:
k = 10
map_at_k = calculate_map_at_k(top_10_similarities, qrels, k = 10)
print(f"MAP@{k} = {map_at_k}")

MAP@10 = 0.035


## Test Word2Vec

In [39]:
queries['text'].iloc[0]

'Would the One Ring even work for anyone but Sauron?'

In [40]:
queries['tokens'].iloc[0]

['one', 'ring', 'even', 'work', 'anyon', 'sauron']

In [41]:
docs['text'].iloc[130975] #true

'The key quote from Tolkien that answers this is contained in Letter 246, with my added emphasis: It was part of the essential deceit of the Ring to fill minds with imaginations of supreme power. This implies that the Ring is useless to anyone but Sauron, but it tempts you into thinking it\'s a source of power, and in that way it gets you to wear it, and so gains control over you. In order to proceed in an investigation of this, and how to reconcile it with other statements made by Tolkien we must first establish a baseline that we\'re going to work from: It is accepted that the Ring gave some degree of "power" to Isildur, Gollum, Bilbo, Frodo and Sam, in the form of invisibility, longevity and an occasionally enhanced presence. Sauron was the maker of the Ring; he originally made it for his own use, and his own use only; he certainly never intended it to be used by anybody else, and he had no requirement for any of these three (as he already possessed them due to his nature as a Maia)

In [43]:
text = DataPreProcessing.process_text(docs['text'].iloc[130975])
text

'key quot tolkien answer contain letter 246 ad emphasi part essenti deceit ring fill mind imagin suprem power impli ring useless anyon sauron tempt think sourc power way get wear gain control order proceed investig reconcil statement made tolkien must first establish baselin work accept ring gave degre power isildur gollum bilbo frodo sam form invis longev occasion enhanc presenc sauron maker ring origin made use use certainli never intend use anybodi els requir three alreadi possess due natur maia therefor three power intrins properti ring intend maker rather side effect use mortal power actual quit minor compar possess maia differ kind power minor comparison power one kind avail mortal ring bear power rais armi control other defeat sauron rule middle earth complet differ kind altogeth let look tolkien say ring quot sourc shamshiel answer even wear power exist rapport diminish unless seiz becam possess happen new possessor suffici strong heroic natur challeng sauron becom master learn

In [44]:
docs['text'].iloc[123523] # False

'Sauron knew that Gollum would pursue the ring relentlessly, and that this would effectively function as a free service for Sauron. Once Gollum had the Ring it would only be a matter of time until Sauron got it off him as he did not have the strength to bend the ring to his will. Here Sauron was just playing the long game. Also, anyone attempting to use the ring to oppose Sauron would also have the loose cannon of Gollum to contend with which would, hopefully, make their job more difficult. Although this does kind of backfire, but only because of special items gifted by the elves otherwise Frodo would have been eaten by a spider.'

In [45]:
text = DataPreProcessing.process_text(docs['text'].iloc[123523])
text

'sauron knew gollum pursu ring relentlessli effect function free servic sauron gollum ring matter tears in my ey sauron got strength bend ring sauron play long game also anyon attempt use ring oppos sauron also loos cannon gollum contend hope make job difficult although kind backfir special item gift elv otherwis frodo eaten spider'

In [46]:
queries['tokens'].iloc[0]

['one', 'ring', 'even', 'work', 'anyon', 'sauron']