Inverted Index:

a. Tokenisation:

In [42]:
import re

# Declare all_tokens as a global variable
all_tokens = []  # List to hold all tokens from all queries

# Function to read queries from the .txt file
def read_queries_from_file(file_path):
    """Parse a plain-text query file into a list of query dictionaries.

    Lines are stripped and blank lines skipped. A line starting with
    ``query_id:`` opens a new record; ``title:`` and ``description:`` lines
    attach their value to the record currently being built. Any other line
    is ignored.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[dict]: One dict per query, in file order, holding whichever of
        the keys ``query_id``, ``title`` and ``description`` were present.
    """
    queries = []
    query = {}

    # 'utf-8-sig' reads plain UTF-8 unchanged and additionally drops a leading
    # byte-order mark. With plain 'utf-8' the BOM stays glued to the first
    # line, the first "query_id:" prefix does not match, and the first query
    # ends up without an id.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # blank separator line

            if line.startswith('query_id:'):
                if query:  # flush the record that was in progress
                    queries.append(query)
                query = {'query_id': line.split(':', 1)[1].strip()}
            elif line.startswith('title:'):
                query['title'] = line.split(':', 1)[1].strip()
            elif line.startswith('description:'):
                # split once only, so colons inside the value are preserved
                query['description'] = line.split(':', 1)[1].strip()

        if query:  # the last record has no following "query_id:" to flush it
            queries.append(query)

    return queries

# Function to tokenize text
def tokenize(text):
    """Lower-case ``text`` and return its word tokens in order of appearance.

    A token is a run of regex word characters (letters, digits, underscore);
    everything else, including apostrophes and hyphens, acts as a separator.
    """
    word_pattern = r'\b\w+\b'
    return re.findall(word_pattern, text.lower())

# Main function for tokenization
def main():
    """Tokenize every query in 'queries.txt', print the tokens per query,
    and append them to the module-level ``all_tokens`` list.
    """
    global all_tokens  # the list is read again by later cells
    parsed = read_queries_from_file('queries.txt')

    print("\nTOKENIZATION:\n")
    for record in parsed:
        # 'C001' is the fallback label when a record carries no query_id
        query_id = record.get('query_id', 'C001')
        combined = record.get('title', '') + " " + record.get('description', '')
        tokens = tokenize(combined)

        print(f"Tokens for {query_id}: {tokens}")
        all_tokens.extend(tokens)

    print(f"\nAll Tokens from All Queries: {all_tokens}")

if __name__ == "__main__":
    main()



TOKENIZATION:

Tokens for C001: ['qaabdhismeedka', 'berlin', 'hel', 'dokumintiyada', 'qaabdhismeedka', 'berlin']
Tokens for C002: ['aragtida', 'korontada', 'liidata', 'hel', 'dokumintiyada', 'soo', 'wargeliyaa', 'ogaanshaha', 'hadeer', 'ee', 'goobta', 'subnuclear', 'physics', 'ee', 'xaqiijiyaa', 'aragtida', 'korontada', 'liidata', 'ee', 'la', 'mideeyay', 'ee', 'weinberg', 'salam', 'glashow']
Tokens for C003: ['daroogada', 'holland', 'waa', 'maxay', 'xeerka', 'daroogada', 'ee', 'netherlands']
Tokens for C004: ['fatahaadaha', 'yurub', 'hel', 'dokumintiyada', 'bixiyaa', 'qiyaasaha', 'qarashaadka', 'dhaqaalaha', 'ee', 'waxyeelada', 'beeraha', 'ee', 'ay', 'sababaan', 'fatahaada', 'yurub']
Tokens for C005: ['xubinnimada', 'ururka', 'yurubiyaanka', 'aqoonso', 'dabeecadaha', 'wadamada', 'aan', 'xubinta', 'aheyn', 'ee', 'dhinaca', 'ku', 'biirista', 'bulshada', 'yurubta', 'ama', 'ururka', 'yurubka']
Tokens for C006: ['diidayaasha', 'ka', 'go', 'antahay', 'ee', 'faransiiska', 'shaqooyinkee', 'wa

b. Stop word removal:

In [30]:
import re

# Function to read queries from the .txt file
def read_queries_from_file(file_path):
    """Parse a plain-text query file into a list of query dictionaries.

    Lines are stripped and blank lines skipped. A line starting with
    ``query_id:`` opens a new record; ``title:`` and ``description:`` lines
    attach their value to the record currently being built. Any other line
    is ignored.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[dict]: One dict per query, in file order, holding whichever of
        the keys ``query_id``, ``title`` and ``description`` were present.
    """
    queries = []
    query = {}

    # 'utf-8-sig' reads plain UTF-8 unchanged and additionally drops a leading
    # byte-order mark. With plain 'utf-8' the BOM stays glued to the first
    # line, the first "query_id:" prefix does not match, and the first query
    # ends up without an id.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # blank separator line

            if line.startswith('query_id:'):
                if query:  # flush the record that was in progress
                    queries.append(query)
                query = {'query_id': line.split(':', 1)[1].strip()}
            elif line.startswith('title:'):
                query['title'] = line.split(':', 1)[1].strip()
            elif line.startswith('description:'):
                # split once only, so colons inside the value are preserved
                query['description'] = line.split(':', 1)[1].strip()

        if query:  # the last record has no following "query_id:" to flush it
            queries.append(query)

    return queries

# Function to tokenize text
def tokenize(text):
    """Lower-case ``text`` and return its word tokens in order of appearance.

    A token is a run of regex word characters (letters, digits, underscore);
    everything else, including apostrophes and hyphens, acts as a separator.
    """
    word_pattern = r'\b\w+\b'
    return re.findall(word_pattern, text.lower())

# Function to remove stop words
def remove_stop_words(tokens, stop_words):
    """Return the tokens that are not in ``stop_words``.

    Order and duplicates of the surviving tokens are preserved; the input
    sequence is not modified.
    """
    kept = []
    for candidate in tokens:
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return kept

# Main function for stop word removal
def main():
    """Tokenize every query in 'queries.txt', drop stop words, and print the
    remaining tokens per query.
    """
    parsed = read_queries_from_file('queries.txt')

    # NOTE(review): this list is English-only, and the printed output shown for
    # this cell is unchanged by filtering -- confirm the corpus language and
    # extend the list accordingly.
    stop_words = {'the', 'is', 'at', 'which', 'on', 'and', 'to', 'in', 'a'}

    print("\nSTOP WORD REMOVAL:\n")
    for record in parsed:
        # 'C001' is the fallback label when a record carries no query_id
        query_id = record.get('query_id', 'C001')
        combined = record.get('title', '') + " " + record.get('description', '')
        filtered_tokens = remove_stop_words(tokenize(combined), stop_words)

        print(f"Tokens for {query_id} after stop word removal: {filtered_tokens}")

if __name__ == "__main__":
    main()



STOP WORD REMOVAL:

Tokens for C001 after stop word removal: ['qaabdhismeedka', 'berlin', 'hel', 'dokumintiyada', 'qaabdhismeedka', 'berlin']
Tokens for C002 after stop word removal: ['aragtida', 'korontada', 'liidata', 'hel', 'dokumintiyada', 'soo', 'wargeliyaa', 'ogaanshaha', 'hadeer', 'ee', 'goobta', 'subnuclear', 'physics', 'ee', 'xaqiijiyaa', 'aragtida', 'korontada', 'liidata', 'ee', 'la', 'mideeyay', 'ee', 'weinberg', 'salam', 'glashow']
Tokens for C003 after stop word removal: ['daroogada', 'holland', 'waa', 'maxay', 'xeerka', 'daroogada', 'ee', 'netherlands']
Tokens for C004 after stop word removal: ['fatahaadaha', 'yurub', 'hel', 'dokumintiyada', 'bixiyaa', 'qiyaasaha', 'qarashaadka', 'dhaqaalaha', 'ee', 'waxyeelada', 'beeraha', 'ee', 'ay', 'sababaan', 'fatahaada', 'yurub']
Tokens for C005 after stop word removal: ['xubinnimada', 'ururka', 'yurubiyaanka', 'aqoonso', 'dabeecadaha', 'wadamada', 'aan', 'xubinta', 'aheyn', 'ee', 'dhinaca', 'ku', 'biirista', 'bulshada', 'yurubta',

c. Inverted Index Construction:

In [48]:
import re
from collections import defaultdict

# Declare inverted_index as a global variable
inverted_index = defaultdict(set)  # Dictionary to hold terms and corresponding query_ids

# Function to read queries from the .txt file
def read_queries_from_file(file_path):
    """Parse a plain-text query file into a list of query dictionaries.

    Lines are stripped and blank lines skipped. A line starting with
    ``query_id:`` opens a new record; ``title:`` and ``description:`` lines
    attach their value to the record currently being built. Any other line
    is ignored.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[dict]: One dict per query, in file order, holding whichever of
        the keys ``query_id``, ``title`` and ``description`` were present.
    """
    queries = []
    query = {}

    # 'utf-8-sig' reads plain UTF-8 unchanged and additionally drops a leading
    # byte-order mark. With plain 'utf-8' the BOM stays glued to the first
    # line, the first "query_id:" prefix does not match, and the first query
    # ends up without an id.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # blank separator line

            if line.startswith('query_id:'):
                if query:  # flush the record that was in progress
                    queries.append(query)
                query = {'query_id': line.split(':', 1)[1].strip()}
            elif line.startswith('title:'):
                query['title'] = line.split(':', 1)[1].strip()
            elif line.startswith('description:'):
                # split once only, so colons inside the value are preserved
                query['description'] = line.split(':', 1)[1].strip()

        if query:  # the last record has no following "query_id:" to flush it
            queries.append(query)

    return queries

# Function to tokenize text
def tokenize(text):
    """Lower-case ``text`` and return its word tokens in order of appearance.

    A token is a run of regex word characters (letters, digits, underscore);
    everything else, including apostrophes and hyphens, acts as a separator.
    """
    word_pattern = r'\b\w+\b'
    return re.findall(word_pattern, text.lower())

# Function to build inverted index
def build_inverted_index(queries):
    """Record, for every token of every query, which query ids contain it.

    Postings are added to the module-level ``inverted_index`` (term -> set of
    query ids); nothing is returned.

    Args:
        queries: Iterable of dicts with optional ``query_id``, ``title`` and
            ``description`` keys.
    """
    global inverted_index  # module-level index shared with other cells
    fallback_id = 'C001'  # used when a record carries no query_id

    for record in queries:
        owner = record.get('query_id', fallback_id)
        combined = record.get('title', '') + " " + record.get('description', '')

        for word in tokenize(combined):
            if not word:  # skip empty/None tokens
                continue
            inverted_index[word].add(owner)

# Function to print inverted index
def print_inverted_index(inverted_index):
    """Print one line per term: the quoted term followed by its sorted ids.

    Terms are printed in the mapping's own iteration order.
    """
    for term in inverted_index:
        query_ids = inverted_index[term]
        print(f"'{term}': {sorted(query_ids)}")

# Main function
def main():
    """Read 'queries.txt', fill the module-level inverted index from it, and
    print the index.
    """
    parsed = read_queries_from_file('queries.txt')

    # Tokenization happens inside build_inverted_index.
    print("\nINVERTED INDEX CONSTRUCTION:\n")
    build_inverted_index(parsed)
    print_inverted_index(inverted_index)

if __name__ == "__main__":
    main()



INVERTED INDEX CONSTRUCTION:

'qaabdhismeedka': ['C001']
'berlin': ['C001', 'C142']
'hel': ['C001', 'C002', 'C004', 'C007', 'C008', 'C011', 'C012', 'C016', 'C018', 'C019', 'C021', 'C031', 'C032', 'C033', 'C034', 'C037', 'C039', 'C041', 'C042', 'C043', 'C045', 'C050', 'C051', 'C052', 'C054', 'C055', 'C056', 'C057', 'C059', 'C060', 'C061', 'C062', 'C063', 'C064', 'C065', 'C066', 'C067', 'C068', 'C070', 'C071', 'C074', 'C075', 'C079', 'C081', 'C082', 'C083', 'C085', 'C086', 'C087', 'C088', 'C089', 'C093', 'C094', 'C095', 'C097', 'C099', 'C100', 'C102', 'C103', 'C105', 'C106', 'C111', 'C113', 'C114', 'C116', 'C119', 'C122', 'C123', 'C124', 'C126', 'C128', 'C131', 'C132', 'C133', 'C134', 'C137', 'C138', 'C139', 'C141', 'C142', 'C145', 'C147', 'C149', 'C150', 'C152', 'C153', 'C154', 'C158', 'C161', 'C162', 'C163', 'C167', 'C169', 'C170', 'C173', 'C174', 'C175', 'C176', 'C178', 'C181', 'C182', 'C184', 'C187', 'C188', 'C190', 'C193', 'C194', 'C195', 'C196', 'C197', 'C198', 'C199', 'C200']
'do

Positional Index:

In [49]:
import re
from collections import defaultdict

# Function to read queries from the .txt file
def read_queries_from_file(file_path):
    """Parse a plain-text query file into a list of query dictionaries.

    Lines are stripped and blank lines skipped. A line starting with
    ``query_id:`` opens a new record; ``title:`` and ``description:`` lines
    attach their value to the record currently being built. Any other line
    is ignored.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[dict]: One dict per query, in file order, holding whichever of
        the keys ``query_id``, ``title`` and ``description`` were present.
    """
    queries = []
    query = {}

    # 'utf-8-sig' reads plain UTF-8 unchanged and additionally drops a leading
    # byte-order mark. With plain 'utf-8' the BOM stays glued to the first
    # line, the first "query_id:" prefix does not match, and the first query
    # ends up without an id.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # blank separator line

            if line.startswith('query_id:'):
                if query:  # flush the record that was in progress
                    queries.append(query)
                query = {'query_id': line.split(':', 1)[1].strip()}
            elif line.startswith('title:'):
                query['title'] = line.split(':', 1)[1].strip()
            elif line.startswith('description:'):
                # split once only, so colons inside the value are preserved
                query['description'] = line.split(':', 1)[1].strip()

        if query:  # the last record has no following "query_id:" to flush it
            queries.append(query)

    return queries

# Function to tokenize text
def tokenize(text):
    """Lower-case ``text`` and return its word tokens in order of appearance.

    A token is a run of regex word characters (letters, digits, underscore);
    everything else, including apostrophes and hyphens, acts as a separator.
    """
    word_pattern = r'\b\w+\b'
    return re.findall(word_pattern, text.lower())

# Function to build inverted index with positional indexing
def build_inverted_index(queries):
    """Build a positional index: term -> query id -> list of token positions.

    Positions are 1-based and count tokens of the title followed by the
    description of each query.

    Args:
        queries: Iterable of dicts with optional ``query_id``, ``title`` and
            ``description`` keys.

    Returns:
        A nested ``defaultdict`` mapping each term to a mapping of query id to
        the positions at which the term occurs.
    """
    positional = defaultdict(lambda: defaultdict(list))

    for record in queries:
        owner = record.get('query_id', 'C001')  # fallback when the id is missing
        combined = record.get('title', '') + " " + record.get('description', '')

        # start=1 yields the 1-based position directly
        for place, word in enumerate(tokenize(combined), start=1):
            if word:  # skip empty tokens
                positional[word][owner].append(place)

    return positional

# Function to print inverted index
def print_inverted_index(inverted_index):
    """Print each term with its per-query positions, one blank line after each.

    Each line reads ``'term': id: p1, p2; id: p3`` -- query ids are separated
    by ``; `` and positions within a query by ``, ``.
    """
    for term, query_positions in inverted_index.items():
        positions_str = [
            f"{query_id}: {', '.join(map(str, positions))}"
            for query_id, positions in query_positions.items()
        ]

        # the trailing "\n" plus print's own newline leaves an empty line
        print(f"'{term}': {'; '.join(positions_str)}\n")

# Main function
def main():
    """Read 'queries.txt', build the positional index, and print it."""
    parsed = read_queries_from_file('queries.txt')

    # Tokenization happens inside build_inverted_index.
    print("\nPOSITIONAL INDEX CONSTRUCTION:\n")
    print_inverted_index(build_inverted_index(parsed))

if __name__ == "__main__":
    main()



POSITIONAL INDEX CONSTRUCTION:

'qaabdhismeedka': C001: 1, 5

'berlin': C001: 2, 6; C142: 12

'hel': C001: 3; C002: 4; C004: 3; C007: 6; C008: 4; C011: 6; C012: 3; C016: 3; C018: 4; C019: 3; C021: 4; C031: 5; C032: 3; C033: 3; C034: 5; C037: 3; C039: 5; C041: 6; C042: 7; C043: 5; C045: 5; C050: 3; C051: 5; C052: 6; C054: 5; C055: 5; C056: 8; C057: 4; C059: 3; C060: 4; C061: 4; C062: 4; C063: 3; C064: 4; C065: 4; C066: 8; C067: 4; C068: 6; C070: 5; C071: 5; C074: 5; C075: 4; C079: 4; C081: 4; C082: 5; C083: 5; C085: 5; C086: 5; C087: 6; C088: 6; C089: 4; C093: 2; C094: 4; C095: 3; C097: 5; C099: 3; C100: 4; C102: 4; C103: 4; C105: 3; C106: 4; C111: 4; C113: 3; C114: 5; C116: 4; C119: 5; C122: 5; C123: 4; C124: 9; C126: 7; C128: 6; C131: 4; C132: 3; C133: 5; C134: 4; C137: 5; C138: 7; C139: 6; C141: 5; C142: 6; C145: 5; C147: 5; C149: 6; C150: 7; C152: 3; C153: 5; C154: 6; C158: 6; C161: 3; C162: 6; C163: 4; C167: 4; C169: 4; C170: 7; C173: 4; C174: 6; C175: 5; C176: 7; C178: 4; C181: 4

SOUNDEX ALGORITHM:

In [50]:
# Function to compute Soundex code
def soundex(name):
    """Return the four-character American Soundex code of ``name``.

    The code is the upper-cased first character followed by up to three
    digits, right-padded with ``'0'``.

    Rules applied:
      * Consonants map to digits via ``soundex_mapping``; other characters
        produce no digit.
      * Adjacent letters with the same digit are coded once. This includes a
        letter directly after the first letter that shares the first letter's
        digit (``Pfister`` -> ``P236``).
      * ``H`` and ``W`` are transparent: letters with the same digit on either
        side of them are still coded once (``Ashcraft`` -> ``A261``).
      * Any other unmapped character (vowels, ``Y``, digits, ...) separates
        duplicates, so the digit may repeat after it (``Tymczak`` -> ``T522``).

    Args:
        name: The word to encode.

    Returns:
        str: The Soundex code, or ``''`` when ``name`` is empty.
    """
    soundex_mapping = {
        'B': '1', 'F': '1', 'P': '1', 'V': '1',
        'C': '2', 'G': '2', 'J': '2', 'K': '2', 'Q': '2', 'S': '2', 'X': '2',
        'D': '3', 'T': '3',
        'L': '4',
        'M': '5', 'N': '5',
        'R': '6'
    }

    # An empty string has no first letter to keep.
    if not name:
        return ''

    first_letter = name[0].upper()
    remainder = name.upper()[1:]
    encoded = first_letter

    # Seed with the first letter's own digit so an immediately following
    # letter with the same digit is not emitted a second time.
    previous_code = soundex_mapping.get(first_letter, '')
    for char in remainder:
        if char in ('H', 'W'):
            continue  # transparent: do not break a run of equal digits
        code = soundex_mapping.get(char, '')
        if code and code != previous_code:
            encoded += code
        # An unmapped character stores '' here, which re-enables the digit.
        previous_code = code

    # Pad with zeros / truncate to exactly four characters.
    return encoded.ljust(4, '0')[:4]

# Main function for Soundex
def main():
    """Print the Soundex code of every distinct token in ``all_tokens``.

    ``all_tokens`` is the module-level list filled by the tokenization cell.
    """
    # A dict keeps one entry per distinct token, in first-seen order.
    soundex_codes = {}
    for word in all_tokens:
        soundex_codes[word] = soundex(word)

    print("\nSOUNDEX CODES:\n")
    for token, code in soundex_codes.items():
        print(f"{token}: {code}")

if __name__ == "__main__":
    main()



SOUNDEX CODES:

qaabdhismeedka: Q132
berlin: B645
hel: H400
dokumintiyada: D255
aragtida: A623
korontada: K653
liidata: L330
soo: S000
wargeliyaa: W624
ogaanshaha: O252
hadeer: H360
ee: E000
goobta: G130
subnuclear: S152
physics: P220
xaqiijiyaa: X220
la: L000
mideeyay: M300
weinberg: W516
salam: S450
glashow: G420
daroogada: D623
holland: H453
waa: W000
maxay: M200
xeerka: X620
netherlands: N364
fatahaadaha: F330
yurub: Y610
bixiyaa: B200
qiyaasaha: Q200
qarashaadka: Q623
dhaqaalaha: D240
waxyeelada: W243
beeraha: B600
ay: A000
sababaan: S115
fatahaada: F330
xubinnimada: X155
ururka: U662
yurubiyaanka: Y615
aqoonso: A252
dabeecadaha: D123
wadamada: W353
aan: A500
xubinta: X153
aheyn: A500
dhinaca: D520
ku: K000
biirista: B623
bulshada: B423
yurubta: Y613
ama: A500
yurubka: Y612
diidayaasha: D320
ka: K000
go: G000
antahay: A530
faransiiska: F652
shaqooyinkee: S252
waxaa: W200
siiyay: S000
inta: I530
lagu: L200
jiray: J600
adeega: A320
qarankooda: Q652
isticmaalka: I232
iyo: I000
kubad

MISSPELLED WORD USING EDIT DISTANCE:

In [51]:
import numpy as np

def levenshtein_distance(word1, word2):
    """Return the Levenshtein edit distance between ``word1`` and ``word2``.

    Insertions, deletions and substitutions each cost 1. The result is read
    from a NumPy integer table, so it is a NumPy integer scalar.
    """
    rows, cols = len(word1) + 1, len(word2) + 1
    table = np.zeros((rows, cols), dtype=int)

    # Base cases: turning a prefix into the empty string costs its length.
    table[0, :] = np.arange(cols)
    table[:, 0] = np.arange(rows)

    for r in range(1, rows):
        for c in range(1, cols):
            if word1[r - 1] == word2[c - 1]:
                # matching characters cost nothing extra
                table[r, c] = table[r - 1, c - 1]
            else:
                # cheapest of deletion, insertion, substitution
                table[r, c] = 1 + min(table[r - 1, c], table[r, c - 1], table[r - 1, c - 1])

    return table[rows - 1][cols - 1]

def suggest_correct_word(misspelled_word, index):
    """Return the key of ``index`` closest to ``misspelled_word``.

    Closeness is Levenshtein distance; on a tie the key encountered first
    wins. An empty ``index`` yields ``""``.
    """
    return min(
        index.keys(),
        key=lambda word: levenshtein_distance(misspelled_word, word),
        default="",
    )
# Demo: look up the closest indexed term for a deliberately misspelled word.
# `inverted_index` is not defined in this cell; it is the module-level index
# from the inverted-index cell, which therefore has to have been run first.
suggested_word = suggest_correct_word("aragtide", inverted_index)
print("Corrected word for 'aragtide':", suggested_word)

Corrected word for 'aragtide': aragtida


N-GRAMS:

 BLOCK SORT BASED INDEXING:

In [53]:
import re
from collections import defaultdict

# Function to read queries from the .txt file
def read_queries_from_file(file_path):
    """Parse a plain-text query file into a list of query dictionaries.

    Lines are stripped and blank lines skipped. A line starting with
    ``query_id:`` opens a new record; ``title:`` and ``description:`` lines
    attach their value to the record currently being built. Any other line
    is ignored.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[dict]: One dict per query, in file order, holding whichever of
        the keys ``query_id``, ``title`` and ``description`` were present.
    """
    queries = []
    query = {}

    # 'utf-8-sig' reads plain UTF-8 unchanged and additionally drops a leading
    # byte-order mark. With plain 'utf-8' the BOM stays glued to the first
    # line, the first "query_id:" prefix does not match, and the first query
    # ends up without an id.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # blank separator line

            if line.startswith('query_id:'):
                if query:  # flush the record that was in progress
                    queries.append(query)
                query = {'query_id': line.split(':', 1)[1].strip()}
            elif line.startswith('title:'):
                query['title'] = line.split(':', 1)[1].strip()
            elif line.startswith('description:'):
                # split once only, so colons inside the value are preserved
                query['description'] = line.split(':', 1)[1].strip()

        if query:  # the last record has no following "query_id:" to flush it
            queries.append(query)

    return queries

# Function to tokenize text
def tokenize(text):
    """Lower-case ``text`` and return its word tokens in order of appearance.

    A token is a run of regex word characters (letters, digits, underscore);
    everything else, including apostrophes and hyphens, acts as a separator.
    """
    word_pattern = r'\b\w+\b'
    return re.findall(word_pattern, text.lower())

# Block Sort-based Indexing
def block_sort_based_indexing(queries, block_size):
    """Build a term -> set-of-query-ids index by processing queries in blocks.

    The queries are cut into consecutive slices of ``block_size`` items. Each
    slice gets its own small index, which is then merged (set union) into the
    overall index. No sorting is performed at any stage.

    Args:
        queries: List of dicts with optional ``query_id``, ``title`` and
            ``description`` keys.
        block_size: Number of queries per block; must be at least 1.

    Returns:
        defaultdict(set): Mapping of each term to the ids of the queries that
        contain it.

    Raises:
        ValueError: If ``block_size`` is smaller than 1. Without this check a
            negative size would produce an empty range and silently return an
            empty index.
    """
    if block_size < 1:
        raise ValueError(f"block_size must be a positive integer, got {block_size!r}")

    inverted_index = defaultdict(set)
    default_query_id = 'C001'  # used when a record carries no query_id

    for i in range(0, len(queries), block_size):
        block = queries[i:i + block_size]
        block_inverted_index = defaultdict(set)

        for query in block:
            query_id = query.get('query_id', default_query_id)
            title = query.get('title', '')
            description = query.get('description', '')
            text = title + " " + description
            tokens = tokenize(text)

            for token in tokens:
                if token:  # skip empty/None tokens
                    block_inverted_index[token].add(query_id)

        # Merge this block's postings into the overall index.
        for term, query_ids in block_inverted_index.items():
            inverted_index[term].update(query_ids)

    return inverted_index

# Function to print inverted index
def print_inverted_index(inverted_index):
    """Print one line per term: the quoted term followed by its sorted ids.

    Terms are printed in the mapping's own iteration order.
    """
    for term in inverted_index:
        query_ids = inverted_index[term]
        print(f"'{term}': {sorted(query_ids)}")

# Main function for Block Sort-based Indexing
def main_block_sort():
    """Read 'queries.txt', index it in blocks of two queries, and print the
    resulting index.
    """
    parsed = read_queries_from_file('queries.txt')

    block_size = 2  # queries per block; adjust as needed
    print("\nBLOCK SORT-BASED INDEXING:\n")
    print_inverted_index(block_sort_based_indexing(parsed, block_size))

if __name__ == "__main__":
    main_block_sort()



BLOCK SORT-BASED INDEXING:

'qaabdhismeedka': ['C001']
'berlin': ['C001', 'C142']
'hel': ['C001', 'C002', 'C004', 'C007', 'C008', 'C011', 'C012', 'C016', 'C018', 'C019', 'C021', 'C031', 'C032', 'C033', 'C034', 'C037', 'C039', 'C041', 'C042', 'C043', 'C045', 'C050', 'C051', 'C052', 'C054', 'C055', 'C056', 'C057', 'C059', 'C060', 'C061', 'C062', 'C063', 'C064', 'C065', 'C066', 'C067', 'C068', 'C070', 'C071', 'C074', 'C075', 'C079', 'C081', 'C082', 'C083', 'C085', 'C086', 'C087', 'C088', 'C089', 'C093', 'C094', 'C095', 'C097', 'C099', 'C100', 'C102', 'C103', 'C105', 'C106', 'C111', 'C113', 'C114', 'C116', 'C119', 'C122', 'C123', 'C124', 'C126', 'C128', 'C131', 'C132', 'C133', 'C134', 'C137', 'C138', 'C139', 'C141', 'C142', 'C145', 'C147', 'C149', 'C150', 'C152', 'C153', 'C154', 'C158', 'C161', 'C162', 'C163', 'C167', 'C169', 'C170', 'C173', 'C174', 'C175', 'C176', 'C178', 'C181', 'C182', 'C184', 'C187', 'C188', 'C190', 'C193', 'C194', 'C195', 'C196', 'C197', 'C198', 'C199', 'C200']
'doku

SINGLE PASS IN MEMORY INDEXING:

In [54]:
# Single Pass In-Memory Indexing
def single_pass_in_memory_indexing(queries):
    """Build a term -> set-of-query-ids index in one pass over ``queries``.

    Args:
        queries: Iterable of dicts with optional ``query_id``, ``title`` and
            ``description`` keys.

    Returns:
        defaultdict(set): Mapping of each term to the ids of the queries that
        contain it.
    """
    postings = defaultdict(set)
    fallback_id = 'C001'  # used when a record carries no query_id

    for record in queries:
        owner = record.get('query_id', fallback_id)
        combined = record.get('title', '') + " " + record.get('description', '')

        for word in tokenize(combined):
            if not word:  # skip empty/None tokens
                continue
            postings[word].add(owner)

    return postings

# Main function for Single Pass In-Memory Indexing
def main_single_pass():
    """Read 'queries.txt', index it in a single pass, and print the index."""
    parsed = read_queries_from_file('queries.txt')

    print("\nSINGLE PASS IN-MEMORY INDEXING:\n")
    print_inverted_index(single_pass_in_memory_indexing(parsed))

if __name__ == "__main__":
    main_single_pass()



SINGLE PASS IN-MEMORY INDEXING:

'qaabdhismeedka': ['C001']
'berlin': ['C001', 'C142']
'hel': ['C001', 'C002', 'C004', 'C007', 'C008', 'C011', 'C012', 'C016', 'C018', 'C019', 'C021', 'C031', 'C032', 'C033', 'C034', 'C037', 'C039', 'C041', 'C042', 'C043', 'C045', 'C050', 'C051', 'C052', 'C054', 'C055', 'C056', 'C057', 'C059', 'C060', 'C061', 'C062', 'C063', 'C064', 'C065', 'C066', 'C067', 'C068', 'C070', 'C071', 'C074', 'C075', 'C079', 'C081', 'C082', 'C083', 'C085', 'C086', 'C087', 'C088', 'C089', 'C093', 'C094', 'C095', 'C097', 'C099', 'C100', 'C102', 'C103', 'C105', 'C106', 'C111', 'C113', 'C114', 'C116', 'C119', 'C122', 'C123', 'C124', 'C126', 'C128', 'C131', 'C132', 'C133', 'C134', 'C137', 'C138', 'C139', 'C141', 'C142', 'C145', 'C147', 'C149', 'C150', 'C152', 'C153', 'C154', 'C158', 'C161', 'C162', 'C163', 'C167', 'C169', 'C170', 'C173', 'C174', 'C175', 'C176', 'C178', 'C181', 'C182', 'C184', 'C187', 'C188', 'C190', 'C193', 'C194', 'C195', 'C196', 'C197', 'C198', 'C199', 'C200']


COMPARISON BETWEEN BSBI AND SPIMI:

In [55]:
import time

# Compare both indexing methods
def compare_indexing_methods():
    """Time both indexing functions on 'queries.txt' and print the durations.

    Each method is run once, so the printed figures are single measurements.
    Durations use ``time.perf_counter()``, the clock intended for measuring
    short intervals; ``time.time()`` is wall-clock time and can jump when the
    system clock is adjusted.
    """
    file_path = 'queries.txt'  # Update this with the correct file path
    queries = read_queries_from_file(file_path)

    # The results are bound to names so they stay alive until after the clock
    # is read, keeping their deallocation out of the measured interval.
    start_time = time.perf_counter()
    block_index = block_sort_based_indexing(queries, block_size=2)
    block_duration = time.perf_counter() - start_time
    print(f"Block Sort-based Indexing Duration: {block_duration:.6f} seconds")

    start_time = time.perf_counter()
    single_pass_index = single_pass_in_memory_indexing(queries)
    single_pass_duration = time.perf_counter() - start_time
    print(f"Single Pass In-Memory Indexing Duration: {single_pass_duration:.6f} seconds")

# Run comparison
if __name__ == "__main__":
    compare_indexing_methods()


Block Sort-based Indexing Duration: 0.004287 seconds
Single Pass In-Memory Indexing Duration: 0.001994 seconds


MAP:

In [61]:
import re

mapped_results = []
# Function to read queries from the .txt file
def read_queries_from_file(file_path):
    """Parse a plain-text query file into a list of query dictionaries.

    Lines are stripped and blank lines skipped. A line starting with
    ``query_id:`` opens a new record; ``title:`` and ``description:`` lines
    attach their value to the record currently being built. Any other line
    is ignored.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[dict]: One dict per query, in file order, holding whichever of
        the keys ``query_id``, ``title`` and ``description`` were present.
    """
    queries = []
    query = {}

    # 'utf-8-sig' reads plain UTF-8 unchanged and additionally drops a leading
    # byte-order mark. With plain 'utf-8' the BOM stays glued to the first
    # line, the first "query_id:" prefix does not match, and the first query
    # ends up without an id.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # blank separator line

            if line.startswith('query_id:'):
                if query:  # flush the record that was in progress
                    queries.append(query)
                query = {'query_id': line.split(':', 1)[1].strip()}
            elif line.startswith('title:'):
                query['title'] = line.split(':', 1)[1].strip()
            elif line.startswith('description:'):
                # split once only, so colons inside the value are preserved
                query['description'] = line.split(':', 1)[1].strip()

        if query:  # the last record has no following "query_id:" to flush it
            queries.append(query)

    return queries

# Function to tokenize text
def tokenize(text):
    """Lower-case ``text`` and return its word tokens in order of appearance.

    A token is a run of regex word characters (letters, digits, underscore);
    everything else, including apostrophes and hyphens, acts as a separator.
    """
    word_pattern = r'\b\w+\b'
    return re.findall(word_pattern, text.lower())

# Map Function
def map_function(queries):
    """Emit one ``(term, query_id)`` pair per token occurrence (map step).

    The pairs are stored in the module-level ``mapped_results`` list, which is
    also returned, so the reduce cell can read them. The list is emptied
    first: without that, every call after the first would append a second
    copy of all pairs and inflate the per-query counts computed from them.

    Args:
        queries: Iterable of dicts with optional ``query_id``, ``title`` and
            ``description`` keys.

    Returns:
        list[tuple[str, str]]: The module-level ``mapped_results`` list, in
        query order and token order.
    """
    # Reset in place so existing references to the list see the fresh result.
    mapped_results.clear()

    for query in queries:
        query_id = query.get('query_id', 'C001')  # fallback when the id is missing
        title = query.get('title', '')
        description = query.get('description', '')
        text = title + " " + description
        tokens = tokenize(text)

        for token in tokens:
            if token:  # skip empty tokens
                mapped_results.append((token, query_id))

    return mapped_results

# Main function to demonstrate mapping
def main_map():
    """Read 'queries.txt', run the map step, and print every emitted pair."""
    parsed = read_queries_from_file('queries.txt')

    pairs = map_function(parsed)
    print("Mapped Results:")
    for pair in pairs:
        print(pair)

if __name__ == "__main__":
    main_map()


Mapped Results:
('qaabdhismeedka', 'C001')
('berlin', 'C001')
('hel', 'C001')
('dokumintiyada', 'C001')
('qaabdhismeedka', 'C001')
('berlin', 'C001')
('aragtida', 'C002')
('korontada', 'C002')
('liidata', 'C002')
('hel', 'C002')
('dokumintiyada', 'C002')
('soo', 'C002')
('wargeliyaa', 'C002')
('ogaanshaha', 'C002')
('hadeer', 'C002')
('ee', 'C002')
('goobta', 'C002')
('subnuclear', 'C002')
('physics', 'C002')
('ee', 'C002')
('xaqiijiyaa', 'C002')
('aragtida', 'C002')
('korontada', 'C002')
('liidata', 'C002')
('ee', 'C002')
('la', 'C002')
('mideeyay', 'C002')
('ee', 'C002')
('weinberg', 'C002')
('salam', 'C002')
('glashow', 'C002')
('daroogada', 'C003')
('holland', 'C003')
('waa', 'C003')
('maxay', 'C003')
('xeerka', 'C003')
('daroogada', 'C003')
('ee', 'C003')
('netherlands', 'C003')
('fatahaadaha', 'C004')
('yurub', 'C004')
('hel', 'C004')
('dokumintiyada', 'C004')
('bixiyaa', 'C004')
('qiyaasaha', 'C004')
('qarashaadka', 'C004')
('dhaqaalaha', 'C004')
('ee', 'C004')
('waxyeelada', 'C

REDUCE:

In [62]:
from collections import defaultdict

# Reduce Function
def reduce_function(mapped_results):
    """Aggregate ``(term, query_id)`` pairs into per-query counts (reduce step).

    Args:
        mapped_results: Iterable of ``(term, query_id)`` pairs.

    Returns:
        list[tuple[str, str]]: One ``(term, "id:count, id:count")`` entry per
        term. Terms and ids appear in the order they were first seen.
    """
    tallies = {}
    for term, query_id in mapped_results:
        per_query = tallies.setdefault(term, {})
        per_query[query_id] = per_query.get(query_id, 0) + 1

    # Render each term's counts as a single comma-separated string.
    return [
        (term, ', '.join(f"{qid}:{count}" for qid, count in query_counts.items()))
        for term, query_counts in tallies.items()
    ]

# Main function to demonstrate reducing
def main_reduce(mapped_results):
    """Run the reduce step on ``mapped_results`` and print one line per term."""
    summary = reduce_function(mapped_results)
    print("\nReduced Results:")
    for term, count in summary:
        print(f"({term},({count}))")

if __name__ == "__main__":
    # `mapped_results` is the module-level list filled by map_function in the
    # map cell, so that cell has to be run before this one.
    main_reduce(mapped_results)



Reduced Results:
(qaabdhismeedka,(C001:2))
(berlin,(C001:2, C142:1))
(hel,(C001:1, C002:1, C004:1, C007:1, C008:1, C011:1, C012:1, C016:1, C018:1, C019:1, C021:1, C031:1, C032:1, C033:1, C034:1, C037:1, C039:1, C041:1, C042:1, C043:1, C045:1, C050:1, C051:1, C052:1, C054:1, C055:1, C056:1, C057:1, C059:1, C060:1, C061:1, C062:1, C063:1, C064:1, C065:1, C066:1, C067:1, C068:1, C070:1, C071:1, C074:1, C075:1, C079:1, C081:1, C082:1, C083:1, C085:1, C086:1, C087:1, C088:1, C089:1, C093:1, C094:1, C095:1, C097:1, C099:1, C100:1, C102:1, C103:1, C105:1, C106:1, C111:1, C113:1, C114:1, C116:1, C119:1, C122:1, C123:1, C124:1, C126:1, C128:1, C131:1, C132:1, C133:1, C134:1, C137:1, C138:1, C139:1, C141:1, C142:1, C145:1, C147:1, C149:1, C150:1, C152:1, C153:1, C154:1, C158:1, C161:1, C162:1, C163:1, C167:1, C169:1, C170:1, C173:1, C174:1, C175:1, C176:1, C178:1, C181:1, C182:1, C184:1, C187:1, C188:1, C190:1, C193:1, C194:1, C195:1, C196:1, C197:1, C198:1, C199:1, C200:1))
(dokumintiyada,(C00