<a href="https://colab.research.google.com/github/ShivamThapa243/Information-Retrieval/blob/main/positional_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **POSITIONAL INVERTED INDEX**

# Building a Positional Inverted Index

In [19]:
# Mount Google Drive at /content/drive: the preprocessed data set is read from
# there and the generated index files are written back to it by later cells.

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Structure of Positional Inverted Index:**

    {
      token_1: [
          {
            'document_id' : x, 'positions' : [position_1, position_2, position_3]
          },
          {
            'document_id' : y, 'positions' : [position_4, position_5, position_6]
          },
          {
            'document_id' : z, 'positions' : [position_7, position_8, position_9]
          }
      ],
      "caesar": [
          {
            'document_id' : 51, 'positions' : [0, 7, 20]
          },
          {
            'document_id' : 79, 'positions' : [67, 81]
          },
          {
            'document_id' : 103, 'positions' : [4, 12, 51]
          }
      ], ...
    }

**Function to build a Positional Inverted Index**

In [20]:
import os

def positional_inverted_index_builder(dataset_directory):
  """Build a positional inverted index from the ``.txt`` files of a directory.

  Every file is read in full and split on whitespace; the zero-based offset of
  a token within that split is recorded as its position.

  Args:
    dataset_directory: Path of the directory holding the text files. Directory
      entries whose name does not end in ".txt" are ignored.

  Returns:
    A dict mapping each token to a list of postings shaped like
    ``{'document_id': <file name>, 'positions': [<int>, ...]}``. Postings
    appear in the order the files were processed (sorted by file name) and
    each positions list is in ascending order.
  """
  positional_inverted_index = {}

  # os.listdir() returns names in arbitrary order; sorting them makes the
  # resulting index reproducible from one run to the next.
  for filename in sorted(os.listdir(dataset_directory)):
    if not filename.endswith(".txt"):
      continue

    file_path = os.path.join(dataset_directory, filename)
    # Explicit encoding so the result does not depend on the platform default
    # (assumes the preprocessed files are UTF-8 -- TODO confirm).
    with open(file_path, 'r', encoding='utf-8') as file:
      tokens = file.read().split()

    # Group the positions of this file by token first. This replaces the
    # linear scan over a token's postings list on every single occurrence.
    positions_by_token = {}
    for position, token in enumerate(tokens):
      positions_by_token.setdefault(token, []).append(position)

    # One posting per (token, file) pair.
    for token, positions in positions_by_token.items():
      positional_inverted_index.setdefault(token, []).append(
          {'document_id': filename, 'positions': positions})

  return positional_inverted_index

**Invoking the positional_inverted_index_builder function to build the positional inverted index**

In [21]:
# where the preprocessed files live and where the results are stored
dataset_directory = "/content/drive/MyDrive/Information Retrieval/preprocessed_data"
directory_path = "/content/drive/MyDrive/Information Retrieval"
text_file_name = "positional_inverted_index.txt"

# build the positional inverted index from the preprocessed files
positional_inverted_index = positional_inverted_index_builder(dataset_directory)

# dump the index in a human readable layout: one paragraph per term,
# one tab-indented line per posting
with open(os.path.join(directory_path, text_file_name), 'w') as file:
    for term, postings in positional_inverted_index.items():
        file.write(f"{term} : \n")
        for entry in postings:
            file.write(f"\tDocument ID :  {entry['document_id']}\tPositions : {entry['positions']}\n")
        file.write("\n")

print("Positional Inverted index created")

Positional Inverted index created


Sorting the positional index:
1.  Positions in ascending order
2.  Document IDs in ascending order of the number contained in the file name


In [24]:
import re

def extract_numeric_part(document_name):
  """Return the first run of decimal digits in ``document_name`` as an int.

  Args:
    document_name: A document identifier such as "file12.txt".

  Returns:
    The integer value of the first digit sequence found (12 for "file12.txt").

  Raises:
    ValueError: If ``document_name`` contains no digit at all.
  """
  match = re.search(r'\d+', document_name)
  if match is None:
    # Without this guard the failure shows up as an opaque
    # "'NoneType' object has no attribute 'group'".
    raise ValueError(f"no numeric part found in document name: {document_name!r}")
  return int(match.group())


# Normalise every postings list of the index in place.
for term in positional_inverted_index:
  postings = positional_inverted_index[term]

  # Positions in ascending order
  for entry in postings:
    entry['positions'].sort()

  # Document IDs in ascending order of the number embedded in the file name
  # (numeric, not lexicographical: file2.txt sorts before file10.txt)
  postings.sort(key=lambda posting: extract_numeric_part(posting['document_id']))

  # renaming the file in the original format
  # NOTE(review): because of the `and`, only ids that neither start with
  # 'file' nor end with '.txt' are rewritten; the builder above only stores
  # names ending in '.txt', so this branch does not fire for its output.
  for entry in postings:
    if not entry['document_id'].startswith('file') and not entry['document_id'].endswith('.txt'):
      # the key is 'document_id' (underscore); the hyphenated spelling used
      # here before raised KeyError whenever this branch was reached
      numeric_part = extract_numeric_part(entry['document_id'])
      entry['document_id'] = f"file{numeric_part}.txt"

Writing the sorted positional inverted index into a text file

In [23]:
sorted_file_name = "sorted_positional_inverted_index.txt"

# Write the sorted index using the same layout as the unsorted dump:
# "<term> : " on its own line, one tab-indented line per posting, then a
# blank line between terms.
with open(os.path.join(directory_path, sorted_file_name), 'w') as file:
  for term in positional_inverted_index:
    # The earlier f"{term :}" treated ':' as an (empty) format spec, so the
    # " : " separator never reached the file.
    file.write(f"{term} : \n")
    for entry in positional_inverted_index[term]:
      file.write(f"\tDocument ID :  {entry['document_id']}")
      file.write(f"\tPositions : {entry['positions']}\n")
    file.write("\n")

print("Sorted Positional Inverted Index created")

Sorted Positional Inverted Index created


# Pickling Positional Inverted Index

In [25]:
import pickle

pickle_file_name = "positional_inverted_index.pkl"

# serialise the in-memory index next to the text dumps so that the query
# cells can reload it without rebuilding
pickle_file_path = os.path.join(directory_path, pickle_file_name)
with open(pickle_file_path, 'wb') as pickle_out:
    pickle.dump(positional_inverted_index, pickle_out)
    print("Positional Inverted Index pickled")

Positional Inverted Index pickled


# Handling Query Operations

Retrieving the documents that match the tokens and their positions

In [None]:
def documents_retrieval_system(phrase_list, inverted_index):
  """Retrieve the postings of the documents that match a token sequence.

  Args:
    phrase_list: Query tokens in the order they appear in the phrase.
    inverted_index: Dict mapping a token to its postings list, each posting
      being ``{'document_id': ..., 'positions': [...]}``.

  Returns:
    ``[]`` when ``phrase_list`` is empty or any token is absent from the
    index. For a single token, a copy of that token's postings list.
    Otherwise the result of folding ``intersection`` over the tokens'
    postings lists from left to right.
  """
  # A list, not a dict: the postings are collected in query order and then
  # addressed by index.
  postings_per_token = []

  for token in phrase_list:
    if token not in inverted_index:
      return []
    # Shallow copy so that the intersection step can never shrink or reorder
    # the lists stored in the index itself.
    postings_per_token.append(list(inverted_index[token]))

  # Nothing to look up (e.g. the query consisted only of stop words).
  if not postings_per_token:
    return []

  p1 = postings_per_token[0]
  for p2 in postings_per_token[1:]:
    p1 = intersection(p1, p2)

  return p1

def intersection(p1, p2):
  """Positional intersection of two postings lists for a phrase query.

  A document is kept when it occurs in both lists and at least one position
  of the second token directly follows (position + 1) a position of the
  first token.

  Args:
    p1: Postings of the phrase matched so far (or of the first token), a list
      of ``{'document_id': ..., 'positions': [...]}`` dicts.
    p2: Postings of the next token of the phrase, same shape.

  Returns:
    A new list of postings in the document order of ``p1``. The positions of
    each posting are those of the second token that extend a match, so the
    result can be passed back in as ``p1`` for the following token. Neither
    input list is modified.
  """
  # Look documents up by id instead of walking both lists in lock step; this
  # needs no assumption about how the postings are ordered and leaves the
  # inputs untouched.
  positions_by_document = {entry['document_id']: entry['positions'] for entry in p2}

  intersection_positional_index = []
  for entry in p1:
    doc_id = entry['document_id']
    if doc_id not in positions_by_document:
      continue

    preceding_positions = set(entry['positions'])
    # Keep the actual positions (not list indices) of the second token that
    # sit immediately after an occurrence of the first one.
    position_list = [
        position for position in positions_by_document[doc_id]
        if position - 1 in preceding_positions
    ]

    # One posting per matching document, and only if something matched.
    if position_list:
      intersection_positional_index.append({'document_id': doc_id, 'positions': position_list})

  return intersection_positional_index



Preprocessing the string into tokens

In [None]:
import nltk
import string

nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def preprocessing_query(query):
    """Convert a raw query string into a list of filtered tokens.

    The query is lower-cased and tokenised with ``word_tokenize``. Tokens for
    which ``token in string.punctuation`` holds are dropped first (this is a
    substring test against the punctuation string), then the English stop
    words from NLTK are removed.

    Args:
        query: The query text as typed by the user.

    Returns:
        The remaining tokens, in their original order.
    """
    lowered_tokens = word_tokenize(query.lower())

    # punctuation filter
    without_punctuation = [
        token for token in lowered_tokens if token not in string.punctuation
    ]

    # stop word filter
    stop_words = set(stopwords.words('english'))
    return [token for token in without_punctuation if token not in stop_words]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Passing query

In [None]:
# loading the pickled positional inverted index
import pickle

pickled_file = "/content/drive/MyDrive/Information Retrieval/positional_inverted_index.pkl"
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# a file that this notebook wrote itself.
with open(pickled_file, 'rb') as file:
  inverted_index = pickle.load(file)
  print("Positional Inverted Index loaded")

print("Input phrase query : ")
query = input()

# preprocessing the query
query_token = preprocessing_query(query)
print("Preprocessed tokens : ")
print(query_token)

# passing the preprocessed tokens to find the documents in which they match
retrieved_documents = documents_retrieval_system(query_token, inverted_index)
print("Retrieved documents : ")
print(retrieved_documents)

Positional Inverted Index loaded
Input phrase query : 


KeyboardInterrupt: Interrupted by user

# Note
Sorting document_id in the positional inverted index in lexicographical order