<a href="https://colab.research.google.com/github/SathishDissanayaka/Information-Retrieval-Web-Analytics-Labs/blob/main/IRWA_Lab03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
#inverted_index part a
import os

def build_inverted_index(folder_path):
    inverted_index = {}
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename) #os.path.join() creates the full file path for the current file.
        doc_id = filename.split('.')[0]
        #takes the filename (like doc1.txt) and removes the extension, leaving just doc1.
        with open(file_path, 'r', encoding='utf-8') as file:
        #Opens the file in read mode ('r') with UTF-8 encoding
            content = file.read()
            #reads the entire text of the document
            words = content.split()
            #Splits the text into a list of words using spaces as separators.
            for word in words:
                if word not in inverted_index:
                    inverted_index[word] = []
                    #create a new entry with an empty list
                if doc_id not in inverted_index[word]:
                    inverted_index[word].append(doc_id)
                    #if the document ID is not already listed for this word, add it.
    return inverted_index

folder_path = '/content/drive/MyDrive/inverted'
inverted_index = build_inverted_index(folder_path)
print(inverted_index)


{'breakthrough': ['Doc1'], 'drug': ['Doc1', 'Doc2'], 'for': ['Doc1', 'Doc3', 'Doc4'], 'schizophrenia': ['Doc1', 'Doc2', 'Doc3', 'Doc4'], 'new': ['Doc2', 'Doc3', 'Doc4'], 'approach': ['Doc3'], 'treatment': ['Doc3'], 'of': ['Doc3'], 'hopes': ['Doc4'], 'patients': ['Doc4']}


In [32]:
#inverted_index part b)1 {schizophrenia AND drug }
def AND_op(list1,list2):
 return set(list1).intersection(set(list2))
 #set() → makes it easy and fast to find common documents for the AND operation.

In [33]:
for key in inverted_index:
 if key=='schizophrenia':
  List1=inverted_index[key]
  #List1 is the list of document IDs where the word "schizophrenia" appears.
 if key=='drug':
  List2=inverted_index[key]
  #document IDs where the word "drug" appears.

In [34]:
print(AND_op(List1,List2))

{'Doc1', 'Doc2'}


In [35]:
#inverted_index part b)2 {for AND NOT(drug OR approach) }
def OR_op(list1,list2):
 return set(list1).union(set(list2))
 #union() gives the Boolean OR result
def NOT_op(a,b):
 return set(b)-set(a)

In [36]:
fileList=[name.split(".")[0] for name in os.listdir(folder_path)]
#Extracts the document ID (without the .txt extension).
#This represents all documents in your corpus.
fileList

['Doc1', 'Doc2', 'Doc3', 'Doc4']

In [37]:
for key in inverted_index:
 if key=='drug':
  List3=inverted_index[key] #documents containing "drug"
 if key=='approach':
  List4=inverted_index[key] #documents containing "approach"
 if key=='for':
  List5=inverted_index[key] #documents containing "for"

In [38]:
List6=OR_op(List3,List4) #List6 contains documents that have either "drug" OR "approach"
List7=NOT_op(List6,fileList) #List7 contains documents that do NOT have "drug" or "approach"
List8=AND_op(List5,List7)
#Have the word "for"
#AND do not contain "drug" or "approach"
print(List8)

{'Doc4'}


In [39]:
#Question 2(Positional Index)
import os
def build_positional_index(folder_path):
  positional_index = {}
  for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    doc_id = filename.split('.')[0]

    with open(file_path, 'r', encoding='utf-8') as file:
      content = file.read()
      words = content.split()
      for position, word in enumerate(words, start=1): #Loops through each word along with its position in the document.
      #enumerate(words, start=1) → gives (position, word) pairs starting from 1 (so first word is position 1).
        if word not in positional_index:
          positional_index[word] = {}
        if doc_id not in positional_index[word]:
          positional_index[word][doc_id] = []
        positional_index[word][doc_id].append(position)
        #Append the current position of the word in this document to the list.
  return positional_index

In [40]:
inverted_index

{'breakthrough': ['Doc1'],
 'drug': ['Doc1', 'Doc2'],
 'for': ['Doc1', 'Doc3', 'Doc4'],
 'schizophrenia': ['Doc1', 'Doc2', 'Doc3', 'Doc4'],
 'new': ['Doc2', 'Doc3', 'Doc4'],
 'approach': ['Doc3'],
 'treatment': ['Doc3'],
 'of': ['Doc3'],
 'hopes': ['Doc4'],
 'patients': ['Doc4']}

In [41]:
folder_path = '/content/drive/MyDrive/positional/positional'
positional_index = build_positional_index(folder_path)

In [42]:
positional_index

{'\ufeffRemarks': {'doc_1': [1], 'doc_3': [1]},
 'circus': {'doc_1': [2]},
 'Announcing': {'doc_1': [3]},
 'Candidacy': {'doc_1': [4]},
 'for': {'doc_1': [5],
  'doc_2': [93, 138, 142, 260, 262, 265],
  'doc_3': [13, 26, 30, 51, 201, 254]},
 'President': {'doc_1': [6], 'doc_3': [219, 302]},
 'in': {'doc_1': [7, 42, 118, 148, 201],
  'doc_2': [7, 106, 126, 216, 242, 294, 300],
  'doc_3': [10, 295, 305, 308, 355]},
 'New': {'doc_1': [8, 46], 'doc_2': [48]},
 'York': {'doc_1': [9], 'doc_2': [49]},
 'City': {'doc_1': [10], 'doc_2': [50]},
 'Trump:': {'doc_1': [11]},
 'Wow.': {'doc_1': [12]},
 'Whoa.': {'doc_1': [13]},
 'That': {'doc_1': [14], 'doc_3': [124]},
 'is': {'doc_1': [15, 57, 117],
  'doc_2': [229, 258, 274, 328, 333, 348],
  'doc_3': [69]},
 'some': {'doc_1': [16, 70]},
 'group': {'doc_1': [17]},
 'of': {'doc_1': [18, 71],
  'doc_2': [28, 60, 98, 103, 115, 128, 199, 222, 268, 283],
  'doc_3': [76, 114, 129, 145, 218, 311, 347]},
 'people.': {'doc_1': [19]},
 'Thousands.So': {'doc