In [None]:
from google.colab import drive
import nltk
import re
import io
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os

In [None]:
# Fetch the NLTK resources this notebook relies on:
#   - 'punkt' / 'punkt_tab': tokenizer models used by nltk.word_tokenize
#     (recent NLTK releases look up 'punkt_tab' instead of the pickled 'punkt';
#     downloading both keeps the notebook working across NLTK versions)
#   - 'stopwords': the English stop-word list used during preprocessing
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Mount Google Drive at /content/MyDrive so the dataset archive stored on Drive
# can be read by the unzip step that follows.
drive.mount('/content/MyDrive')

Mounted at /content/MyDrive


In [None]:
!unzip /content/MyDrive/MyDrive/Humor,Hist,Media,Food.zip > /dev/null

In [None]:
# Root folder of the extracted corpus, and the full path of every entry in it.
dataset_path = '/content/Humor,Hist,Media,Food'
alldirs = []
for entry_name in os.listdir(dataset_path):
  alldirs.append(os.path.join(dataset_path, entry_name))

In [None]:
# Sort the paths in place so the document order is deterministic
# (os.listdir returns entries in arbitrary order).
alldirs.sort()

In [None]:
# Read every corpus file as raw bytes. The position of a path in `alldirs`
# is the document id used by the positional index, so the listing is sorted
# to make ids reproducible across runs (os.listdir order is arbitrary).
dataset_path='/content/Humor,Hist,Media,Food'
alldirs = sorted(
  os.path.join(dataset_path, d)
  for d in os.listdir(dataset_path)
  # skip sub-directories: open() on a directory would raise
  if os.path.isfile(os.path.join(dataset_path, d))
)
FileContents=[]

for path in alldirs:
  # `with` closes each handle promptly instead of leaking one per document
  with open(path,'rb') as file:
    FileContents.append(file.read())

# Q-2 (a) Preprocessing

## Converting text to lowercase

In [None]:
# Turn each document into a lowercase text string.
# The raw contents are bytes: calling str() on bytes yields the repr
# ("b'...'" with escaped newlines), which injects a stray "b" token and, when
# the escapes are deleted, glues the last word of a line to the first word of
# the next. Decode instead and replace line breaks / tabs with a space.
for i in range(len(FileContents)):
  raw = FileContents[i]
  # Already-decoded text is accepted too, so re-running the cell is harmless.
  # Undecodable bytes become U+FFFD, which the special-character step removes.
  text = raw.decode('utf-8', errors='replace') if isinstance(raw, (bytes, bytearray)) else raw
  for whitespace in ("\n", "\r", "\t"):
    text = text.replace(whitespace, " ")
  FileContents[i] = text.lower().strip()


## Removing special characters

In [None]:
# Collapse every run of characters other than ASCII letters and digits into a
# single space, updating the document list in place.
non_alphanumeric = re.compile('[^A-Za-z0-9]+')
FileContents[:] = [str(non_alphanumeric.sub(' ', text)) for text in FileContents]

## Word tokenization

In [None]:
# One token list per document, in the same order as FileContents.
nltk_tokens = list(map(nltk.word_tokenize, FileContents))

## Removing stop words

In [None]:
stop_words = set(stopwords.words('english'))

# For every tokenized document keep only the tokens that are not English
# stop words, preserving their original order.
preprocessed = [
  [token for token in doc_tokens if token not in stop_words]
  for doc_tokens in nltk_tokens
]


# Q-2 (b) Creating Positional Index Data Structure 

In [None]:
# Positional index: word -> {document id -> [positions of the word in that document]}.
# Document ids are positions in `preprocessed`; word positions are offsets
# into the stop-word-filtered token list of the document.
positionalIndex = {}

for doc_id, document in enumerate(preprocessed):
  for position, word in enumerate(document):
    # Create the per-word and per-document containers on first sight.
    postings = positionalIndex.setdefault(word, {})
    postings.setdefault(doc_id, []).append(position)


# Q-2 (c) Taking phrase queries as input

## Preprocessing the query

In [None]:
def preprocessingQuery(query):
  """Normalise a phrase query the same way the documents were normalised.

  Steps: lowercase, replace every run of non-alphanumeric characters
  (punctuation, newlines, tabs, ...) with a single space, tokenize with NLTK,
  and drop English stop words.

  Args:
    query: the raw phrase query string.

  Returns:
    List of remaining query words in their original order. May be empty when
    the query consists only of stop words or punctuation.
  """
  # Documents had non-alphanumerics stripped before indexing; without the same
  # step here a query containing punctuation could never match an index term.
  query = re.sub('[^A-Za-z0-9]+', ' ', query.lower()).strip()

  # word tokenization
  temp_words = nltk.word_tokenize(query)

  # removing the stop words
  stop_words = set(stopwords.words('english'))
  return [word for word in temp_words if word not in stop_words]

In [None]:
def retrievingDocuments(query_index,query_words,document_id,word_index,positionalIndex,retrieved_doc_ids):
  """Collect the ids of documents containing the query words as a consecutive phrase.

  The search is recursive: for the first query word every occurrence in every
  document is tried as a phrase start; each later word must occur in the same
  document at exactly the next position.

  Args:
    query_index: index into query_words of the word to match now (0 to start).
    query_words: preprocessed query words, in phrase order.
    document_id: document currently being matched (ignored when query_index is 0).
    word_index: position the current word must occupy (ignored when query_index is 0).
    positionalIndex: mapping word -> {document id -> list of positions}.
    retrieved_doc_ids: set that matching document ids are added to (mutated in place).

  Returns:
    None. Results are reported through retrieved_doc_ids.
  """
  # An empty query matches nothing. Without this guard the placeholder
  # document_id passed by the caller (e.g. -1) would be recorded as a hit.
  if not query_words:
    return

  # Every query word has been matched at consecutive positions.
  if query_index == len(query_words):
    retrieved_doc_ids.add(document_id)
    return

  # .get() so a word missing from the index ends this branch of the search
  # instead of raising KeyError.
  doc_dict = positionalIndex.get(query_words[query_index])
  if not doc_dict:
    return

  if query_index == 0:
    for doc_id, index_list in doc_dict.items():
      for index in index_list:
        # take each occurrence as a candidate starting point of the phrase
        retrievingDocuments(1,query_words,doc_id,index+1,positionalIndex,retrieved_doc_ids)
        # one match is enough for a document; skip its remaining start positions
        if doc_id in retrieved_doc_ids:
          break
  elif word_index in doc_dict.get(document_id, ()):
    retrievingDocuments(query_index+1,query_words,document_id,word_index+1,positionalIndex,retrieved_doc_ids)


## Taking input phrase queries

In [None]:
phrase_queries = ['turbo encabulator', 'usual thing', 'further salute']

# Run every phrase query against the positional index and report the matches.
for query in phrase_queries:
  query_words=preprocessingQuery(query)

  retrieved_doc_ids = set()
  # Only search when there is something to search for and every word is
  # indexed: an empty query would otherwise record the placeholder id -1
  # (reported as the last document via alldirs[-1]), and an unknown word
  # cannot be part of any matching phrase.
  if query_words and all(word in positionalIndex for word in query_words):
    retrievingDocuments(0,query_words,-1,-1,positionalIndex,retrieved_doc_ids)

  # Map document ids back to file names (last path component).
  retrieved_doc_names = [alldirs[doc_id].split('/')[-1] for doc_id in sorted(retrieved_doc_ids)]

  print("Actual Phrase Query: ",query)
  print("Preprocessed Phrase Query words: ",query_words)
  print("Number of documents retrieved: ",len(retrieved_doc_names))
  print("List of the document names retrieved: \n",retrieved_doc_names)
  print("\n")


Actual Phrase Query:  turbo encabulator
Preprocessed Phrase Query words:  ['turbo', 'encabulator']
Number of documents retrieved:  1
List of the document names retrieved: 
 ['turbo.hum']


Actual Phrase Query:  usual thing
Preprocessed Phrase Query words:  ['usual', 'thing']
Number of documents retrieved:  2
List of the document names retrieved: 
 ['critic.txt', 'banana01.brd']


Actual Phrase Query:  further salute
Preprocessed Phrase Query words:  ['salute']
Number of documents retrieved:  10
List of the document names retrieved: 
 ['turbo.hum', 'fwksfun.hum', 'prover.wisom', 'arnold.txt', 'fireplacein.txt', 'mlverb.hum', 'reconcil.hum', 'idaho.txt', 'prover_w.iso', 'oliver02.txt']


