<a href="https://colab.research.google.com/github/Rahuldj2/IR-Assignment-2/blob/main/IR_ASSN_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import math
import re
from collections import defaultdict, Counter

# Preprocessing: Tokenization and normalization (lowercasing and removing punctuation)
def preprocess(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Remove non-word characters
    tokens = text.split()
    return tokens

# Building the inverted index and document lengths
def build_index(corpus_path):
    index = defaultdict(list)
    doc_lengths = {}
    N = 0  # Total number of documents
    doc_ids = {}

    for i, filename in enumerate(os.listdir(corpus_path)):
        if filename.endswith('.txt'):
            N += 1
            doc_id = i + 1
            doc_ids[filename] = doc_id
            with open(os.path.join(corpus_path, filename), 'r', encoding='utf-8') as f:
                content = f.read()
                tokens = preprocess(content)
                token_counts = Counter(tokens)

                # Add terms to the index
                for term, freq in token_counts.items():
                    index[term].append((doc_id, freq))

                # Compute the document length (for normalization later)
                length = 0
                for term, freq in token_counts.items():
                    length += (1 + math.log10(freq))**2
                doc_lengths[doc_id] = math.sqrt(length)

    return index, doc_lengths, N, doc_ids

# Calculating cosine similarity between the query and documents
def cosine_similarity(query_vec, doc_vec, doc_lengths):
    scores = defaultdict(float)
    for term, qtf_idf in query_vec.items():
        if term in doc_vec:
            for doc_id, dtf_idf in doc_vec[term]:
                scores[doc_id] += qtf_idf * dtf_idf

    for doc_id in scores:
        scores[doc_id] /= doc_lengths[doc_id]  # Normalize by document length

    return sorted(scores.items(), key=lambda x: (-x[1], x[0]))

# Vector space model with lnc.ltc ranking
def vsm_search(query, index, doc_lengths, N, doc_ids):
    # Preprocess the query
    query_tokens = preprocess(query)
    query_freqs = Counter(query_tokens)

    # Compute tf-idf for query
    query_vec = {}
    for term, freq in query_freqs.items():
        if term in index:
            df = len(index[term])  # Document frequency
            idf = math.log10(N / df)
            query_vec[term] = (1 + math.log10(freq)) * idf

    # Compute tf-idf for documents
    doc_vec = defaultdict(list)
    for term, postings in index.items():
        df = len(postings)
        idf = math.log10(N / df)
        for doc_id, freq in postings:
            tf_idf = (1 + math.log10(freq)) * idf
            doc_vec[term].append((doc_id, tf_idf))

    # Rank documents by cosine similarity
    ranked_docs = cosine_similarity(query_vec, doc_vec, doc_lengths)

    # Convert doc IDs back to filenames and return top 10 results
    ranked_files = [(list(doc_ids.keys())[list(doc_ids.values()).index(doc_id)], score) for doc_id, score in ranked_docs[:10]]
    return ranked_files

# Main function to run the search engine
def main():
    corpus_path = 'corpus'
    index, doc_lengths, N, doc_ids = build_index(corpus_path)

    while True:
        query = input("Enter your query (or 'exit' to quit): ").strip()
        if query.lower() == 'exit':
            break
        results = vsm_search(query, index, doc_lengths, N, doc_ids)
        if results:
            print("\nTop results:")
            for i, (filename, score) in enumerate(results, 1):
                print(f"{i}. {filename} (Score: {score})")
        else:
            print("No documents match the query.")

if __name__ == '__main__':
    main()


Enter your query (or 'exit' to quit): Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation

Top results:
1. zomato.txt (Score: 0.5217394992745741)
2. swiggy.txt (Score: 0.30380894525158864)
3. instagram.txt (Score: 0.11750733468677174)
4. messenger.txt (Score: 0.11032477077510809)
5. youtube.txt (Score: 0.09703346075631504)
6. HP.txt (Score: 0.08798887157099794)
7. reddit.txt (Score: 0.07835569065197673)
8. flipkart.txt (Score: 0.06326544408856068)
9. Uber.txt (Score: 0.062202918107177846)
10. shakespeare.txt (Score: 0.05933212325654604)
Enter your query (or 'exit' to quit): Warwickshire, came from an ancient family and was the heiress to some land

Top results:
1. shakespeare.txt (Score: 0.4632969542538422)
2. zomato.txt (Score: 0.056159806661941813)
3. levis.txt (Score: 0.049038313030313196)
4. Adobe.txt (Score: 0.045391726820548456)
5. nike.txt (Score: 0.04371198906774426)
6. huawei.txt (Score: 0.03376035555034407)
7. Dell.t

In [2]:
import os
import math
import re
from collections import defaultdict, Counter
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

# for preprocessing steps initializing stemming, lemmatizers etc
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


#IN THIS STEP TEXT IS FIRST CONVERTED TO LOWERCASE THEN LEMMATIZATION IS APPLIED AFTER REMOVING STOPWORDS
def preprocess(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Remove non-word characters
    tokens = text.split()

    # Apply stemming and lemmatization, remove stopwords
    tokens = [lemmatizer.lemmatize(stemmer.stem(token)) for token in tokens if token not in stop_words]
    return tokens


def build_index(corpus_path):
    index = defaultdict(list)
    doc_lengths = {}
    N = 0  # Total number of documents
    doc_ids = {}

    for i, filename in enumerate(os.listdir(corpus_path)):
        if filename.endswith('.txt'):
            N += 1
            doc_id = i + 1
            doc_ids[filename] = doc_id
            with open(os.path.join(corpus_path, filename), 'r', encoding='utf-8') as f:
                content = f.read()
                tokens = preprocess(content)
                token_counts = Counter(tokens)

                # Add terms to the index
                for term, freq in token_counts.items():
                    index[term].append((doc_id, freq))

                # Compute the document length (for normalization later, lnc scheme: log(tf) and normalize)
                length = 0
                for term, freq in token_counts.items():
                    length += (1 + math.log10(freq))**2
                doc_lengths[doc_id] = math.sqrt(length)

    return index, doc_lengths, N, doc_ids

# Calculating cosine similarity between the query and documents
#WROTE CODE ACCORDING TO FUNCTION GIVEN IN SLIDES
# COSINESCORE(q)
# 1 float Scores[N] = 0
# 2 float Length[N]
# 3 **for** each query term t
# 4 **do** calculate wt,q and fetch postings list for t
# 5 **for** each pair(d, tft,d) in postings list
# 6 **do** Scores[d]+ = Wt,d × Wt,q
# 7 Read the array Length
# 8 **for** each d
# 9 **do** Scores[d] = Scores[d]/Length[d]
# 10 **return** Top K components of Scores[]
def cosine_similarity(query_vec, doc_vec, doc_lengths):
    scores = defaultdict(float)
    for term, qtf_idf in query_vec.items():
        if term in doc_vec:
            for doc_id, dtf_idf in doc_vec[term]:
                scores[doc_id] += qtf_idf * dtf_idf

    for doc_id in scores:
        scores[doc_id] /= doc_lengths[doc_id]  # Normalize by document length

    return sorted(scores.items(), key=lambda x: (-x[1], x[0]))

# Vector space model with lnc.ltc ranking
def vsm_search(query, index, doc_lengths, N, doc_ids):
    # the query is preprocessed so that lnc.ltn similarity can be calculated appropriately
    query_tokens = preprocess(query)
    query_freqs = Counter(query_tokens)

    # Compute tf-idf for query using ltc (log tf, idf, no normalization)
    query_vec = {}
    for term, freq in query_freqs.items():
        if term in index:
            df = len(index[term])  # Document frequency
            idf = math.log10(N / df)
            query_vec[term] = (1 + math.log10(freq)) * idf

    # Compute tf for documents using lnc (log tf, no idf, normalized by length)
    doc_vec = defaultdict(list)
    for term, postings in index.items():
        for doc_id, freq in postings:
            tf = (1 + math.log10(freq))  # No idf, just log(tf)
            doc_vec[term].append((doc_id, tf))

    # Rank documents by cosine similarity
    ranked_docs = cosine_similarity(query_vec, doc_vec, doc_lengths)

    # Convert doc IDs back to filenames and return top 10 results
    ranked_files = [(list(doc_ids.keys())[list(doc_ids.values()).index(doc_id)], score) for doc_id, score in ranked_docs[:10]]
    return ranked_files

# Main function to run the search engine
def main():
    corpus_path = 'corpus'
    index, doc_lengths, N, doc_ids = build_index(corpus_path)

    while True:
        query = input("Enter your query (or 'exit' to quit): ").strip()
        if query.lower() == 'exit':
            break
        results = vsm_search(query, index, doc_lengths, N, doc_ids)
        if results:
            print("\nTop results:")
            for i, (filename, score) in enumerate(results, 1):
                print(f"{i}. {filename} (Score: {score})")
        else:
            print("No documents match the query.")

if __name__ == '__main__':
    main()


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Enter your query (or 'exit' to quit): Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation

Top results:
1. zomato.txt (Score: 0.5488501501085257)
2. swiggy.txt (Score: 0.3351914447855742)
3. instagram.txt (Score: 0.15470647490300268)
4. messenger.txt (Score: 0.1512386655130801)
5. youtube.txt (Score: 0.14940567205185346)
6. Discord.txt (Score: 0.1364887404785657)
7. bing.txt (Score: 0.13235296907216795)
8. paypal.txt (Score: 0.12035497736893036)
9. reddit.txt (Score: 0.11274273968889387)
10. flipkart.txt (Score: 0.10410504117596259)
Enter your query (or 'exit' to quit): Warwickshire, came from an ancient family and was the heiress to some land

Top results:
1. shakespeare.txt (Score: 0.37902335875841886)
2. levis.txt (Score: 0.07626953861676306)
3. Adobe.txt (Score: 0.07155668757671003)
4. google.txt (Score: 0.06551292559905736)
5. nike.txt (Score: 0.06069062772402075)
6. zomato.txt (Score: 0.05595827061016485)
7. huawei.txt (

KeyboardInterrupt: Interrupted by user