<a href="https://colab.research.google.com/github/Rahuldj2/IR-Assignment-2/blob/main/IR_ASSN_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import os
import math
import re
from collections import defaultdict, Counter
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

# for preprocessing steps initializing stemming, lemmatizers etc
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


#IN THIS STEP TEXT IS FIRST CONVERTED TO LOWERCASE THEN LEMMATIZATION IS APPLIED AFTER REMOVING STOPWORDS
def preprocess(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Remove non-word characters
    tokens = text.split()

    # Apply stemming and lemmatization, remove stopwords
    tokens = [lemmatizer.lemmatize(stemmer.stem(token)) for token in tokens if token not in stop_words]
    return tokens


def build_index(corpus_path):
    index = defaultdict(list)
    doc_lengths = {}
    N = 0  # Total number of documents
    doc_ids = {}

    for i, filename in enumerate(os.listdir(corpus_path)):
        if filename.endswith('.txt'):
            N += 1
            doc_id = i + 1
            doc_ids[filename] = doc_id
            with open(os.path.join(corpus_path, filename), 'r', encoding='utf-8') as f:
                content = f.read()
                tokens = preprocess(content)
                token_counts = Counter(tokens)

                # Add terms to the index
                for term, freq in token_counts.items():
                    index[term].append((doc_id, freq))

                # Compute the document length (for normalization later, lnc scheme: log(tf) and normalize)
                length = 0
                for term, freq in token_counts.items():
                    length += (1 + math.log10(freq))**2
                doc_lengths[doc_id] = math.sqrt(length)

    return index, doc_lengths, N, doc_ids

# Calculating cosine similarity between the query and documents
#WROTE CODE ACCORDING TO FUNCTION GIVEN IN SLIDES
# COSINESCORE(q)
# 1 float Scores[N] = 0
# 2 float Length[N]
# 3 **for** each query term t
# 4 **do** calculate wt,q and fetch postings list for t
# 5 **for** each pair(d, tft,d) in postings list
# 6 **do** Scores[d]+ = Wt,d × Wt,q
# 7 Read the array Length
# 8 **for** each d
# 9 **do** Scores[d] = Scores[d]/Length[d]
# 10 **return** Top K components of Scores[]
def cosine_similarity(query_vec, doc_vec, doc_lengths):
    scores = defaultdict(float)
    for term, qtf_idf in query_vec.items():
        if term in doc_vec:
            for doc_id, dtf_idf in doc_vec[term]:
                scores[doc_id] += qtf_idf * dtf_idf

    for doc_id in scores:
        scores[doc_id] /= doc_lengths[doc_id]  # Normalize by document length

    return sorted(scores.items(), key=lambda x: (-x[1], x[0]))

# Vector space model with lnc.ltc ranking
def vsm_search(query, index, doc_lengths, N, doc_ids):
    # the query is preprocessed so that lnc.ltn similarity can be calculated appropriately
    query_tokens = preprocess(query)
    query_freqs = Counter(query_tokens)

    # Compute tf-idf for query using ltc (log tf, idf, no normalization)
    query_vec = {}
    for term, freq in query_freqs.items():
        if term in index:
            df = len(index[term])  # Document frequency
            idf = math.log10(N / df)
            query_vec[term] = (1 + math.log10(freq)) * idf

    # Compute tf for documents using lnc (log tf, no idf, normalized by length)
    doc_vec = defaultdict(list)
    for term, postings in index.items():
        for doc_id, freq in postings:
            tf = (1 + math.log10(freq))  # No idf, just log(tf)
            doc_vec[term].append((doc_id, tf))

    # Rank documents by cosine similarity
    ranked_docs = cosine_similarity(query_vec, doc_vec, doc_lengths)

    # Convert doc IDs back to filenames and return top 10 results
    ranked_files = [(list(doc_ids.keys())[list(doc_ids.values()).index(doc_id)], score) for doc_id, score in ranked_docs[:10]]
    return ranked_files

# Main function to run the search engine
def main():
    corpus_path = 'corpus'
    index, doc_lengths, N, doc_ids = build_index(corpus_path)

    while True:
        query = input("Enter your query (or 'exit' to quit): ").strip()
        if query.lower() == 'exit':
            break
        results = vsm_search(query, index, doc_lengths, N, doc_ids)
        if results:
            print("\nTop results:")
            for i, (filename, score) in enumerate(results, 1):
                print(f"{i}. {filename} (Score: {score})")
        else:
            print("No documents match the query.")

if __name__ == '__main__':
    main()


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


No documents match the query.
No documents match the query.


In [4]:
import os
import math
import re
from collections import defaultdict, Counter
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("stopwords")

# for preprocessing steps initializing stemming, lemmatizers etc
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


# IN THIS STEP TEXT IS FIRST CONVERTED TO LOWERCASE THEN LEMMATIZATION IS APPLIED AFTER REMOVING STOPWORDS
def preprocess(text):
    # text = text.lower()
    text = re.sub(r"\W+", " ", text)  # Remove non-word characters
    tokens = text.split()

    # Apply stemming and lemmatization, remove stopwords
    tokens = [
        lemmatizer.lemmatize(stemmer.stem(token))
        for token in tokens
        if token not in stop_words
    ]
    return tokens


def build_index(corpus_path):
    index = defaultdict(list)
    doc_lengths = {}
    N = 0  # Total number of documents
    doc_ids = {}

    for i, filename in enumerate(os.listdir(corpus_path)):
        if filename.endswith(".txt"):
            N += 1
            doc_id = i + 1
            doc_ids[filename] = doc_id
            with open(os.path.join(corpus_path, filename), "r", encoding="utf-8") as f:
                content = f.read()
                tokens = preprocess(content)
                token_counts = Counter(tokens)

                # Add terms to the index
                for term, freq in token_counts.items():
                    index[term].append((doc_id, freq))

                # Compute the document length (for normalization later, lnc scheme: log(tf) and normalize)
                length = 0
                for term, freq in token_counts.items():
                    length += (1 + math.log10(freq)) ** 2
                doc_lengths[doc_id] = math.sqrt(length)

    return index, doc_lengths, N, doc_ids


# Calculating cosine similarity between the query and documents
# WROTE CODE ACCORDING TO FUNCTION GIVEN IN SLIDES
# COSINESCORE(q)
# 1 float Scores[N] = 0
# 2 float Length[N]
# 3 **for** each query term t
# 4 **do** calculate wt,q and fetch postings list for t
# 5 **for** each pair(d, tft,d) in postings list
# 6 **do** Scores[d]+ = Wt,d × Wt,q
# 7 Read the array Length
# 8 **for** each d
# 9 **do** Scores[d] = Scores[d]/Length[d]
# 10 **return** Top K components of Scores[]
def cosine_similarity(query_vec, doc_vec, doc_lengths):
    scores = defaultdict(float)
    for term, qtf_idf in query_vec.items():
        if term in doc_vec:
            for doc_id, dtf_idf in doc_vec[term]:
                scores[doc_id] += qtf_idf * dtf_idf

    for doc_id in scores:
        scores[doc_id] /= doc_lengths[doc_id]  # Normalize by document length

    return sorted(scores.items(), key=lambda x: (-x[1], x[0]))


# Vector space model with lnc.ltc ranking
def vsm_search(query, index, doc_lengths, N, doc_ids):
    # the query is preprocessed so that lnc.ltn similarity can be calculated appropriately
    query_tokens = preprocess(query)
    query_freqs = Counter(query_tokens)

    # Compute tf-idf for query using ltc (log tf, idf, no normalization)
    query_vec = {}
    for term, freq in query_freqs.items():
        if term in index:
            df = len(index[term])  # Document frequency
            idf = math.log10(N / df)
            query_vec[term] = (1 + math.log10(freq)) * idf

    # Compute tf for documents using lnc (log tf, no idf, normalized by length)
    doc_vec = defaultdict(list)
    for term, postings in index.items():
        for doc_id, freq in postings:
            tf = 1 + math.log10(freq)  # No idf, just log(tf)
            doc_vec[term].append((doc_id, tf))

    # Rank documents by cosine similarity
    ranked_docs = cosine_similarity(query_vec, doc_vec, doc_lengths)

    # Convert doc IDs back to filenames and return top 10 results
    ranked_files = [
        (list(doc_ids.keys())[list(doc_ids.values()).index(doc_id)], score)
        for doc_id, score in ranked_docs[:10]
    ]
    return ranked_files


# Main function to run the search engine
def main():
    corpus_path = "corpus"
    index, doc_lengths, N, doc_ids = build_index(corpus_path)

    queries = [
        "Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation",
        "Warwickshire, came from an ancient family and was the heiress to some land",
        "Their Design School, filled with free video courses on a wide range of topics, is a good place to start. There's a ",
    ]

    for query in queries:
        results = vsm_search(query, index, doc_lengths, N, doc_ids)
        if results:
            print(f"\nResults for query: '{query}'")
            for i, (filename, score) in enumerate(results, 1):
                print(f"{i}. {filename} (Score: {score})")
        else:
            print(f"No documents match the query: '{query}'")


if __name__ == "__main__":
    main()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Results for query: 'Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation'
1. zomato.txt (Score: 0.5293531487992237)
2. swiggy.txt (Score: 0.3175398730755558)
3. instagram.txt (Score: 0.14883350305333537)
4. messenger.txt (Score: 0.14665081801495392)
5. youtube.txt (Score: 0.14511951736250497)
6. Discord.txt (Score: 0.13150818490012786)
7. bing.txt (Score: 0.12748523022536348)
8. paypal.txt (Score: 0.11895867761593869)
9. reddit.txt (Score: 0.10875265715819195)
10. flipkart.txt (Score: 0.10235843650392648)

Results for query: 'Warwickshire, came from an ancient family and was the heiress to some land'
1. shakespeare.txt (Score: 0.37145786629278843)
2. levis.txt (Score: 0.07454142776109814)
3. Adobe.txt (Score: 0.07034525589618056)
4. google.txt (Score: 0.06421724189263736)
5. nike.txt (Score: 0.0582325631093714)
6. zomato.txt (Score: 0.05397044483451929)
7. huawei.txt (Score: 0.042458692963560585)
8. skype.txt (Score: 0.0359183