<h2>All imports</h2>

In [None]:
import numpy as np
import pandas as pd
import string
import time
import json
import re
import polars as pl
from collections import defaultdict
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

<h2>Reading data from CSV into a pandas data frame</h2>

In [8]:
# Read the headlines CSV into a DataFrame. No index column is specified, so
# rows get pandas' default integer index, which later cells use as document ids.
corpus = pd.read_csv("Dataset/abcnews-date-text.csv", encoding='utf-8')
# Preview the first rows (the cell's last expression is displayed).
corpus.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


<h2>Pre processing of corpus</h2>
<font color='grey'>Each headline is tokenized, lowercased, stripped of punctuation tokens and English stop words, and lemmatized; the remaining tokens are then joined back into a single string.</font>

In [13]:
# nltk.download('all')  # uncomment on first run if the NLTK resources are missing

# Resources shared by every headline (kept as notebook-level names).
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def _clean_headline(text):
    """Normalize one headline.

    Stages, in order: tokenization, lowercasing, removal of tokens found in
    ``string.punctuation``, removal of English stopwords, lemmatization, and
    finally joining the surviving tokens back into one space-separated string.
    """
    lowered = (token.lower() for token in nltk.word_tokenize(text))
    no_punctuation = (token for token in lowered if token not in string.punctuation)
    content_words = (token for token in no_punctuation if token not in stop_words)
    return ' '.join(lemmatizer.lemmatize(token) for token in content_words)


# Apply all stages in a single pass over the column.
corpus['headline_text'] = corpus['headline_text'].apply(_clean_headline)


<h2>Inverted Index</h2>

In [15]:
# Derive the per-document token lists that feed the indexes: the cleaned
# headline strings are tokenized and run through the normalization steps
# (lowercase, punctuation, stopwords, lemmatization) once more.
corpus['tokens'] = corpus['headline_text'].apply(lambda x: nltk.word_tokenize(x))
corpus['tokens'] = corpus['tokens'].apply(lambda x: [word.lower() for word in x])
corpus['tokens'] = corpus['tokens'].apply(lambda x: [word for word in x if word not in string.punctuation])
stop_words = set(stopwords.words('english'))
corpus['tokens'] = corpus['tokens'].apply(lambda x: [word for word in x if word not in stop_words])
lemmatizer = WordNetLemmatizer()
corpus['tokens'] = corpus['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Build an inverted index: token -> list of document ids (row labels), each id
# listed at most once, in row order.
inverted_index = {}
# Series.items() replaces Series.iteritems(), which was deprecated and then
# removed in pandas 2.0.
for idx, tokens in corpus['tokens'].items():
    for token in tokens:
        postings = inverted_index.setdefault(token, [])
        # Each row is visited exactly once and the default integer index has
        # unique labels, so every append of `idx` to a given posting list
        # happens back-to-back. Checking the last entry is therefore enough to
        # skip duplicates, without scanning the whole list.
        if not postings or postings[-1] != idx:
            postings.append(idx)


  for idx, tokens in corpus['tokens'].iteritems():


<h2>Biword Index</h2>

In [106]:
# Biword index: "first second" -> list of (document id, position) postings,
# where position is the offset of the pair's first word in the token list.
biword_index = {}
for doc_id, doc_tokens in corpus['tokens'].items():
    # Pair every token with its successor to enumerate the consecutive biwords.
    adjacent_pairs = zip(doc_tokens, doc_tokens[1:])
    for position, (first, second) in enumerate(adjacent_pairs):
        biword_index.setdefault(first + ' ' + second, []).append((doc_id, position))

<h2>Query Pre Processing</h2>

In [49]:
def preprocess_query(query):
    """Normalize a free-text query into a list of index terms.

    The query is tokenized, lowercased, stripped of punctuation tokens and
    English stopwords, and lemmatized with WordNet.

    Parameters
    ----------
    query : str
        Raw query text.

    Returns
    -------
    list of str
        The normalized query terms, in their original order.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    query_terms = []
    for word in nltk.word_tokenize(query):
        word = word.lower()
        # `in string.punctuation` is a substring test on the punctuation
        # string, so it drops single punctuation characters.
        if word in string.punctuation or word in stop_words:
            continue
        query_terms.append(lemmatizer.lemmatize(word))

    # The former "multi-term phrase" pass tested whether a token was equal to
    # "<previous token> <token>", which can never hold, so it always returned
    # the terms one by one. The dead loop is removed; the output is unchanged.
    return query_terms

<h2>Boolean Retrieval</h2>
<font color='grey'>Boolean retrieval model using the inverted index created above. It takes a user query string, tokenizes it, and uses the inverted index to find relevant documents that match the query terms using the boolean "AND" operator. It also supports multi-phrase queries by treating consecutive terms as a phrase and searching for that phrase in the corpus.</font>

In [107]:
def boolean_search_inverted(query, inverted_index):
    """Boolean AND retrieval over the inverted index.

    The query is normalized with ``preprocess_query`` and the posting lists of
    its terms are intersected.

    Parameters
    ----------
    query : str
        Raw query text.
    inverted_index : dict
        Mapping of term -> iterable of document ids.

    Returns
    -------
    list
        Sorted ids of the documents containing every query term that is
        present in the index. Terms absent from the index are ignored rather
        than emptying the result. Returns ``[]`` when nothing matches.
    """
    # None means "no term processed yet"; an empty set is a real (empty)
    # intersection and must not be mistaken for the initial state.
    matching_docs = None

    for term in preprocess_query(query):
        if term not in inverted_index:
            continue

        docs_with_term = set(inverted_index[term])
        if matching_docs is None:
            matching_docs = docs_with_term
        else:
            matching_docs &= docs_with_term

        # An empty intersection can never grow again.
        if not matching_docs:
            break

    return sorted(matching_docs) if matching_docs else []

In [None]:
# Notebook-level NLTK Porter stemmer instance, created once when this cell runs.
ps = PorterStemmer()
def boolean_search(query, inverted_index, biword_index=None):
    """Boolean AND retrieval, ranked by adjacent-term (biword) matches.

    The query is normalized with ``preprocess_query``. A document is returned
    only if it contains every query term according to ``inverted_index``.
    When ``biword_index`` is given, documents in which consecutive query terms
    appear next to each other are ranked first.

    The index shapes are the ones built in this notebook: ``inverted_index``
    maps term -> list of document ids, and ``biword_index`` maps
    "first second" -> list of ``(document id, position)`` tuples. Query terms
    are looked up as returned by ``preprocess_query`` (lemmatized), which is
    the form stored in both indexes.

    Parameters
    ----------
    query : str
        Raw query text.
    inverted_index : dict
        term -> iterable of document ids.
    biword_index : dict, optional
        biword -> iterable of ``(document id, position)``. When omitted, the
        result is a plain AND match ordered by document id.

    Returns
    -------
    list
        Matching document ids, most biword matches first, ties broken by
        ascending document id. ``[]`` when the query is empty or any term is
        missing from the index.
    """
    query_terms = preprocess_query(query)
    if not query_terms:
        return []

    # Strict AND: every term must have postings, and they must all overlap.
    candidate_docs = None
    for term in query_terms:
        postings = inverted_index.get(term)
        if not postings:
            return []
        postings = set(postings)
        candidate_docs = postings if candidate_docs is None else candidate_docs & postings
        if not candidate_docs:
            return []

    # Count, per candidate, how many adjacent query-term pairs occur as a biword.
    phrase_hits = defaultdict(int)
    if biword_index:
        for first, second in zip(query_terms, query_terms[1:]):
            biword_postings = biword_index.get(first + ' ' + second, ())
            # A biword may occur several times in one document; count it once.
            docs_with_biword = {doc_id for doc_id, _position in biword_postings}
            for doc_id in docs_with_biword & candidate_docs:
                phrase_hits[doc_id] += 1

    return sorted(candidate_docs, key=lambda doc_id: (-phrase_hits.get(doc_id, 0), doc_id))

<h2>Wildcard Queries</h2>


In [None]:
def wildcard_search(wildcard_query, index=None):
    """Return the ids of all documents containing a term that matches a wildcard.

    ``*`` stands for any (possibly empty) run of characters; every other
    character is matched literally, and the whole term must match.

    Parameters
    ----------
    wildcard_query : str
        Pattern such as ``"machine*"`` or ``"m*n"``.
    index : dict, optional
        term -> posting collection (a list of document ids, or a dict keyed by
        document id). Defaults to the notebook-level ``inverted_index``.

    Returns
    -------
    set
        Union of the document ids of all matching terms; empty when no term
        matches.
    """
    if index is None:
        index = inverted_index

    # Escape each literal piece so characters such as '.' or '+' are not
    # interpreted as regex syntax; only '*' becomes '.*'.
    literal_parts = (re.escape(part) for part in wildcard_query.split("*"))
    pattern = re.compile(".*".join(literal_parts))

    merged_posting_list = set()
    for term, posting_list in index.items():
        # fullmatch anchors both ends, so "m*n" does not match "many".
        if pattern.fullmatch(term):
            # Iterating a list yields its ids; iterating a dict yields its keys.
            merged_posting_list.update(posting_list)
    return merged_posting_list

In [None]:
# Example: look up the documents whose index terms begin with "machine".
wildcard_query = "machine*"
res=wildcard_search(wildcard_query)
# Display the result of the search.
res

<h2>Similarity Based Retrieval</h2>
<font color='grey'>Similarity-based retrieval model: it calculates a similarity score between the user query and each document in the corpus, ranks the documents by that score, and returns the top results to the user.</font>

In [87]:
def retrieve_documents(query, top_k=1000):
    """Rank the corpus headlines by TF-IDF cosine similarity to ``query``.

    Parameters
    ----------
    query : str
        Raw query text; it is normalized with ``preprocess_query`` first.
    top_k : int or None, optional
        Maximum number of results to return (default 1000). ``None`` returns
        the full ranking.

    Returns
    -------
    list of (int, float)
        ``(document position, similarity score)`` pairs sorted by descending
        score. The position is the 0-based offset of the headline in
        ``corpus['headline_text']``.

    Notes
    -----
    The vectorizer is fitted on the whole corpus on every call.
    """
    processed_query = ' '.join(preprocess_query(query))

    # Learn the vocabulary / IDF weights and vectorize the corpus in one pass.
    vectorizer = TfidfVectorizer()
    document_vectors = vectorizer.fit_transform(corpus['headline_text'].tolist())
    query_vector = vectorizer.transform([processed_query])

    # One query row against all documents -> a single row of scores.
    [similarity_score] = cosine_similarity(query_vector, document_vectors).tolist()

    # Stable sort: documents with equal scores keep their corpus order.
    ranked = sorted(enumerate(similarity_score), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

<h2>Query for both models</h2>

In [None]:
# Run the boolean model on a sample query, passing both the inverted index
# and the biword index.
query = "climate changes"
results = boolean_search(query, inverted_index, biword_index)
# Show at most the first 1000 matching document ids
print(results[:1000])

In [None]:
# Run the similarity model on the same sample query; the last expression
# displays what retrieve_documents returned.
query = "climate changes"
similar_documents = retrieve_documents(query)
similar_documents

In [85]:
# Spot-check: show the headline text stored at row position 393216.
corpus.iloc[393216]['headline_text']

'climate change fight need political ardour'

<h2>Evaluation Of Metrics</h2>


In [104]:
def evaluate_search_engine(relevant_docs, retrieved_docs):
    """Compute precision, recall and F1 of a retrieved set against a relevant set.

    Both inputs are de-duplicated before counting. The true-positive,
    false-positive and false-negative counts are printed.

    Parameters
    ----------
    relevant_docs : iterable
        Ids of the documents considered relevant.
    retrieved_docs : iterable
        Ids of the documents returned by the search engine.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1_score)``. A metric whose denominator would be
        zero (nothing retrieved, nothing relevant, or no overlap) is reported
        as 0.0 instead of raising ZeroDivisionError.
    """
    relevant_set = set(relevant_docs)
    retrieved_set = set(retrieved_docs)

    true_positives = len(relevant_set & retrieved_set)
    false_positives = len(retrieved_set - relevant_set)
    false_negatives = len(relevant_set - retrieved_set)

    print("TP: ",true_positives)
    print("FP: ",false_positives)
    print("FN: ",false_negatives)

    retrieved_total = true_positives + false_positives
    relevant_total = true_positives + false_negatives

    precision = true_positives / retrieved_total if retrieved_total else 0.0
    recall = true_positives / relevant_total if relevant_total else 0.0
    if precision + recall > 0:
        f1_score = 2 * precision * recall / (precision + recall)
    else:
        f1_score = 0.0

    return precision, recall, f1_score

In [105]:
# Use the similarity model's top-ranked document ids as the relevant set and
# score the boolean results against them. Slicing (instead of indexing every
# position in range(1000)) also works when fewer than 1000 documents came back.
evaluate_search_engine([doc_id for doc_id, _score in similar_documents[:1000]], results[:1000])

TP:  246
FP:  754
FN:  754


(0.246, 0.246, 0.246)

<h2>Advanced Search</h2>


<h2>Advanced Search Queries</h2>


<h2>Semantic Matching</h2>


In [None]:
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize

# define a function to calculate semantic similarity between two words using WordNet
def calculate_similarity(word1, word2):
    """Return the highest WordNet path similarity between two words.

    Every synset of ``word1`` is compared with every synset of ``word2`` and
    the largest score is kept.

    Parameters
    ----------
    word1, word2 : str
        Words to compare.

    Returns
    -------
    float
        The best path similarity found, or 0.0 when either word has no
        synsets or no synset pair yields a score. (The latter case used to
        return -1, inconsistent with the "no synsets" case.)
    """
    synsets1 = wn.synsets(word1)
    synsets2 = wn.synsets(word2)
    if not synsets1 or not synsets2:
        return 0.0

    pair_scores = (
        wn.path_similarity(synset1, synset2)
        for synset1 in synsets1
        for synset2 in synsets2
    )
    # path_similarity yields None for pairs it cannot score; ignore those.
    return max((score for score in pair_scores if score is not None), default=0.0)

# define a function to perform semantic matching of a query against a document
def semantic_matching(query):
    """Score every entry of the index against the query using WordNet similarity.

    The query is split on whitespace and only tokens present in
    ``inverted_index`` are kept. For each key of ``inverted_index`` the score
    is the mean, over the kept query terms, of the best
    ``calculate_similarity`` value between the term and the entries stored
    under ``positional_index[key][term]`` (0.0 when the term is absent).

    Parameters
    ----------
    query : str
        Query text, split on whitespace (no further normalization).

    Returns
    -------
    list of (key, float)
        One ``(key, score)`` pair per key of ``inverted_index``, in iteration
        order. All scores are 0.0 when no query token is in the index.
    """
    # NOTE(review): `positional_index` is not defined in the visible part of
    # this notebook - confirm it is built before this function is called.
    # NOTE(review): iterating `inverted_index` yields its keys; in the index
    # built earlier in this notebook those are terms, not document ids -
    # confirm which structure is intended here.
    matching_terms = [token for token in query.split() if token in inverted_index]

    # Without any matching term the mean is undefined; report 0.0 everywhere
    # instead of dividing by zero.
    if not matching_terms:
        return [(document, 0.0) for document in inverted_index]

    scores = []
    for document in inverted_index:
        document_terms = positional_index[document]
        doc_scores = []
        for term in matching_terms:
            if term in document_terms:
                # default=0.0 covers a term whose stored collection is empty.
                best = max(
                    (calculate_similarity(term, doc_token) for doc_token in document_terms[term]),
                    default=0.0,
                )
                doc_scores.append(best)
            else:
                doc_scores.append(0.0)
        scores.append((document, sum(doc_scores)/len(matching_terms)))
    return scores
