## Arxive metadata search Engine 
### Team:
### PES1UG20CS006: Abhijeet Wadiyar
### PES1UG20CS040: Anant Gulati
### PES1UG20CS060: Anoushka Gupta
### PES1UG20CS061: Ansh CS

In [17]:
import numpy as np
import pandas as pd
import time
import json
import re
import polars as pl
from collections import defaultdict
from nltk.corpus import stopwords 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

**Reading data from a JSON into a pandas data frame**

In [18]:
df = pd.read_json('/kaggle/input/arxivdataset/arxivData.json')
df.head()

Unnamed: 0,author,day,id,link,month,summary,tag,title,year
0,"[{'name': 'Ahmed Osman'}, {'name': 'Wojciech S...",1,1802.00209v1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",2,We propose an architecture for VQA which utili...,"[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",Dual Recurrent Attention Units for Visual Ques...,2018
1,"[{'name': 'Ji Young Lee'}, {'name': 'Franck De...",12,1603.03827v1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",3,Recent approaches based on artificial neural n...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",Sequential Short-Text Classification with Recu...,2016
2,"[{'name': 'Iulian Vlad Serban'}, {'name': 'Tim...",2,1606.00776v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",6,We introduce the multiresolution recurrent neu...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",Multiresolution Recurrent Neural Networks: An ...,2016
3,"[{'name': 'Sebastian Ruder'}, {'name': 'Joachi...",23,1705.08142v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",5,Multi-task learning is motivated by the observ...,"[{'term': 'stat.ML', 'scheme': 'http://arxiv.o...",Learning what to share between loosely related...,2017
4,"[{'name': 'Iulian V. Serban'}, {'name': 'Chinn...",7,1709.02349v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",9,We present MILABOT: a deep reinforcement learn...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",A Deep Reinforcement Learning Chatbot,2017


**Converting the pandas dataframe into a polars dataframe to increase efficiency of compuatations**

In [19]:
pldf = pl.from_pandas(df)

**Exctarcting relavent collumns**

In [20]:
pldf = pldf.select(pl.col('title'),pl.col('author'),pl.col('summary').apply(lambda x: x.replace("\n","").lower()),pl.col('year'))

In [21]:
pldf.head()

title,author,summary,year
str,str,str,i64
"""Dual Recurrent...","""[{'name': 'Ahm...","""we propose an ...",2018
"""Sequential Sho...","""[{'name': 'Ji ...","""recent approac...",2016
"""Multiresolutio...","""[{'name': 'Iul...","""we introduce t...",2016
"""Learning what ...","""[{'name': 'Seb...","""multi-task lea...",2017
"""A Deep Reinfor...","""[{'name': 'Iul...","""we present mil...",2017


In [23]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Implementing Stemming**

In [24]:
stop_words = set(stopwords.words('english'))

#stemming
stemmer = PorterStemmer()
pldf = pldf.select(pl.col('title'),pl.col('author'),pl.col('summary').apply(lambda x: word_tokenize(x)),pl.col('year'))        

In [25]:
pldf = pldf.select(pl.col('title'),pl.col('author'),pl.col('summary').apply(lambda x: [word for word in x if word.casefold() not in stop_words] ),pl.col('year')) 

In [26]:
pldf = pldf.select(pl.col('title'),pl.col('author'),pl.col('summary').apply(lambda x: [stemmer.stem(word) for word in x] ),pl.col('year')) 

In [27]:
pldf.head()

title,author,summary,year
str,str,list[str],i64
"""Dual Recurrent...","""[{'name': 'Ahm...","[""propos"", ""architectur"", ... "".""]",2018
"""Sequential Sho...","""[{'name': 'Ji ...","[""recent"", ""approach"", ... "".""]",2016
"""Multiresolutio...","""[{'name': 'Iul...","[""introduc"", ""multiresolut"", ... "".""]",2016
"""Learning what ...","""[{'name': 'Seb...","[""multi-task"", ""learn"", ... "".""]",2017
"""A Deep Reinfor...","""[{'name': 'Iul...","[""present"", ""milabot"", ... "".""]",2017


**Creating Inverted index**

In [28]:
inverted_index = {}

# Loop over each document
for doc_id, doc in enumerate(pldf['summary']):
    # Create the index entries for each term in the document
    for position, term in enumerate(doc):
        if term not in inverted_index:
            inverted_index[term] = {}
        if doc_id not in inverted_index[term]:
            inverted_index[term][doc_id] = []
        inverted_index[term][doc_id].append(position)

In [31]:
merged_posting_list

{4,
 57,
 71,
 84,
 102,
 106,
 116,
 128,
 170,
 258,
 268,
 323,
 338,
 389,
 403,
 416,
 451,
 493,
 561,
 600,
 736,
 752,
 835,
 844,
 846,
 858,
 894,
 916,
 974,
 994,
 996,
 1005,
 1066,
 1089,
 1112,
 1143,
 1169,
 1243,
 1253,
 1254,
 1297,
 1474,
 1485,
 1559,
 1567,
 1618,
 1629,
 1651,
 1694,
 1707,
 1719,
 1820,
 1833,
 1850,
 1860,
 1864,
 1913,
 1929,
 2008,
 2012,
 2018,
 2076,
 2099,
 2114,
 2115,
 2126,
 2189,
 2208,
 2306,
 2321,
 2426,
 2498,
 2523,
 2571,
 2600,
 2711,
 2745,
 2804,
 2808,
 2930,
 2947,
 2994,
 3083,
 3084,
 3093,
 3097,
 3186,
 3187,
 3193,
 3198,
 3284,
 3358,
 3378,
 3420,
 3513,
 3546,
 3562,
 3621,
 3630,
 3635,
 3645,
 3663,
 3743,
 3778,
 3796,
 3832,
 3845,
 3860,
 4029,
 4107,
 4198,
 4263,
 4384,
 4403,
 4539,
 4551,
 4581,
 4582,
 4714,
 4724,
 4730,
 4829,
 4857,
 4890,
 4927,
 4946,
 5008,
 5087,
 5200,
 5212,
 5278,
 5320,
 5337,
 5366,
 5386,
 5464,
 5471,
 5620,
 5766,
 5890,
 5903,
 5939,
 5951,
 5959,
 6027,
 6032,
 6051,
 6187,


**Function to parse the query for boolean searches**

In [61]:
def parse_query(infix_tokens):
    precedence = {}
    precedence['NOT'] = 3
    precedence['AND'] = 2
    precedence['OR'] = 1
    precedence['('] = 0
    precedence[')'] = 0

    output = []
    operator_stack = []

    for token in infix_tokens:
        if token == '(':
            operator_stack.append(token)
        elif token == ')':
            operator = operator_stack.pop()
            while operator != '(':
                output.append(operator)
                operator = operator_stack.pop()
        elif token in precedence:
            while operator_stack and precedence[operator_stack[-1]] >= precedence[token]:
                output.append(operator_stack.pop())
            operator_stack.append(token)
        else:
            output.append(token.lower())

    while operator_stack:
        output.append(operator_stack.pop())

    return output


**Boolean search function**

In [62]:
def boolean_query(query, inverted_index):
    query = query.strip()
    query_tokens = query.split()
    boolean_query = parse_query(query_tokens)
    
    def evaluate(expression):
        if isinstance(expression, str):
            return set(inverted_index.get(expression, []))
        elif len(expression) == 1:
            return evaluate(expression[0])
        elif expression[0] == "AND":
            return set.intersection(*[evaluate(expr) for expr in expression[1:]])
        elif expression[0] == "OR":
            return set.union(*[evaluate(expr) for expr in expression[1:]])
        elif expression[0] == "NOT":
            complement = evaluate(expression[1])
            return set(doc_id for doc_id in inverted_index.keys() if doc_id not in complement)
    
    result = evaluate(boolean_query)
    return result


In [63]:
res=boolean_query("artificial AND intelligence",inverted_index)

In [64]:
res

**Implementing a function for wild card query search**

In [59]:
def wildcard_search(wildcard_query):
    pattern = re.compile(wildcard_query.replace("*", ".*"))

    matching_terms = [term for term in inverted_index.keys() if re.match(pattern, term)]

    merged_posting_list = set()
    for term in matching_terms:
        posting_list = inverted_index[term]
        merged_posting_list |= set(posting_list.keys())
        return merged_posting_list

In [60]:
wildcard_query = "machine*"
res=wildcard_search(wildcard_query)
res

{4,
 116,
 258,
 268,
 323,
 338,
 403,
 416,
 736,
 752,
 835,
 858,
 894,
 996,
 1066,
 1089,
 1112,
 1143,
 1254,
 1297,
 1618,
 1694,
 1850,
 1913,
 1929,
 2076,
 2114,
 2126,
 2189,
 2306,
 2321,
 2523,
 2571,
 2711,
 2804,
 2808,
 2947,
 3083,
 3093,
 3097,
 3186,
 3187,
 3193,
 3198,
 3284,
 3378,
 3513,
 3562,
 3621,
 3630,
 3635,
 3645,
 3663,
 3778,
 3796,
 3860,
 4029,
 4263,
 4403,
 4539,
 4581,
 4582,
 4714,
 4724,
 4829,
 4890,
 4927,
 4946,
 5008,
 5087,
 5200,
 5212,
 5278,
 5320,
 5464,
 5471,
 5766,
 5890,
 5951,
 5959,
 6027,
 6032,
 6051,
 6268,
 6285,
 6286,
 6294,
 6324,
 6339,
 6446,
 6527,
 6584,
 6688,
 6780,
 6917,
 6966,
 6996,
 7011,
 7017,
 7031,
 7065,
 7079,
 7128,
 7142,
 7171,
 7352,
 7361,
 7377,
 7400,
 7425,
 7452,
 7505,
 7542,
 7559,
 7679,
 7702,
 7708,
 7747,
 7766,
 7805,
 7819,
 7986,
 7994,
 8009,
 8012,
 8017,
 8188,
 8562,
 8594,
 8641,
 8789,
 8813,
 8829,
 8970,
 8988,
 9082,
 9094,
 9255,
 9263,
 9303,
 9364,
 9372,
 9397,
 9430,
 9569,
 

**similarity index implementation**

In [None]:
def retrieve_relevant_text(query):
    # Perform search and retrieve relevant documents
    relevant_docs = search(query)
    
    # Retrieve the relevant text from the documents
    relevant_text = [df.loc[df['id'] == doc]['Description'].values[0] for doc in relevant_docs]
    
    return relevant_text


**Semantic matching**

In [79]:
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize

# define a function to calculate semantic similarity between two words using WordNet
def calculate_similarity(word1, word2):
    synsets1 = wn.synsets(word1)
    synsets2 = wn.synsets(word2)
    if not synsets1 or not synsets2:
        return 0.0
    max_sim = -1
    for synset1 in synsets1:
        for synset2 in synsets2:
            sim = wn.path_similarity(synset1, synset2)
            if sim is not None and sim > max_sim:
                max_sim = sim
    return max_sim

# define a function to perform semantic matching of a query against a document
def semantic_matching(query):
    matching_terms = []
    for token in query.split():
        if token in inverted_index:
            matching_terms.append(token)

    scores = []
    for document in inverted_index:
        doc_scores = []
        for term in matching_terms:
            if term in positional_index[document]:
                doc_scores.append(max([calculate_similarity(term, doc_token) for doc_token in positional_index[document][term]]))
            else:
                doc_scores.append(0.0)
        scores.append((document, sum(doc_scores)/len(matching_terms)))
    return scores
