In [68]:
import csv
import xml.etree.ElementTree as ET

def parse_trectext_file(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    documents = []

    for doc_elem in root.findall('.//DOC'): 
        document = {}
        document['docno'] = doc_elem.find('DOCNO').text.strip() if doc_elem.find('DOCNO') is not None else ''
        document['text'] = doc_elem.find('TEXT').text.strip() if doc_elem.find('TEXT') is not None else ''
        documents.append(document)

    return documents
def write_to_csv(documents, csv_file):
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['docno', 'text']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for document in documents:
            writer.writerow(document)

file_path = './PsgRobust/doc.trectext'
csv_file = './PsgRobust/documents.csv'

documents = parse_trectext_file(file_path)
write_to_csv(documents, csv_file)

In [69]:
import csv
import pandas as pd

def read_csv(csv_file):
    df = pd.read_csv(csv_file)
    return df

def write_to_csv(documents, csv_file):
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['docno', 'text']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for document in documents:
            writer.writerow(document)

def get_docno(docno):
    return docno.split('-')[0]

def get_doc_text(documents):
    doc_text = ''
    for document in documents:
        doc_text += document['text']
    return doc_text

def get_docno_text_dict(documents):
    docno_text_dict = {}
    for x,document in documents.iterrows():
        docno = get_docno(document['docno'])
        if docno not in docno_text_dict:
            docno_text_dict[docno] = []
        docno_text_dict[docno].append(document['text'])
    return docno_text_dict

csv_file = './PSGRobust/documents.csv'
documents = read_csv(csv_file)
docno_text_dict = get_docno_text_dict(documents)

In [70]:
for docno in docno_text_dict.keys():
    docno_text_dict[docno] = ' '.join(docno_text_dict[docno])


df = pd.DataFrame.from_dict(docno_text_dict, orient='index', columns=['text'])
df = df.reset_index()
df.columns = ['docno', 'text']
df.to_csv('./PsgRobust/documents_passage.csv', index=False)

In [71]:
docno_text_dict.keys()

dict_keys(['FT922', 'FR940406', 'LA073089', 'FBIS4', 'FR941007', 'FR940328', 'FT921', 'LA050289', 'LA041889', 'FBIS3', 'FT934', 'FT932', 'LA031889', 'FR940819', 'FT942', 'LA090690', 'LA110790', 'FT943', 'LA121089', 'LA060490', 'LA012390', 'FT933', 'LA022390', 'LA120789', 'LA072790', 'FR940513', 'LA070490', 'FR940419', 'FT941', 'FR940325', 'LA100790', 'FT931', 'FT924', 'LA063090', 'LA053090', 'LA101290', 'LA100589', 'LA122889', 'FT923', 'FT944', 'LA102890', 'LA030290', 'LA060289', 'LA080489', 'LA040290', 'LA082589', 'LA062589', 'LA043090', 'LA121290', 'LA042389', 'LA102689', 'LA011890', 'FR940627', 'LA021689', 'LA111290', 'LA070289', 'LA090489', 'FR941013', 'LA021890', 'LA112889', 'LA102889', 'LA091190', 'FR940822', 'LA092790', 'LA032389', 'LA072589', 'FR940204', 'LA083089', 'LA052389', 'LA021889', 'LA032390', 'LA110589', 'FR940110', 'LA101089', 'LA080690', 'LA052590', 'FR940224', 'FR940705', 'LA012189', 'LA120790', 'FR940429', 'FR940630', 'FR940916', 'LA111089', 'FR940318', 'FR940711',

In [31]:
# iterate the dict and generate the documents
def generate_documents(docno_text_dict):
    documents = []
    for docno in docno_text_dict.keys():
        document = {}
        document['docno'] = docno
        document['text'] = docno_text_dict[docno]
        documents.append(document)
    return documents
documents = generate_documents(docno_text_dict)

In [32]:
from gensim import corpora
documents = read_csv('./PsgRobust/documents_passage.csv')
texts = [[word for word in document.lower().split()] for document in documents['text']]
dictionary = corpora.Dictionary(texts)
dictionary.save('./PsgRobust/documents_passage.dict')

In [33]:
import string
def remove_punctuation_and_numbers(input_text):
    exclude = set(string.punctuation + string.digits)
    cleaned_text = ''.join(char for char in input_text if char not in exclude)
    return cleaned_text

def remove_specialcharacters(text):
    no_special = "".join([c for c in text if c not in string.punctuation])
    return no_special

def clean_text(text):
    text = text.lower()
    text = remove_punctuation_and_numbers(text)
    text = remove_specialcharacters(text)
    return text

In [34]:
# Build a answer passage retrieval system using BM25
import csv
import pandas as pd 
import numpy as np

from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
documents['text']

0      920530 ft 30 may 92 the long view going for th...
1      site treatment plan the savannah river site wa...
2      july 30 1989 sunday home edition a rap flap am...
3      bfn silvia aloisi report al qadhdhafis secret ...
4      h stephen cranston professional corporation pe...
                             ...                        
899    july 19 1989 wednesday home edition fish repor...
900    november 17 1990 saturday orange county editio...
901    august 27 1990 monday home edition in brief sc...
902    may 30 1989 tuesday p m final u s supreme cour...
903    july 16 1990 monday home edition science medic...
Name: text, Length: 904, dtype: object

In [36]:
# Stop Word Removal
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

def remove_stopwords(text):
    text = [word for word in text if word not in stop_words]
    text = ' '.join(text)
    return text

documents['text'] = documents['text'].apply(lambda x: remove_stopwords(x.split()))
documents['text']

0      920530 ft 30 may 92 long view going jugular wh...
1      site treatment plan savannah river site waste ...
2      july 30 1989 sunday home edition rap flap amid...
3      bfn silvia aloisi report al qadhdhafis secret ...
4      h stephen cranston professional corporation pe...
                             ...                        
899    july 19 1989 wednesday home edition fish repor...
900    november 17 1990 saturday orange county editio...
901    august 27 1990 monday home edition brief scien...
902    may 30 1989 tuesday p final u supreme court up...
903    july 16 1990 monday home edition science medic...
Name: text, Length: 904, dtype: object

In [37]:
# Tokenization 
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

def tokenize(text):
    return word_tokenize(text)

tokenized_docs= documents['text'].apply(lambda x: tokenize(x))
tokenized_docs

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pranavdeepak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0      [920530, ft, 30, may, 92, long, view, going, j...
1      [site, treatment, plan, savannah, river, site,...
2      [july, 30, 1989, sunday, home, edition, rap, f...
3      [bfn, silvia, aloisi, report, al, qadhdhafis, ...
4      [h, stephen, cranston, professional, corporati...
                             ...                        
899    [july, 19, 1989, wednesday, home, edition, fis...
900    [november, 17, 1990, saturday, orange, county,...
901    [august, 27, 1990, monday, home, edition, brie...
902    [may, 30, 1989, tuesday, p, final, u, supreme,...
903    [july, 16, 1990, monday, home, edition, scienc...
Name: text, Length: 904, dtype: object

In [38]:
# Lemmatization
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    text = [lemmatizer.lemmatize(word) for word in text]
    text = ' '.join(text)
    return text

lemmatized_text = documents['text'][:1000].apply(lambda x: lemmatize_text(x))
lemmatized_text

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pranavdeepak/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0      9 2 0 5 3 0   f t   3 0   m a y   9 2   l o n ...
1      s i t e   t r e a t m e n t   p l a n   s a v ...
2      j u l y   3 0   1 9 8 9   s u n d a y   h o m ...
3      b f n   s i l v i a   a l o i s i   r e p o r ...
4      h   s t e p h e n   c r a n s t o n   p r o f ...
                             ...                        
899    j u l y   1 9   1 9 8 9   w e d n e s d a y   ...
900    n o v e m b e r   1 7   1 9 9 0   s a t u r d ...
901    a u g u s t   2 7   1 9 9 0   m o n d a y   h ...
902    m a y   3 0   1 9 8 9   t u e s d a y   p   f ...
903    j u l y   1 6   1 9 9 0   m o n d a y   h o m ...
Name: text, Length: 904, dtype: object

In [39]:
# Stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def stem_text(text):
    text = [stemmer.stem(word) for word in text]
    text = ' '.join(text)
    return text

stemmed_text = documents['text'].apply(lambda x: stem_text(x))
stemmed_text


0      9 2 0 5 3 0   f t   3 0   m a y   9 2   l o n ...
1      s i t e   t r e a t m e n t   p l a n   s a v ...
2      j u l y   3 0   1 9 8 9   s u n d a y   h o m ...
3      b f n   s i l v i a   a l o i s i   r e p o r ...
4      h   s t e p h e n   c r a n s t o n   p r o f ...
                             ...                        
899    j u l y   1 9   1 9 8 9   w e d n e s d a y   ...
900    n o v e m b e r   1 7   1 9 9 0   s a t u r d ...
901    a u g u s t   2 7   1 9 9 0   m o n d a y   h ...
902    m a y   3 0   1 9 8 9   t u e s d a y   p   f ...
903    j u l y   1 6   1 9 9 0   m o n d a y   h o m ...
Name: text, Length: 904, dtype: object

In [40]:
documents['text'][0]



In [41]:
def answer_passage_retrieval(query, documents, top_n):
    query = clean_text(query)
    documents['text'] = documents['text'].apply(clean_text)
    tokenized_corpus = [doc.split(" ") for doc in documents['text']]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = query.split(" ")
    doc_scores = bm25.get_scores(tokenized_query)
    doc_scores = list(enumerate(doc_scores))
    doc_scores = sorted(doc_scores, key=lambda x: x[1], reverse=True)
    doc_scores = doc_scores[:top_n]
    print(doc_scores)
    doc_ids = [str(doc_score[0]) for doc_score in doc_scores]
    doc_scores = [str(doc_score[1]) for doc_score in doc_scores]
    return doc_ids, doc_scores

In [42]:
answer_passage_retrieval('where are wind power installations located', documents, 10)

[(9, 9.53555934633467), (462, 9.42540399499724), (250, 9.387272482414886), (443, 9.10307420833515), (226, 8.84764609438139), (154, 8.819093797194354), (94, 8.793966421190149), (331, 8.755741275152545), (678, 8.554577531600085), (391, 8.344485626278747)]


(['9', '462', '250', '443', '226', '154', '94', '331', '678', '391'],
 ['9.53555934633467',
  '9.42540399499724',
  '9.387272482414886',
  '9.10307420833515',
  '8.84764609438139',
  '8.819093797194354',
  '8.793966421190149',
  '8.755741275152545',
  '8.554577531600085',
  '8.344485626278747'])

In [43]:
# QL model for answer passage retrieval
import csv
import pandas as pd
import numpy as np
import math

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [44]:
def answer_passage_retrieval_ql(query, documents, top_n):
    query = clean_text(query)
    documents['text'] = documents['text'].apply(clean_text)
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(documents['text']))
    tfidf = tfidf.toarray()
    query_vec = vectorizer.transform([query]).toarray()
    scores = np.dot(query_vec, tfidf.T)
    scores = scores[0]
    scores = list(enumerate(scores))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    scores = scores[:top_n]
    doc_ids = [str(score[0]) for score in scores]
    scores = [str(score[1]) for score in scores]
    return doc_ids, scores


In [45]:
documents

Unnamed: 0,docno,text
0,FT922,ft may long view going jugular whenever mee...
1,FR940406,site treatment plan savannah river site waste ...
2,LA073089,july sunday home edition rap flap amid anti ...
3,FBIS4,bfn silvia aloisi report al qadhdhafis secret ...
4,FR941007,h stephen cranston professional corporation pe...
...,...,...
899,LA071989,july wednesday home edition fish report two ...
900,LA111790,november saturday orange county edition ask ...
901,LA082790,august monday home edition brief science med...
902,LA053089,may tuesday p final u supreme court upholds ...


In [46]:
answer_passage_retrieval_ql('where are wind power installations located', documents, 10)

(['185', '415', '529', '565', '579', '241', '604', '16', '649', '456'],
 ['0.19719898346036596',
  '0.18479491396431105',
  '0.14969010475407876',
  '0.14869523395510742',
  '0.14444174094801004',
  '0.13290731170378747',
  '0.12274053574414193',
  '0.11043967622152376',
  '0.10466725175049657',
  '0.0995867365218894'])

In [47]:
documents

Unnamed: 0,docno,text
0,FT922,ft may long view going jugular whenever mee...
1,FR940406,site treatment plan savannah river site waste ...
2,LA073089,july sunday home edition rap flap amid anti ...
3,FBIS4,bfn silvia aloisi report al qadhdhafis secret ...
4,FR941007,h stephen cranston professional corporation pe...
...,...,...
899,LA071989,july wednesday home edition fish report two ...
900,LA111790,november saturday orange county edition ask ...
901,LA082790,august monday home edition brief science med...
902,LA053089,may tuesday p final u supreme court upholds ...


In [48]:
import re
from collections import Counter
corpus = []
for i, document in documents.iterrows():
    corpus.append({"id": i, "text": document['text']})
len(corpus)
# Store the corpus in a file
import json
with open('./PsgRobust/corpus.json', 'w') as fout:
    json.dump(corpus, fout)

# Load the corpus from the file
import json
with open('./PsgRobust/corpus.json', 'r') as fin:
    corpus = json.load(fin)