In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
# Load the passage collection. The TSV file has no header row, so the two
# columns are named explicitly.
dataset = pd.read_csv(
    'data/collection.tsv',
    sep='\t',
    header=None,
    names=["id", "text"],
)

# The first rows of the collection are previewed in the following cell.


In [6]:
# Preview the first 20 rows of the passage collection
dataset.head(20)

Unnamed: 0,id,text
0,0,The presence of communication amid scientific ...
1,1,The Manhattan Project and its atomic bomb help...
2,2,Essay on The Manhattan Project - The Manhattan...
3,3,The Manhattan Project was the name for a proje...
4,4,versions of each volume as well as complementa...
5,5,The Manhattan Project. This once classified ph...
6,6,Nor will it attempt to substitute for the extr...
7,7,Manhattan Project. The Manhattan Project was a...
8,8,"In June 1942, the United States Army Corps of ..."
9,9,One of the main reasons Hanford was selected a...


In [18]:
# Load the training and test query sets, each from its own CSV file.
train_querys, test_querys = (
    pd.read_csv(csv_path)
    for csv_path in ('data/queries_train.csv', 'data/queries_test.csv')
)

In [19]:
# Preview the first rows of the training queries
train_querys.head()

Unnamed: 0,qid,query,topic_number,turn_number
0,4_1,What was the neolithic revolution?,4,1
1,4_2,When did it start and end?,4,2
2,4_3,Why did it start?,4,3
3,4_4,What did the neolithic invent?,4,4
4,4_5,What tools were used?,4,5


In [20]:
# Preview the first rows of the test queries
test_querys.head()

Unnamed: 0,qid,query,topic_number,turn_number
0,1_1,What is a physician's assistant?,1,1
1,1_2,What are the educational requirements required...,1,2
2,1_3,What does it cost?,1,3
3,1_4,What's the average starting salary in the UK?,1,4
4,1_5,What about in the US?,1,5


In [21]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing

# The English stopword list is an NLTK corpus that is installed separately
# from the nltk package. Accessing it raises LookupError when it is missing,
# so fetch it on demand; this lets the notebook run on a fresh environment.
# If the download itself fails (e.g. no network), the retry below raises the
# same LookupError as before.
# NOTE(review): word_tokenize (used by preprocess_text) also needs the NLTK
# 'punkt' tokenizer data ('punkt_tab' on recent NLTK releases) -- confirm it
# is installed, e.g. via nltk.download('punkt').
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Tokenize, lowercase, and remove stopwords from the text.

    Args:
        text: Raw passage or query string. Non-string values (for example
            the NaN pandas produces for an empty field, or None) are
            treated as empty text.

    Returns:
        The surviving tokens joined by single spaces, or "" when no token
        survives or the input is not a string.
    """
    # Missing values have no .lower(); map them to an empty document instead
    # of letting one bad row abort the whole preprocessing run.
    if not isinstance(text, str):
        return ""

    tokens = word_tokenize(text.lower())
    # Keep only purely alphabetic tokens that are not stopwords; this drops
    # punctuation, numbers and any token containing a non-letter character.
    filtered_tokens = [word for word in tokens if word not in stop_words and word.isalpha()]
    return " ".join(filtered_tokens)

from joblib import Parallel, delayed

def multiprocess_preprocess_joblib(data, column_name):
    """Apply ``preprocess_text`` to every entry of one column, in parallel.

    Args:
        data: Object indexable by ``column_name`` (the notebook passes
            DataFrames) whose selected column is iterated entry by entry.
        column_name: Name of the column holding the raw text.

    Returns:
        The result of the joblib ``Parallel`` call: the preprocessed
        strings, one per entry of the column.
    """
    # One worker per CPU core reported by the operating system.
    worker_count = multiprocessing.cpu_count()
    run_parallel = Parallel(n_jobs=worker_count)

    # Each column entry becomes its own delayed preprocess_text task.
    return run_parallel(delayed(preprocess_text)(entry) for entry in data[column_name])

# Add a preprocessed copy of the text to each frame: passages are written to
# dataset['processed_text'], queries to the 'processed_query' column of
# train_querys and test_querys.
dataset['processed_text'] = multiprocess_preprocess_joblib(dataset, 'text')
train_querys['processed_query'] = multiprocess_preprocess_joblib(train_querys, 'query')
test_querys['processed_query'] = multiprocess_preprocess_joblib(test_querys, 'query')

In [22]:
from rank_bm25 import BM25Okapi

# Split every preprocessed passage into its tokens so the corpus is a list
# of token lists.
tokenized_passages = [passage.split() for passage in dataset['processed_text']]

# Index the whole collection with BM25.
bm25 = BM25Okapi(tokenized_passages)

# Smoke-test the index: fetch the five best raw passages for the first
# preprocessed test query.
sample_query = test_querys['processed_query'].iloc[0]
sample_results = bm25.get_top_n(
    sample_query.split(),
    dataset['text'].tolist(),
    n=5,
)

sample_results

["Compare Salaries. 1  Physician Office Assistant. 2  Physician Office Assistant II. 3  Assistant Professor/Physician.  Assistant Professor/Attending 1  Physician. Physician's Assistant.  Assistant Attending Physician.",
 "Salary for Physician Assistants. Also known as: Anesthesiologist Assistant, Certified Physician's Assistant, Family Practice Physician Assistant, Orthopaedic Physician Assistant, Orthopedic Physician Assistant, Pediatric Physician Assistant, Radiology Practitioner Assistant, Surgical Physician Assistant.",
 'PA-C stands for physician assistant - certified. Certified Physician Assistant. PA-C stands for physician assistant - certified. i do not know. PA-C stands for physician assistant - certified. Certified Physician Assistant. PA-C stands for physician assistant - certified.',
 "Salary for Anesthesiologist Assistants. Also known as: Anesthesiologist Assistant, Certified Physician's Assistant, Family Practice Physician Assistant, Orthopaedic Physician Assistant, Orth

In [25]:
from tqdm import tqdm
def generate_trec_runfile(ranking_results, run_identifier, output_file,
                          collection=None, id_column="id"):
    """Write ranking results to ``output_file`` as a TREC run file.

    Every output line has the form
    ``<qid> Q0 <doc_id> <rank> <score> <run_identifier>``.

    Args:
        ranking_results: Mapping from query id (``"<topic>_<turn>"``) to an
            iterable of positional row indices into the collection, ordered
            best first.
        run_identifier: Run tag written as the last field of every line.
        output_file: Path of the file to create; it is opened in ``'w'``
            mode, so existing content is overwritten.
        collection: DataFrame holding the passages. Defaults to the
            notebook-level ``dataset`` frame.
        id_column: Column of ``collection`` that holds the document id.
            Defaults to ``"id"``, the name assigned when the collection is
            loaded.

    Raises:
        KeyError: If ``id_column`` is not a column of the collection.
        ValueError: If a query id does not consist of exactly two parts
            separated by a single underscore.
    """
    if collection is None:
        collection = dataset

    # Resolve the id column once, before the output file is opened: a wrong
    # column name then fails without creating or truncating the file, and the
    # per-line lookup becomes a plain positional array access.
    doc_ids = collection[id_column].to_numpy()

    with open(output_file, 'w') as file:
        for qid, passage_indices in ranking_results.items():
            # The turn identifier depends only on the query, so it is built
            # once per query. Unpacking into two names also validates the
            # "<topic>_<turn>" shape of the qid.
            topic_id, turn_id = qid.split("_")
            turn_identifier = f"{topic_id}_{turn_id}"

            for rank, passage_idx in enumerate(passage_indices, 1):
                doc_id = doc_ids[passage_idx]

                # The BM25 score is not available here; the negated rank is
                # written instead so that scores decrease with rank.
                score = -rank

                file.write(f"{turn_identifier} Q0 {doc_id} {rank} {score} {run_identifier}\n")


# Settings for retrieving the top 1000 passages for each query in test_querys
ranking_results_new = {}  # NOTE(review): not referenced again in the visible cells
top_k = 1000  # number of passages requested per query
run_id_new = "BM25_integration_run"  # run tag passed to generate_trec_runfile


def retrieve_rankings(row):
    """Rank the collection for one query row.

    Reads ``row['qid']`` and ``row['processed_query']``, asks the
    notebook-level ``bm25`` index for the ``top_k`` best entries, and
    returns ``(qid, top_indices)``. The candidates handed to the index are
    the positions ``0 .. len(tokenized_passages) - 1``, so the result holds
    passage positions rather than passage texts.
    """
    query_id = row['qid']
    query_terms = row['processed_query'].split()
    candidate_positions = range(len(tokenized_passages))
    return query_id, bm25.get_top_n(query_terms, candidate_positions, n=top_k)

# Rank every test query with one joblib worker per CPU core; each task
# returns a (qid, ranked passage positions) pair.
num_processes = multiprocessing.cpu_count()
ranking_results_parallel = Parallel(n_jobs=num_processes)(
    delayed(retrieve_rankings)(query_row)
    for _row_label, query_row in test_querys.iterrows()
)

# Key the ranked positions by query id.
ranking_results_dict = dict(ranking_results_parallel)

# Write the TREC run file from the collected rankings.
output_filename_parallel = "/mnt/data/trec_runfile_parallel.txt"
generate_trec_runfile(ranking_results_dict, run_id_new, output_filename_parallel)

# Show where the run file was written.
output_filename_parallel

