In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing
import os.path

# NLTK resource downloads, left commented out because there is no internet access here.
# Uncomment them if the 'stopwords' / 'punkt' resources are not already installed.
#nltk.download('stopwords')
#nltk.download('punkt')

In [None]:
from joblib import Parallel, delayed

def preprocess_text(text):
    """Tokenize, lowercase, and remove stopwords from the text.

    Args:
        text: The raw string to clean. Non-string values (e.g. ``None`` or the
            float NaN that pandas uses for a missing cell) are treated as
            empty text instead of raising ``AttributeError`` on ``.lower()``.

    Returns:
        A single space-joined string of the lowercased tokens that are purely
        alphabetic and are not in the module-level ``stop_words`` set. Returns
        ``""`` when nothing survives filtering or the input is not a string.
    """
    if not isinstance(text, str):
        return ""
    tokens = word_tokenize(text.lower())
    # isalpha() discards punctuation and numeric tokens.
    filtered_tokens = [word for word in tokens if word not in stop_words and word.isalpha()]
    return " ".join(filtered_tokens)


def multiprocess_preprocess_joblib(data, column_name):
    """Run ``preprocess_text`` over one column of ``data`` in parallel via joblib.

    Args:
        data: Object indexable by ``column_name`` that yields the texts to clean.
        column_name: Name of the column holding the raw texts.

    Returns:
        The list produced by joblib's ``Parallel`` call, one cleaned string per
        input text.
    """
    # One worker per CPU reported by the multiprocessing module.
    worker_count = multiprocessing.cpu_count()
    runner = Parallel(n_jobs=worker_count)

    # Lazily describe one preprocess_text call per entry; joblib dispatches them.
    pending_calls = (delayed(preprocess_text)(entry) for entry in data[column_name])
    return runner(pending_calls)

# We'll use a predefined list of English stopwords.
stop_words = set(stopwords.words('english'))


def _load_or_preprocess(raw_path, cache_path, column_name, cache_sep):
    """Return the preprocessed frame for ``raw_path``, cached on disk at ``cache_path``.

    If ``cache_path`` exists it is read back and returned. Otherwise the
    tab-separated file at ``raw_path`` is loaded, ``column_name`` is replaced by
    its preprocessed version, and the result is written to ``cache_path``.

    The existence check uses the very same path that is written and read, so a
    cache created by one run is found by the next one.

    Args:
        raw_path: Tab-separated input file.
        cache_path: File used to store / reload the preprocessed frame.
        column_name: Column whose text is preprocessed.
        cache_sep: Field separator used for the cache file.

    Returns:
        A DataFrame in which ``column_name`` holds the preprocessed text.
    """
    if os.path.isfile(cache_path):
        return pd.read_csv(cache_path, sep=cache_sep)

    frame = pd.read_csv(raw_path, sep='\t')
    processed = multiprocess_preprocess_joblib(frame, column_name)
    # Drop-then-assign leaves the processed column as the last column, the same
    # layout the add / drop / rename sequence produced.
    frame = frame.drop(columns=[column_name])
    frame[column_name] = processed
    frame.to_csv(cache_path, sep=cache_sep, index=False)
    return frame


dataset = _load_or_preprocess(
    'data/collection.tsv', 'data/preprocessed_collection.tsv', 'text', '\t'
)
# A passage that is empty after preprocessing is written as an empty field and
# comes back from read_csv as NaN. Drop such rows on BOTH paths so the first
# run and cached runs keep the same passages at the same positions.
dataset = dataset[dataset['text'] != ''].dropna()

train_querys = _load_or_preprocess(
    'data/train.tsv', 'data/preprocessed_queries_train.csv', 'query', ','
)

test_querys = _load_or_preprocess(
    'data/test.tsv', 'data/preprocessed_queries_test.csv', 'query', ','
)


In [15]:
from rank_bm25 import BM25Okapi
import pickle

tokenized_passages = dataset['text'].apply(lambda x: x.split()).tolist()

# Single source of truth for the index location: the file that is written is
# the file that is read back.
BM25_INDEX_PATH = 'data/bm25_index.pickle'

bm25 = None
if os.path.isfile(BM25_INDEX_PATH):
    # NOTE: pickle.load executes arbitrary code from the file; only load an
    # index that this notebook created itself.
    with open(BM25_INDEX_PATH, 'rb') as handle:
        bm25 = pickle.load(handle)
    # An index built from a different version of the collection cannot be used:
    # its scores would not line up with the positions in tokenized_passages.
    if bm25.corpus_size != len(tokenized_passages):
        print("Index data does not match the current collection, recreating...")
        bm25 = None

# If the pickled index is not present (or is stale), build it and save it.
if bm25 is None:
    print("Index data not present, creating...")
    bm25 = BM25Okapi(tokenized_passages)
    with open(BM25_INDEX_PATH, 'wb') as handle:
        pickle.dump(bm25, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
from tqdm import tqdm
def generate_trec_runfile(ranking_results, run_identifier, output_file, collection=None):
    """Generate a TREC runfile using the given ranking results.

    Each output line has the form
    ``<topic>_<turn> Q0 <pid> <rank> <score> <run_identifier>``.

    Args:
        ranking_results: Mapping from a query id of the form ``"<topic>_<turn>"``
            to an ordered sequence of row positions into the collection,
            best match first.
        run_identifier: Run tag written as the last field of every line.
        output_file: Path of the file to (over)write.
        collection: Frame with a ``pid`` column that the row positions refer
            to. Defaults to the module-level ``dataset``.

    Raises:
        ValueError: If a query id does not contain exactly one underscore.
    """
    if collection is None:
        collection = dataset

    # Pull the id column out once; indexing it by position is far cheaper than
    # materialising a whole row with .iloc for every output line.
    pids = collection['pid'].to_numpy()

    with open(output_file, 'w') as file:
        for qid, passage_indices in ranking_results.items():
            # Construct the turn identifier from the qid (done once per query,
            # it does not depend on the passage).
            topic_id, turn_id = qid.split("_")
            turn_identifier = f"{topic_id}_{turn_id}"

            for rank, passage_idx in enumerate(passage_indices, 1):
                # Retrieve the document ID from the collection using the passage index
                doc_id = pids[passage_idx]

                # The BM25 score could be retrieved and used here, but for simplicity,
                # we're using the rank as a negative score (to ensure descending order)
                score = -rank

                # Write the formatted line to the file
                file.write(f"{turn_identifier} Q0 {doc_id} {rank} {score} {run_identifier}\n")


# Settings for retrieving the top 1000 passages for each query in test_querys.
run_id_new = "BM25_integration_run"  # run tag for the generated runfile
top_k = 1000                         # how many passages to keep per query
ranking_results_new = {}             # empty results mapping


def retrieve_rankings(row):
    """Retrieve the ``top_k`` best-scoring passage positions for one query row.

    Args:
        row: Mapping-like row exposing ``'qid'`` and ``'query'``.

    Returns:
        A ``(qid, top_indices)`` tuple where ``top_indices`` are positions into
        ``tokenized_passages`` as ranked by the module-level ``bm25`` index.
    """
    qid = row['qid']
    query_text = row['query']
    # A query that is empty after preprocessing is reloaded from the CSV cache
    # as NaN (a float). Treat it as an empty token list, which is what
    # "".split() yields on a fresh, uncached run.
    query_tokens = query_text.split() if isinstance(query_text, str) else []
    top_indices = bm25.get_top_n(query_tokens, range(len(tokenized_passages)), n=top_k)
    print(f"Retrieved all passages for query {qid}")
    return qid, top_indices
from tqdm import tqdm

# Rank the passages for every test query, keyed by query id.
ranking_results_dict = {}
for _, row in tqdm(test_querys.iterrows(), total=test_querys.shape[0]):
    qid, top_indices = retrieve_rankings(row)
    ranking_results_dict[qid] = top_indices

# Generate the TREC runfile using the results.
# Written under the relative data/ directory like every other file in this
# notebook; the previous leading slash pointed at the filesystem root instead.
output_filename_parallel = "data/trec_runfile_parallel.txt"
generate_trec_runfile(ranking_results_dict, run_id_new, output_filename_parallel)

output_filename_parallel


  0%|          | 1/248 [00:27<1:51:19, 27.04s/it]

Retrieved all passages for query 1_1


  0%|          | 1/248 [01:22<5:39:08, 82.38s/it]


KeyboardInterrupt: 