In [1]:
import csv

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
# Load the passage collection (tab-separated, no header row) as a DataFrame.
# The passages are free text and may contain literal double quotes. With the
# default quoting mode pandas interprets '"' as a quote character, which can
# strip characters from a passage or merge several physical lines into one
# field. QUOTE_NONE keeps every character verbatim, one row per line.
dataset = pd.read_csv(
    'data/collection.tsv',
    sep='\t',
    header=None,
    names=["id", "text"],
    quoting=csv.QUOTE_NONE,
)

#check the first 20 rows without cut text


    id                                               text
0    0  The presence of communication amid scientific ...
1    1  The Manhattan Project and its atomic bomb help...
2    2  Essay on The Manhattan Project - The Manhattan...
3    3  The Manhattan Project was the name for a proje...
4    4  versions of each volume as well as complementa...
5    5  The Manhattan Project. This once classified ph...
6    6  Nor will it attempt to substitute for the extr...
7    7  Manhattan Project. The Manhattan Project was a...
8    8  In June 1942, the United States Army Corps of ...
9    9  One of the main reasons Hanford was selected a...
10  10  group discussions, community boards or panels ...
11  11  punishment designed to repair the damage done ...
12  12  Tutorial: Introduction to Restorative Justice....
13  13  Organize volunteer community panels, boards, o...
14  14  The purpose of this paper is to point out a nu...
15  15  Each of these types of communitiesâthe geogr...
16  16  The ap

In [6]:
# Preview the first 20 passages as the cell's rich output
dataset.head(n=20)

Unnamed: 0,id,text
0,0,The presence of communication amid scientific ...
1,1,The Manhattan Project and its atomic bomb help...
2,2,Essay on The Manhattan Project - The Manhattan...
3,3,The Manhattan Project was the name for a proje...
4,4,versions of each volume as well as complementa...
5,5,The Manhattan Project. This once classified ph...
6,6,Nor will it attempt to substitute for the extr...
7,7,Manhattan Project. The Manhattan Project was a...
8,8,"In June 1942, the United States Army Corps of ..."
9,9,One of the main reasons Hanford was selected a...


In [13]:
# Read the training and test query files into DataFrames
train_querys, test_querys = (
    pd.read_csv(csv_path)
    for csv_path in ('data/queries_train.csv', 'data/queries_test.csv')
)

In [14]:
# Peek at the first five training queries
train_querys.head(n=5)

Unnamed: 0,qid,query,topic_number,turn_number
0,4_1,What was the neolithic revolution?,4,1
1,4_2,When did it start and end?,4,2
2,4_3,Why did it start?,4,3
3,4_4,What did the neolithic invent?,4,4
4,4_5,What tools were used?,4,5


In [15]:
# Peek at the first five test queries
test_querys.head(n=5)

Unnamed: 0,qid,query,topic_number,turn_number
0,1_1,What is a physician's assistant?,1,1
1,1_2,What are the educational requirements required...,1,2
2,1_3,What does it cost?,1,3
3,1_4,What's the average starting salary in the UK?,1,4
4,1_5,What about in the US?,1,5


In [18]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing

# Make sure the NLTK resources used in this cell are installed. Only missing
# resources are downloaded, so the cell works on a fresh environment and does
# not touch the network once the data is present.
# NOTE(review): recent NLTK releases may look up the tokenizer data under
# "punkt_tab" instead of "punkt" - add it here if word_tokenize raises LookupError.
for resource_path, package_name in (
    ("corpora/stopwords", "stopwords"),
    ("tokenizers/punkt", "punkt"),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name, quiet=True)

# English stopwords, stored as a set for fast membership tests in preprocess_text.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Tokenize, lowercase, and remove stopwords from the text.

    Args:
        text: Raw text to normalise. Non-string values (e.g. ``None`` or the
            ``NaN`` that pandas uses for missing cells) are treated as empty
            text.

    Returns:
        A single space-separated string of the lowercased tokens that are
        purely alphabetic and not in ``stop_words``. The result is ``""`` when
        no token survives the filter or when the input is not a string.
    """
    # Missing cells arrive as None / float NaN; calling .lower() on them would
    # raise AttributeError and abort the whole preprocessing run.
    if not isinstance(text, str):
        return ""
    tokens = word_tokenize(text.lower())
    # Keep alphabetic tokens only (drops punctuation and numbers), then drop stopwords.
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return " ".join(filtered_tokens)

from joblib import Parallel, delayed

def multiprocess_preprocess_joblib(data, column_name):
    """Run ``preprocess_text`` over one column of ``data`` in parallel.

    Args:
        data: Object indexable by ``column_name`` whose selected column can be
            iterated value by value (the notebook passes pandas DataFrames).
        column_name: Name of the column holding the raw text.

    Returns:
        The result of the joblib ``Parallel`` call: one processed string per
        value of the column.
    """
    # One worker per CPU reported by the operating system
    worker_count = multiprocessing.cpu_count()

    # Lazily describe one preprocessing task per value; joblib consumes the generator
    tasks = (delayed(preprocess_text)(entry) for entry in data[column_name])

    return Parallel(n_jobs=worker_count)(tasks)

# Add a preprocessed-text column to the passage collection and to both query
# sets. Each frame is updated in place with the parallel preprocessor's output.
for frame, source_column, target_column in (
    (dataset, 'text', 'processed_text'),
    (train_querys, 'query', 'processed_query'),
    (test_querys, 'query', 'processed_query'),
):
    frame[target_column] = multiprocess_preprocess_joblib(frame, source_column)

# Show the first rows of all three frames as the cell output
tuple(frame.head() for frame in (dataset, train_querys, test_querys))

(   id                                               text  \
 0   0  The presence of communication amid scientific ...   
 1   1  The Manhattan Project and its atomic bomb help...   
 2   2  Essay on The Manhattan Project - The Manhattan...   
 3   3  The Manhattan Project was the name for a proje...   
 4   4  versions of each volume as well as complementa...   
 
                                       processed_text  
 0  presence communication amid scientific minds e...  
 1  manhattan project atomic bomb helped bring end...  
 2  essay manhattan project manhattan project manh...  
 3  manhattan project name project conducted world...  
 4  versions volume well complementary websites fi...  ,
    qid                               query  topic_number  turn_number  \
 0  4_1  What was the neolithic revolution?             4            1   
 1  4_2          When did it start and end?             4            2   
 2  4_3                   Why did it start?             4            3   

In [20]:
import sys
!{sys.executable} -m pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [21]:
from rank_bm25 import BM25Okapi

# Split every preprocessed passage on whitespace to get the token lists BM25 indexes
tokenized_passages = [passage.split() for passage in dataset['processed_text']]

# Index the tokenized collection
bm25 = BM25Okapi(tokenized_passages)

# Smoke-test the index: fetch the 5 top-ranked raw passages for the first test query
sample_query = test_querys['processed_query'].iloc[0]
query_tokens = sample_query.split()
passage_texts = dataset['text'].tolist()
sample_results = bm25.get_top_n(query_tokens, passage_texts, n=5)

sample_results