In [39]:
import json
import os
import csv
import pickle
import time
from tqdm import tqdm
from rank_bm25 import BM25Okapi
import numpy as np

import nltk
from nltk.tokenize import word_tokenize

# Necessary Resources
# Fetch the NLTK 'punkt' and 'punkt_tab' packages so that word_tokenize (used
# by tokenize() in this notebook) has its tokenizer data available locally.
nltk.download('punkt')
nltk.download('punkt_tab')

def tokenize(text):
    """Lower-case ``text`` and split it into tokens with NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``word_tokenize`` returns for the lower-cased text.
    """
    lowered = text.lower()
    return word_tokenize(lowered)

[nltk_data] Downloading package punkt to /home/itewari1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/itewari1/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [13]:
# Execution parameters
# Table corpus in JSONL form; this is the path handed to process_jsonl() in
# the (currently commented-out) chunk-generation cell.
file_path = "tables.jsonl"
# JSONL file of queries that main() iterates over.
dev_file_path = "test.jsonl"
# Directory main() passes to save_top_chunks() for the ranked-chunk JSON dump.
output_dir = "results"

In [51]:
#Load the table tokens
# Restores the pre-tokenized table corpus written by the (commented-out)
# "Save the tokenized corpus" cell. main() passes this object to
# rank_chunks_with_bm25(), which reads each entry's 'tokenized_text';
# getTableID() reads each entry's 'table_id'.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a pickle that was produced by this notebook or a trusted source.
with open('tokenized_table_corpus.pkl', 'rb') as f:
    tokenized_chunks_from_file = pickle.load(f)

print("Tokenized corpus loaded.")

Tokenized corpus loaded.


In [4]:
################################ Chunking #################################

# Row chunking and its metadata
def chunk_row(row, row_id, table_name, columns):
    """Build the chunk dictionary for a single table row.

    Each cell is rendered as ``"<column header>: <cell text>"`` and the
    rendered cells are joined with ``" | "``. Cells whose column header is
    empty are left out of the text.

    Args:
        row: Mapping with a ``'cells'`` sequence; every cell has a ``'text'``.
        row_id: Index of the row within its table.
        table_name: Identifier of the table the row belongs to.
        columns: Sequence of column mappings, each with a ``'text'`` header,
            indexed in parallel with ``row['cells']``.

    Returns:
        dict: ``{"text": ..., "metadata": {...}}`` where the metadata carries
        the table name, row id, chunk id, chunk type, every column header
        (including empty ones) and a flattened ``metadata_text`` string that
        lists only the non-empty headers.
    """
    labelled_cells = []
    for position, cell in enumerate(row['cells']):
        header = columns[position]['text']
        # Unnamed columns contribute nothing to the row text.
        if header:
            labelled_cells.append(f"{header}: {cell['text']}")

    all_headers = [column["text"] for column in columns]
    named_headers = ', '.join([header for header in all_headers if header])

    metadata = {
        "table_name": table_name,
        "row_id": row_id,
        "chunk_id": f"{table_name}_row_{row_id}",
        "chunk_type": "row",
        "columns": all_headers,
        "metadata_text": f"table: {table_name}, row: {row_id}, chunk_id: {table_name}_row_{row_id}, chunk_type: row, columns: {named_headers}",
    }
    return {"text": ' | '.join(labelled_cells), "metadata": metadata}

# Column chunk and its metadata
def chunk_column(rows, col_id, col_name, table_name):
    """Build the chunk dictionary for a single table column.

    The chunk text is ``"<column name>: <v1> | <v2> | ..."`` using the text
    of the cell at position ``col_id`` in every row; empty cell texts are
    skipped.

    Args:
        rows: Iterable of row mappings, each with a ``'cells'`` sequence.
        col_id: Position of the column inside each row's ``'cells'``.
        col_name: Column header; a falsy value is rendered as an empty string.
        table_name: Identifier of the table the column belongs to.

    Returns:
        dict: ``{"text": ..., "metadata": {...}}`` with the table name,
        column id, chunk id, chunk type and a flattened ``metadata_text``.
    """
    label = col_name if col_name else ''

    non_empty_values = []
    for row in rows:
        value = row['cells'][col_id]['text']
        if value:
            non_empty_values.append(value)
    column_text = ' | '.join(non_empty_values)

    metadata = {
        "table_name": table_name,
        "col_id": col_id,
        "chunk_id": f"{table_name}_column_{col_id}",
        "chunk_type": "column",
        "metadata_text": f"table: {table_name}, col: {label}, chunk_id: {table_name}_column_{col_id}, chunk_type: column",
    }
    return {"text": f"{label}: {column_text}", "metadata": metadata}

# Table chunking with its metadata
def chunk_table(rows, table_id, columns):
    """Build the chunk dictionary that represents a whole table.

    The text is one line per table line: first the column headers joined
    with ``" | "``, then every row's cell texts joined the same way. Lines
    are separated by newlines.

    Args:
        rows: Iterable of row mappings, each with a ``'cells'`` sequence
            whose items have a ``'text'``.
        table_id: Identifier of the table.
        columns: Sequence of column mappings, each with a ``'text'`` header.

    Returns:
        dict: ``{"text": ..., "metadata": {...}}`` with the table name,
        chunk id, chunk type, the list of column headers and a flattened
        ``metadata_text`` string.
    """
    headers = [column['text'] for column in columns]

    # Header line first, then one rendered line per row.
    lines = [" | ".join(headers)]
    for row in rows:
        cell_texts = [cell['text'] for cell in row['cells']]
        lines.append(' | '.join(cell_texts))

    metadata = {
        "table_name": table_id,
        "chunk_id": f"{table_id}_table",
        "chunk_type": "table",
        "columns": list(headers),
        "metadata_text": f"table_name: {table_id}, chunk_id: {table_id}_table, chunk_type: table, columns: {', '.join(headers)}",
    }
    return {"text": '\n'.join(lines), "metadata": metadata}

In [5]:
######################## Processing ##################################

# Process jsonl file: chunking
def process_jsonl(file_path):
    """Read a JSONL table corpus and chunk every table by row, column and table.

    Each non-blank line must be a JSON object with ``'tableId'``, ``'rows'``
    and ``'columns'``. For every table this produces one chunk per row, one
    chunk per column that has a non-empty header, and one chunk for the
    whole table.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        tuple: ``(metadata_list, chunks, table_chunks)`` where

        * ``metadata_list`` holds the ``"metadata"`` dict of every row and
          column chunk, in creation order,
        * ``chunks`` holds every row, column and table chunk, in creation
          order,
        * ``table_chunks`` holds only the whole-table chunks.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the expected keys.
    """
    metadata_list = []
    chunks = []
    table_chunks = []

    # Read as UTF-8 explicitly so the result does not depend on the locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            line = line.strip()
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            if not line:
                continue

            data = json.loads(line)
            table_id = data['tableId']
            rows = data['rows']
            columns = data['columns']

            # Chunking row
            for row_id, row in enumerate(rows):
                row_chunk = chunk_row(row, row_id, table_id, columns)
                chunks.append(row_chunk)
                metadata_list.append(row_chunk["metadata"])

            # Chunking Column (columns without a header are skipped)
            for col_id, col in enumerate(columns):
                if col["text"]:
                    col_chunk = chunk_column(rows, col_id, col["text"], table_id)
                    chunks.append(col_chunk)
                    metadata_list.append(col_chunk["metadata"])

            # Chunking table
            # NOTE(review): the table chunk's metadata is not added to
            # metadata_list, unlike row/column chunks; kept as-is so the
            # returned lists are unchanged -- confirm whether intentional.
            table_chunk = chunk_table(rows, table_id, columns)
            chunks.append(table_chunk)
            table_chunks.append(table_chunk)

    return metadata_list, chunks, table_chunks

In [6]:
# Rank Chunks
# Single-entry memo of the most recently built BM25 index. Building the index
# is the expensive part of ranking, and the same corpus object is reused for
# every query, so the index is rebuilt only when a different corpus object
# (or the same list with a different length) is passed in.
# The memo keeps a reference to that corpus until another one replaces it.
_BM25_INDEX_CACHE = {"corpus": None, "size": -1, "index": None}


def rank_chunks_with_bm25(tokenized_chunks, query, top_n):
    """Score every chunk against ``query`` with BM25 and return the best ones.

    Args:
        tokenized_chunks: Sequence of chunk dicts, each with a
            ``'tokenized_text'`` list of tokens. The BM25 index built from it
            is memoized per corpus object; if the chunks are edited in place
            without changing the sequence length, pass a new list to force a
            rebuild.
        query: The tokenized query (list of tokens).
        top_n: Maximum number of results to return.

    Returns:
        list: Up to ``top_n`` ``(score, chunk)`` tuples sorted by descending
        score (ties keep corpus order). Empty when the corpus is empty.
    """
    # BM25Okapi cannot be built from an empty corpus (its average document
    # length divides by the corpus size), so there is nothing to rank.
    if len(tokenized_chunks) == 0:
        return []

    cache = _BM25_INDEX_CACHE
    corpus_size = len(tokenized_chunks)
    if cache["corpus"] is not tokenized_chunks or cache["size"] != corpus_size:
        # Build first, then publish, so a failed build leaves the memo intact.
        index = BM25Okapi([chunk['tokenized_text'] for chunk in tokenized_chunks])
        cache["corpus"] = tokenized_chunks
        cache["size"] = corpus_size
        cache["index"] = index

    scores = cache["index"].get_scores(query)

    # Sort chunks by BM25 score in descending order
    ranked_chunks = sorted(zip(scores, tokenized_chunks), reverse=True, key=lambda x: x[0])

    # Get top N chunks
    return ranked_chunks[:top_n]

# Save the top N chunks to a file
def save_top_chunks(top_chunks, output_dir, output_filename):
    """Dump ``top_chunks`` as indented JSON to ``output_dir/output_filename``.

    The output directory is created when it does not exist yet, an existing
    file of the same name is overwritten, and a confirmation line with the
    destination path is printed.

    Args:
        top_chunks: JSON-serializable object to store.
        output_dir: Directory that receives the file.
        output_filename: Name of the file inside ``output_dir``.
    """
    os.makedirs(output_dir, exist_ok=True)
    destination = os.path.join(output_dir, output_filename)

    with open(destination, 'w') as handle:
        json.dump(top_chunks, handle, indent=2)

    print(f"Saved top chunks to {destination}")

# Calculate recall, rank, and 10%/20% check
def calculate_recall(ranked_chunks, correct_table_id, top_n):
    """Locate the correct table in a ranking and report where it landed.

    Args:
        ranked_chunks: Iterable of ``(score, chunk)`` pairs in ranked order;
            each chunk has a ``'table_id'``.
        correct_table_id: Table id counted as the relevant result.
        top_n: Size of the ranking window; the 10% / 20% cut-offs are
            ``top_n * 0.1`` and ``top_n * 0.2``.

    Returns:
        tuple: ``(found, rank, in_top_10, in_top_20)``. ``found`` is 1 when
        the table occurs in the ranking and 0 otherwise, ``rank`` is the
        1-based position of its first occurrence (``None`` when absent), and
        the last two items are 1/0 flags for the rank being within the 10%
        and 20% cut-offs.
    """
    first_hit = next(
        (
            position
            for position, (_, chunk) in enumerate(ranked_chunks, start=1)
            if chunk['table_id'] == correct_table_id
        ),
        None,
    )

    # Relevant item not found
    if first_hit is None:
        return 0, None, 0, 0

    within_10_percent = 1 if first_hit <= top_n * 0.1 else 0
    within_20_percent = 1 if first_hit <= top_n * 0.2 else 0
    return 1, first_hit, within_10_percent, within_20_percent

In [44]:
# def getTableID():
#     with open('results/top_chunks.json', 'r') as top_chunks_file:
#         for i, line in enumerate(tqdm(top_chunks_file)):
#             data = json.loads(line.strip())
#             print(data[1]['table_id'])

# getTableID()

 

# def getTableID(ranked_chunks):
#     table_id = []
    # with open('results/top_chunks.json', 'r') as f:
        #     all_data = json.load(f)  # load entire JSON array
    
        # for i, item in enumerate(tqdm(all_data)):
        #     try:
        #         table_id.append(item[1]['table_id'])
        #         # print(table_id)
        #     except (IndexError, KeyError, TypeError) as e:
        #         print(f"Skipping item {i}: {e}")
    
        # # print(table_id)

#     for ranked_chunk in ranked_chunks:
#         table_id.append(ranked_chunk[1]['table_id'])
        
#     return table_id


def getTableID(ranked_chunks):
    """Collect the ``'table_id'`` of every entry in a ranking.

    Args:
        ranked_chunks: Iterable of ``(score, chunk)`` pairs; the chunk at
            index 1 is expected to have a ``'table_id'``.

    Returns:
        list: Table ids in ranking order. Entries that cannot be read
        (``IndexError``, ``KeyError`` or ``TypeError``) are reported with a
        printed message and left out.
    """
    collected = []
    for entry in ranked_chunks:
        try:
            table_id = entry[1]['table_id']
        except (IndexError, KeyError, TypeError) as e:
            print(f"Skipping malformed chunk: {e}")
        else:
            collected.append(table_id)
    return collected

# getTableID()

In [52]:
# Main script
def main(top_n=None, queries_count=None):
    """Rank the table corpus against the first queries of the dev file.

    For each processed record of ``dev_file_path`` this takes the first
    question's ``'originalText'`` as the query, tokenizes it, ranks
    ``tokenized_chunks_from_file`` with BM25, stores the ranked chunks via
    ``save_top_chunks`` and writes the ranked table ids to
    ``query<k>_TopTables.csv`` in the current working directory, where ``k``
    is the 0-based index of the processed query.

    Args:
        top_n: Number of top-ranked chunks to keep per query. Defaults to the
            notebook-level ``top_n`` variable.
        queries_count: Number of queries to process. Defaults to the
            notebook-level ``queries_count`` variable.

    Raises:
        KeyError: If a parameter is omitted and the matching notebook-level
            variable has not been defined, or if a record lacks an expected
            key.
    """
    # Fall back to the notebook-level settings so a bare `main()` call keeps
    # working, while still allowing explicit arguments.
    notebook_globals = globals()
    if top_n is None:
        top_n = notebook_globals['top_n']
    if queries_count is None:
        queries_count = notebook_globals['queries_count']

    processed = 0
    # Read as UTF-8 explicitly so the result does not depend on the locale.
    with open(dev_file_path, 'r', encoding='utf-8') as dev_file:
        for line in tqdm(dev_file):
            if processed >= queries_count:
                break

            line = line.strip()
            # Blank lines carry no query; skip them instead of failing in
            # json.loads. They do not count towards queries_count.
            if not line:
                continue

            data = json.loads(line)
            query = data['questions'][0]['originalText']

            tokenized_query = tokenize(query)
            ranked_chunks = rank_chunks_with_bm25(tokenized_chunks_from_file, tokenized_query, top_n)

            # NOTE(review): every query writes the same "top_chunks.json", so
            # only the last processed query's ranking remains on disk.
            save_top_chunks(ranked_chunks, output_dir, "top_chunks.json")

            # Put these TableIDs in a csv
            output_file = f"query{processed}_TopTables.csv"
            table_ids = getTableID(ranked_chunks)

            # newline='' is what the csv module expects for files it writes.
            with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)

                # Write header
                writer.writerow(['query', 'table_id'])

                # Write each (query, table_id) pair
                writer.writerows([query, tid] for tid in table_ids)

            processed += 1


In [46]:
# #Generate Chunks
# metadata, chunks, table_chunks = process_jsonl(file_path)
# table_chunks = sorted(table_chunks, key=lambda x: x["metadata"]["table_name"])

In [47]:
# #Generate Tokens
# tokenized_chunks = []

# for i, chunk in enumerate(tqdm(table_chunks, desc="Tokenizing Chunks", unit="chunk")):
#     table_id = chunk['metadata']['table_name']
#     tokenized_text = tokenize(chunk['text'] + str(chunk['metadata']))

#     tokenized_chunks.append({
#         "table_id": table_id,
#         "tokenized_text": tokenized_text,
#     })

In [48]:
# # Save the tokenized corpus to a pickle file
# with open('tokenized_table_corpus.pkl', 'wb') as f:
#     pickle.dump(tokenized_chunks, f)

# print("Tokenized corpus saved to 'tokenized_table_corpus.pkl'")

In [53]:
# Run settings that main() reads as notebook-level variables; both must be
# assigned before main() is called.
top_n = 5000  # number of highest-scoring chunks kept per query
queries_count = 1  # number of queries from the dev file to process
main()

1it [00:17, 17.13s/it]

Saved top chunks to results/top_chunks.json



