In [5]:
import json
import os
import csv
import pickle
import time
from tqdm import tqdm
from rank_bm25 import BM25Okapi
import numpy as np

import nltk
from nltk.tokenize import word_tokenize

# Necessary Resources
# Download the NLTK 'punkt' and 'punkt_tab' data packages. They are presumably
# what word_tokenize (imported above, used by tokenize() below) needs at
# runtime -- confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/itewari1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/itewari1/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
# Execution parameters
# file_path: JSONL table dump; generate_chunks() passes it to process_jsonl().
file_path = "tables.jsonl"
# dev_file_path: not read by any active code in the cells reviewed here.
dev_file_path = "test.jsonl"
# output_dir: not read by any active code in the cells reviewed here.
output_dir = "results"

In [7]:
################################ Chunking #################################

# Row chunking and its metadata
def chunk_row(row, row_id, table_name, columns):
    """Build the retrieval chunk for a single table row.

    Args:
        row: Mapping with a 'cells' list; each cell is a mapping with a 'text' entry.
        row_id: Position of the row inside its table.
        table_name: Identifier of the table the row belongs to.
        columns: List of column mappings, each with a 'text' entry (the header).

    Returns:
        A dict with the row rendered as "header: value | header: value" under
        'text', and a 'metadata' dict describing where the row came from.
    """
    column_names = [col["text"] for col in columns]
    chunk_id = f"{table_name}_row_{row_id}"

    # Pair cells with headers positionally. zip() stops at the shorter side, so
    # a row with more cells than headers no longer raises IndexError; the extra
    # cells are skipped just like cells under an empty-named column.
    row_text = ' | '.join(
        f"{name}: {cell['text']}"
        for name, cell in zip(column_names, row['cells'])
        if name
    )

    return {
        "text": row_text,
        "metadata": {
            "table_name": table_name,
            "row_id": row_id,
            "chunk_id": chunk_id,
            "chunk_type": "row",
            # Full header list, including empty names, to keep positions aligned.
            "columns": column_names,
            # The flattened metadata string only lists the non-empty headers.
            "metadata_text": f"table: {table_name}, row: {row_id}, chunk_id: {chunk_id}, chunk_type: row, columns: {', '.join([name for name in column_names if name])}"
        }
    }

# Column chunk and its metadata
def chunk_column(rows, col_id, col_name, table_name):
    """Build the retrieval chunk for a single table column.

    Args:
        rows: List of row mappings; each has a 'cells' list of mappings with 'text'.
        col_id: Zero-based position of the column inside each row's 'cells'.
        col_name: Header text of the column (a falsy value is rendered as '').
        table_name: Identifier of the table the column belongs to.

    Returns:
        A dict whose 'text' is "<header>: v1 | v2 | ..." (empty cells omitted)
        and whose 'metadata' records the column's origin.
    """
    # Collect the non-empty values found at this column position, in row order.
    values = []
    for record in rows:
        cell_text = record['cells'][col_id]['text']
        if cell_text:
            values.append(cell_text)

    label = col_name or ''
    joined_values = ' | '.join(values)

    metadata = {
        "table_name": table_name,
        "col_id": col_id,
        "chunk_id": f"{table_name}_column_{col_id}",
        "chunk_type": "column",
        "metadata_text": f"table: {table_name}, col: {label}, chunk_id: {table_name}_column_{col_id}, chunk_type: column",
    }
    return {"text": f"{label}: {joined_values}", "metadata": metadata}

# Table chunking with its metadata
def chunk_table(rows, table_id, columns):
    """Build the retrieval chunk for a whole table.

    Args:
        rows: List of row mappings; each has a 'cells' list of mappings with 'text'.
        table_id: Identifier of the table.
        columns: List of column mappings, each with a 'text' entry (the header).

    Returns:
        A dict whose 'text' is the header line followed by one " | "-joined line
        per row, and whose 'metadata' describes the table.
    """
    headers = [column['text'] for column in columns]

    # First line is the header; every following line is one row of cell texts.
    lines = [" | ".join(headers)]
    for record in rows:
        lines.append(' | '.join(cell['text'] for cell in record['cells']))

    metadata = {
        "table_name": table_id,
        "chunk_id": f"{table_id}_table",
        "chunk_type": "table",
        "columns": headers,
        "metadata_text": f"table_name: {table_id}, chunk_id: {table_id}_table, chunk_type: table, columns: {', '.join(headers)}",
    }
    return {"text": '\n'.join(lines), "metadata": metadata}

In [8]:
######################## Processing ##################################

# Process jsonl file: chunking
def process_jsonl(file_path):
    """Read a JSONL file of tables and chunk every table by row, column and table.

    Each non-blank line must be a JSON object with 'tableId', 'rows' and
    'columns' entries.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A tuple (metadata_list, chunks, table_chunks):
            metadata_list: metadata dicts of the row and column chunks, in order.
            chunks: every chunk produced (rows, named columns, then the table,
                repeated per input line).
            table_chunks: only the whole-table chunks.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a table object lacks one of the expected entries.
    """
    metadata_list = []
    chunks = []
    table_chunks = []

    # Read as UTF-8 explicitly instead of relying on the platform's locale default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            line = line.strip()
            # Tolerate blank lines (e.g. a trailing newline at the end of the
            # file); json.loads('') would otherwise abort the whole run.
            if not line:
                continue

            data = json.loads(line)
            table_id = data['tableId']
            rows = data['rows']
            columns = data['columns']

            # Chunking row
            for row_id, row in enumerate(rows):
                row_chunk = chunk_row(row, row_id, table_id, columns)
                chunks.append(row_chunk)
                metadata_list.append(row_chunk["metadata"])

            # Chunking Column (columns without a header text are skipped)
            for col_id, col in enumerate(columns):
                if col["text"]:
                    col_chunk = chunk_column(rows, col_id, col["text"], table_id)
                    chunks.append(col_chunk)
                    metadata_list.append(col_chunk["metadata"])

            # Chunking table
            # NOTE(review): the table chunk's metadata is not added to
            # metadata_list, so metadata_list is shorter than chunks -- confirm
            # whether that is intended before relying on the two being parallel.
            table_chunk = chunk_table(rows, table_id, columns)
            chunks.append(table_chunk)
            table_chunks.append(table_chunk)

    return metadata_list, chunks, table_chunks

In [9]:
#Generate Chunks
def generate_chunks():
    """Chunk the tables found at the module-level `file_path`.

    Returns:
        The whole-table chunks produced by process_jsonl, ordered by their
        metadata 'table_name'. Row and column chunks are discarded.
    """
    def by_table_name(chunk):
        return chunk["metadata"]["table_name"]

    # Only the third element (table-level chunks) is used downstream.
    _, _, whole_tables = process_jsonl(file_path)
    return sorted(whole_tables, key=by_table_name)

In [10]:
def tokenize(text):
    """Lower-case `text` and split it into tokens with NLTK's word_tokenize.

    Args:
        text: The string to tokenize.

    Returns:
        The token list returned by word_tokenize for the lower-cased text.
    """
    lowered = text.lower()
    return word_tokenize(lowered)

In [11]:
#Generate Tokens
def generate_token(table_chunks, output_path='tokenized_table_corpus.pkl'):
    """Tokenize every table chunk and pickle the resulting corpus.

    Args:
        table_chunks: Iterable of chunk dicts with 'text' and 'metadata'
            entries; 'metadata' must contain 'table_name'.
        output_path: Where to write the pickled corpus. Defaults to the file
            name this notebook loads with load_token().

    Returns:
        None. The corpus (a list of {'table_id', 'tokenized_text'} dicts) is
        written to `output_path`.
    """
    tokenized_chunks = []

    for chunk in tqdm(table_chunks, desc="Tokenizing Chunks", unit="chunk"):
        tokenized_chunks.append({
            "table_id": chunk['metadata']['table_name'],
            # The metadata dict is appended via str(), i.e. its Python repr, so
            # its keys and punctuation end up in the token stream as well.
            "tokenized_text": tokenize(chunk['text'] + str(chunk['metadata'])),
        })

    # Save the tokenized corpus to a pickle file
    with open(output_path, 'wb') as f:
        pickle.dump(tokenized_chunks, f)

    print(f"Tokenized corpus saved to '{output_path}'")

In [12]:
#Load the table tokens
def load_token(filename):
    """Load a pickled tokenized corpus from disk.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        Whatever object was pickled into `filename`.
    """
    # SECURITY: unpickling can execute arbitrary code, so only load files that
    # come from a trusted source.
    with open(filename, 'rb') as handle:
        corpus = pickle.load(handle)

    print("Tokenized corpus loaded.")
    return corpus

In [13]:
# Rank Chunks
def rank_chunks_with_bm25(tokenized_chunks, query, top_n, bm25=None):
    """Rank tokenized chunks against a tokenized query with BM25.

    Args:
        tokenized_chunks: Sequence of dicts, each with a 'tokenized_text' list.
        query: The query as a list of tokens.
        top_n: Maximum number of results to return.
        bm25: Optional prebuilt index exposing get_scores(query) for exactly
            `tokenized_chunks` (same order). Pass one when ranking many queries
            against the same corpus so the index is not rebuilt on every call.
            When omitted, a BM25Okapi index is built here.

    Returns:
        A list of at most `top_n` (score, chunk) tuples, highest score first.
        Chunks with equal scores keep their original relative order. An empty
        corpus yields an empty list.
    """
    # Nothing to rank. This also avoids asking BM25Okapi to index an empty
    # corpus, which presumably fails on its average-document-length division
    # (TODO confirm against the installed rank_bm25 version).
    if len(tokenized_chunks) == 0:
        return []

    if bm25 is None:
        bm25 = BM25Okapi([chunk['tokenized_text'] for chunk in tokenized_chunks])
    scores = bm25.get_scores(query)

    # Sort by score only (never compare the chunk dicts); sorted() is stable,
    # so ties stay in corpus order.
    ranked_chunks = sorted(zip(scores, tokenized_chunks), key=lambda pair: pair[0], reverse=True)

    return ranked_chunks[:top_n]

# Save the top N chunks to a file
def save_top_chunks(top_chunks, output_dir, output_filename):
    """Write the ranked chunks to `output_dir/output_filename` as indented JSON.

    Args:
        top_chunks: JSON-serializable ranking result to store.
        output_dir: Directory to write into; created if it does not exist.
        output_filename: Name of the JSON file inside `output_dir`.

    Returns:
        None.
    """
    os.makedirs(output_dir, exist_ok=True)
    destination = os.path.join(output_dir, output_filename)

    with open(destination, 'w') as sink:
        json.dump(top_chunks, sink, indent=2)

    print(f"Saved top chunks to {destination}")

In [14]:
#Extract the Table IDs for the top 5000 tables 
def getTableID(ranked_chunks):
    """Collect the 'table_id' of every ranked (score, chunk) entry.

    Args:
        ranked_chunks: Iterable of entries whose element at index 1 is a dict
            with a 'table_id' key.

    Returns:
        The table ids in ranking order. Entries that cannot be indexed that way
        are reported on stdout and skipped.
    """
    collected = []
    for entry in ranked_chunks:
        try:
            found_id = entry[1]['table_id']
        except (IndexError, KeyError, TypeError) as err:
            print(f"Skipping malformed chunk: {err}")
            continue
        collected.append(found_id)
    return collected

In [20]:
# Main script
def main(tokenized_chunks_from_file, query_details, max_tables=None, out_dir='pruningTestData'):
    """Rank the corpus for every query and write one CSV of top tables per query.

    For the i-th query (1-based) a file `query<i>_TopTables.csv` is written to
    `out_dir` with the columns: query, top tables, target table, target answer.

    Args:
        tokenized_chunks_from_file: Tokenized corpus; handed to
            rank_chunks_with_bm25 unchanged.
        query_details: Iterable of dicts with the keys 'Queries',
            'Target TableID' and 'Target Answers'.
        max_tables: Number of top-ranked tables to keep per query. When None
            (the default) the module-level `top_n` is used, as before.
        out_dir: Directory that receives the CSV files; created if missing.

    Returns:
        None.

    Raises:
        KeyError: if a query dict lacks one of the expected keys.
        NameError: if `max_tables` is None and no module-level `top_n` exists.
    """
    # open(..., 'w') does not create missing parent directories, so make sure
    # the output directory exists before the first file is written.
    os.makedirs(out_dir, exist_ok=True)

    for i, q_dict in enumerate(query_details, start=1):
        query = q_dict['Queries']
        target_table = q_dict['Target TableID']
        target_answer = q_dict['Target Answers']

        # An explicit cutoff wins; otherwise fall back to the notebook-level
        # `top_n` (looked up per query, as the original code did).
        limit = top_n if max_tables is None else max_tables

        # NOTE(review): ranking is invoked once per query; if
        # rank_chunks_with_bm25 builds its index on each call, this loop pays
        # that cost for every query -- worth confirming for large corpora.
        tokenized_query = tokenize(query)
        ranked_chunks = rank_chunks_with_bm25(tokenized_chunks_from_file, tokenized_query, limit)

        # Put these TableIDs in a csv
        output_file = os.path.join(out_dir, f"query{i}_TopTables.csv")
        table_ids = getTableID(ranked_chunks)

        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['query', 'top tables', 'target table', 'target answer'])  # Write header
            writer.writerows([query, tid, target_table, target_answer] for tid in table_ids)

        print(output_file, " is ready")

In [16]:
# One-time corpus build: the two commented lines below regenerate the pickle
# from the JSONL tables; enable them only when the pickle does not exist yet.
# table_chunks = generate_chunks() #Uncomment and run this if you don't have pickle file of tokens
# generate_token(table_chunks)     #Uncomment and run this if you don't have pickle file of tokens
# Load the previously saved corpus. load_token() unpickles the file, so only
# point it at a pickle you produced yourself.
tokenized_chunks_from_file = load_token('tokenized_table_corpus.pkl')

Tokenized corpus loaded.


In [21]:
#Load the query list
def load_all_rows_from_csv(csv_file_path):
    """Read every data row of a CSV file into a list of dicts.

    Args:
        csv_file_path: Path of a UTF-8 CSV file whose first line is the header.

    Returns:
        One dict per data row, keyed by the header names. main() in this
        notebook reads the keys 'Queries', 'Target TableID' and
        'Target Answers', so the query file is expected to provide them.
    """
    # newline='' lets the csv module do its own line handling, which is what
    # keeps newlines embedded in quoted fields intact (per the csv docs).
    with open(csv_file_path, 'r', newline='', encoding='utf-8') as f:
        return list(csv.DictReader(f))


In [22]:
# Number of top-ranked tables kept per query. main() reads this module-level
# name when it ranks, so it must be defined before main() is called.
top_n = 5000
# queries_count = 1
# Each CSV row becomes one query dict; main() reads its 'Queries',
# 'Target TableID' and 'Target Answers' keys.
query_details = load_all_rows_from_csv('random_queries.csv')
# print(query_details[:2])
# Writes one query<i>_TopTables.csv per query and prints its path when done.
main(tokenized_chunks_from_file, query_details)

pruningTestData/query1_TopTables.csv  is ready
pruningTestData/query2_TopTables.csv  is ready
pruningTestData/query3_TopTables.csv  is ready
pruningTestData/query4_TopTables.csv  is ready
pruningTestData/query5_TopTables.csv  is ready
pruningTestData/query6_TopTables.csv  is ready
pruningTestData/query7_TopTables.csv  is ready
pruningTestData/query8_TopTables.csv  is ready
pruningTestData/query9_TopTables.csv  is ready
pruningTestData/query10_TopTables.csv  is ready
pruningTestData/query11_TopTables.csv  is ready
pruningTestData/query12_TopTables.csv  is ready
pruningTestData/query13_TopTables.csv  is ready
pruningTestData/query14_TopTables.csv  is ready
pruningTestData/query15_TopTables.csv  is ready
pruningTestData/query16_TopTables.csv  is ready
pruningTestData/query17_TopTables.csv  is ready
pruningTestData/query18_TopTables.csv  is ready
pruningTestData/query19_TopTables.csv  is ready
pruningTestData/query20_TopTables.csv  is ready
pruningTestData/query21_TopTables.csv  is ready
p