In [17]:
import pandas as pd
from nltk.tokenize import word_tokenize
import time
import os
def load_data(file_path):
    """Load a tabular file into a pandas DataFrame.

    The reader is selected from the file extension. The extension is
    compared case-insensitively so that names such as ``TABLE.CSV`` or
    ``book.XLSX`` are accepted as well.

    Args:
        file_path: Path to a ``.csv``, ``.xls`` or ``.xlsx`` file.

    Returns:
        The DataFrame returned by ``pd.read_csv`` or ``pd.read_excel``.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    # Normalise the case once so the comparisons below stay simple.
    file_extension = os.path.splitext(file_path)[1].lower()
    if file_extension == '.csv':
        return pd.read_csv(file_path)
    if file_extension in ('.xls', '.xlsx'):
        return pd.read_excel(file_path)
    raise ValueError("Unsupported file format.")
def lowercase_all(df):
    """Return a new DataFrame with every string cell lower-cased.

    Cells that are not ``str`` instances (numbers, NaN, ...) are passed
    through unchanged. The input frame is not modified.

    Args:
        df: The DataFrame to normalise.

    Returns:
        A DataFrame of the same shape with lower-cased string cells.
    """
    def _lower(value):
        return value.lower() if isinstance(value, str) else value

    # ``DataFrame.map`` was added in pandas 2.1; older releases expose the
    # same element-wise operation as ``applymap``.
    elementwise = getattr(df, "map", None)
    if elementwise is None:
        elementwise = df.applymap
    return elementwise(_lower)
def tokenize_text(text):
    """Return the set of distinct tokens in *text*.

    Strings are split with ``word_tokenize``; any non-string value yields an
    empty set.
    """
    if not isinstance(text, str):
        return set()
    return set(word_tokenize(text))
def compute_jaccard_similarity(query_tokens, entry_tokens):
    """Return the Jaccard similarity of two token sets.

    The result is ``|A & B| / |A | B|``. When both sets are empty the union
    is empty and ``0`` is returned instead of dividing by zero.
    """
    shared = query_tokens.intersection(entry_tokens)
    combined = query_tokens.union(entry_tokens)
    if not combined:
        return 0
    return len(shared) / len(combined)
def calculate_jaccard_distances(df, query):
    """Score every row of *df* by its best Jaccard similarity to *query*.

    Each cell in a row is tokenised and compared with the query tokens; the
    row's score is the highest similarity found among its cells. Despite the
    function name, the values returned are similarities (higher means a
    closer match), not distances.

    Args:
        df: DataFrame whose cells are compared against the query.
        query: The text to match; tokenised with ``tokenize_text``.

    Returns:
        A list of ``(index_label, max_similarity)`` tuples, one per row, in
        row order. A frame with no columns yields a score of ``0`` per row.
    """
    query_tokens = tokenize_text(query)
    results = []
    # itertuples yields plain (index, cell, cell, ...) tuples, which avoids
    # building a Series per row the way iterrows does.
    for index, *entries in df.itertuples(index=True, name=None):
        max_similarity = max(
            (
                compute_jaccard_similarity(query_tokens, tokenize_text(entry))
                for entry in entries
            ),
            # Without a default, a row with no cells would make max() raise.
            default=0,
        )
        results.append((index, max_similarity))
    return results
def find_best_match(distances):
    """Return the ``(index, similarity)`` pair with the highest similarity.

    The first pair wins on ties; ``None`` is returned when *distances* is
    empty.
    """
    def similarity_of(pair):
        return pair[1]

    return max(distances, default=None, key=similarity_of)
def process_data(file_path, query):
    """Load a table, normalise its case and find the row closest to *query*.

    Args:
        file_path: Path passed to ``load_data``.
        query: Text to search for.

    Returns:
        A ``(best_match, df)`` tuple. ``best_match`` is the
        ``(index, similarity)`` pair chosen by ``find_best_match`` (``None``
        when the table has no rows) and ``df`` is the lower-cased DataFrame.
    """
    df = load_data(file_path)
    df = lowercase_all(df)
    # The table has just been lower-cased, so the query must be lower-cased
    # too; otherwise a query like "Grasping Land" could never share a token
    # with any cell.
    if isinstance(query, str):
        query = query.lower()
    distances = calculate_jaccard_distances(df, query)
    best_match = find_best_match(distances)
    return best_match, df
def main():
    """Prompt for a file and a query, then report the best-matching row.

    Reads the file path and the query from standard input, runs
    ``process_data`` and prints the inputs, the best match (index label,
    Jaccard similarity and the matching row) or "No match found", followed
    by the elapsed time in seconds.
    """
    file_path = input("Enter the path to the file: ")
    user_query = input("Enter the query to match: ")
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()
    best_match, df = process_data(file_path, user_query)
    print("File path:", file_path)
    print("Query given:", user_query)
    if best_match is not None:
        index, similarity = best_match
        print("Best Match Found at index:", index)
        print("Jaccard Similarity:", similarity)
        # `index` is an index *label* taken from the frame, so it must be
        # looked up by label (.loc) rather than by position (.iloc).
        print(df.loc[index])
    else:
        print("No match found")
    runtime = time.perf_counter() - start_time
    print(f"Runtime: {runtime} seconds")
# Run the interactive prompt only when executed as a script, not on import.
if __name__ == "__main__":
    main()

File path: /Users/pranavvenkata/Downloads/table.csv
Query given: grasping land
Best Match Found at index: 1102
Jaccard Similarity: 0.5
title                                                         land
author                                            simon winchester
category                                              anthropology
publish_year                               first published in 2021
title_id                                        /works/ol21213126w
author_id                      /authors/ol220363a/simon_winchester
cover_url        //covers.openlibrary.org/b/olid/ol28727176m-m.jpg
book_stats       publish date jan 18, 2022|publisher  harper pe...
descriptions     this edition doesn't have a description yet. c...
reading_stats       2 want to read|0 currently reading|0 have read
Name: 1102, dtype: object
Runtime: 24.860997200012207 seconds
