In [37]:
import pandas as pd
import Levenshtein as lev
from nltk.tokenize import word_tokenize
import time
import os
def load_data(file_path):
    """Load a tabular file into a pandas DataFrame.

    The reader is selected from the file extension. The extension is
    compared case-insensitively so that names such as ``TABLE.CSV`` or
    ``Book.XLSX`` are accepted as well.

    Args:
        file_path: Path to a ``.csv``, ``.xls`` or ``.xlsx`` file.

    Returns:
        The DataFrame returned by ``pd.read_csv`` or ``pd.read_excel``.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    # Normalise the case so that '.CSV' and '.csv' are treated the same.
    file_extension = os.path.splitext(file_path)[1].lower()
    if file_extension == '.csv':
        return pd.read_csv(file_path)
    if file_extension in ('.xls', '.xlsx'):
        return pd.read_excel(file_path)
    raise ValueError("Unsupported file format.")
def lowercase_all(df):
    """Return a copy of *df* with every string cell lower-cased.

    Cells that are not ``str`` instances (numbers, missing values, ...)
    are passed through unchanged. The input DataFrame is not modified.

    Args:
        df: DataFrame whose cells should be normalised.

    Returns:
        A new DataFrame of the same shape with lower-cased string cells.
    """
    # DataFrame.map only exists in pandas >= 2.1; earlier releases expose
    # the same element-wise operation under the name applymap.
    elementwise = getattr(df, "map", None)
    if elementwise is None:
        elementwise = df.applymap
    return elementwise(lambda x: x.lower() if isinstance(x, str) else x)
def tokenize_text(text):
    """Split *text* into tokens with ``word_tokenize``.

    Anything that is not a ``str`` yields an empty token list, so callers
    can pass arbitrary cell values without checking the type first.
    """
    if not isinstance(text, str):
        return []
    return word_tokenize(text)
def compute_levenshtein_distance(query_tokens, entry_tokens):
    """Return the smallest edit distance between any query/entry token pair.

    Every token of *query_tokens* is compared with every token of
    *entry_tokens* using ``lev.distance``. When there is no pair to
    compare (either sequence is empty) the result is ``float('inf')``.
    """
    pairwise = (
        lev.distance(query_token, entry_token)
        for query_token in query_tokens
        for entry_token in entry_tokens
    )
    return min(pairwise, default=float('inf'))
def calculate_distances(df, query):
    """Score every row of *df* against *query*.

    The query is tokenised once. For each row, every cell is tokenised and
    compared with the query tokens via ``compute_levenshtein_distance``;
    the row's score is the smallest distance found in any of its cells.

    Args:
        df: DataFrame to search.
        query: Query text; tokenised with ``tokenize_text``.

    Returns:
        A list of ``(index_label, distance)`` tuples, one per row, in the
        order produced by ``df.iterrows()``. A row with no cells gets
        ``float('inf')``.
    """
    query_tokens = tokenize_text(query)
    no_match = float('inf')
    results = []
    for index, row in df.iterrows():
        # Iterate the row's values directly: this visits each cell exactly
        # once even when column names are duplicated (where row[column]
        # would hand back a Series instead of a single cell).
        cell_distances = (
            compute_levenshtein_distance(query_tokens, tokenize_text(entry))
            for entry in row
        )
        # default= keeps a row without any columns from raising ValueError.
        results.append((index, min(cell_distances, default=no_match)))
    return results
def find_best_match(distances):
    """Pick the entry with the smallest distance.

    Args:
        distances: Iterable of ``(index, distance)`` tuples.

    Returns:
        The tuple with the lowest distance (the first one on ties), or
        ``None`` when *distances* is empty or when even the best distance
        is infinite, i.e. nothing was actually comparable.
    """
    best_match = min(distances, key=lambda x: x[1], default=None)
    # An infinite distance is the "no comparable tokens" marker; reporting
    # such a row as the best match would be misleading.
    if best_match is not None and best_match[1] == float('inf'):
        return None
    return best_match
def process_data(file_path, query):
    """Run the full lookup pipeline for one file and one query.

    Loads the file, lower-cases its string cells, scores every row against
    *query* and selects the best-scoring row.

    Returns:
        A ``(best_match, df)`` tuple, where *best_match* is whatever
        ``find_best_match`` returned and *df* is the lower-cased DataFrame.
    """
    df = lowercase_all(load_data(file_path))
    best_match = find_best_match(calculate_distances(df, query))
    return best_match, df
def main():
    """Prompt for a file and a query, then print the best-matching row.

    Reads both values from standard input, runs ``process_data`` and prints
    the inputs, the matching row (or "No match found") and the elapsed
    time of the processing and printing steps.
    """
    file_path = input("Enter the path to the file: ")
    user_query = input("Enter the query to match: ")
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()
    best_match, df = process_data(file_path, user_query)
    print("File path:", file_path)
    print("Query given:", user_query)
    if best_match is not None:
        index, distance = best_match
        print("Best Match Found at index:", index)
        # The index comes from DataFrame.iterrows(), i.e. it is a label and
        # not a position, so it has to be looked up with .loc.
        print(df.loc[index])
    else:
        print("No match found")
    end_time = time.perf_counter()
    runtime = end_time - start_time
    print(f"Runtime: {runtime} seconds")


if __name__ == "__main__":
    main()


File path: /Users/pranavvenkata/Downloads/table.csv
Query given: netherlands black
Best Match Found at index: 6
title                                 black man in the netherlands
author                                          francio guadeloupe
category                                              anthropology
publish_year                               first published in 2022
title_id                                        /works/ol25915217w
author_id                   /authors/ol8236241a/francio_guadeloupe
cover_url        //covers.openlibrary.org/b/olid/ol34977443m-m.jpg
book_stats       publish date 2022|publisher  university press ...
descriptions     this edition doesn't have a description yet. c...
reading_stats       0 want to read|0 currently reading|0 have read
Name: 6, dtype: object
Runtime: 23.755329847335815 seconds
