In [8]:
import pandas as pd
import numpy as np
import os
import time
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report
from pyxdameraulevenshtein import damerau_levenshtein_distance
def load_data(file_path, max_rows=100):
    """Load the leading rows of a CSV or Excel file into a DataFrame.

    Args:
        file_path: Path to a ``.csv``, ``.xls`` or ``.xlsx`` file. The
            extension is matched case-insensitively.
        max_rows: Maximum number of leading rows to keep. ``None`` keeps
            every row. Defaults to 100.

    Returns:
        A ``pandas.DataFrame`` holding at most ``max_rows`` rows.

    Raises:
        ValueError: If the file extension is not one of the supported ones.
    """
    # Normalise the case so "TABLE.CSV" / "Book.XLSX" are accepted as well.
    file_extension = os.path.splitext(file_path)[1].lower()
    if file_extension == '.csv':
        frame = pd.read_csv(file_path)
    elif file_extension in ('.xls', '.xlsx'):
        frame = pd.read_excel(file_path)
    else:
        raise ValueError("Unsupported file format.")
    return frame if max_rows is None else frame.head(max_rows)
def lowercase_all(df):
    """Return a copy of ``df`` with every string cell lowercased.

    Non-string cells (numbers, NaN, ...) are passed through unchanged, and
    the input frame is not modified.

    Args:
        df: The DataFrame to normalise.

    Returns:
        A new DataFrame of the same shape with lowercased string cells.
    """
    def _lower_if_str(value):
        return value.lower() if isinstance(value, str) else value

    # DataFrame.map only exists from pandas 2.1 on; older releases expose the
    # same element-wise operation as applymap. The lookup is done on the class
    # so that a column literally named "map" cannot shadow the method.
    if hasattr(pd.DataFrame, "map"):
        return df.map(_lower_if_str)
    return df.applymap(_lower_if_str)
def generate_embeddings(model, df):
    """Encode every column of ``df`` with ``model``.

    Each column is converted to a list of strings, passed to
    ``model.encode(..., convert_to_tensor=True)`` and the result is moved to
    the CPU and converted to a NumPy array.

    Args:
        model: Object exposing an ``encode`` method as used above.
        df: DataFrame whose columns are to be encoded.

    Returns:
        A dict mapping each column name to its array of embeddings.
    """
    def _encode(texts):
        return model.encode(texts, convert_to_tensor=True).cpu().numpy()

    return {
        name: _encode(df[name].astype(str).tolist())
        for name in df.columns
    }
def compute_damerau_levenshtein_similarity(query, entry):
    """Return a length-normalised Damerau-Levenshtein similarity.

    The edit distance between ``query`` and ``entry`` is divided by the
    length of the longer string and subtracted from 1.

    Args:
        query: First string.
        entry: Second string.

    Returns:
        ``1.0`` when both strings are empty, otherwise
        ``1 - distance / max(len(query), len(entry))``.
    """
    longest = max(map(len, (query, entry)))
    if not longest:
        # Two empty strings are treated as a perfect match; this also avoids
        # dividing by zero below.
        return 1.0
    return 1 - (damerau_levenshtein_distance(query, entry) / longest)
def calculate_semantic_similarities(df, query, model, threshold=0.5):
    """Score every row of ``df`` against ``query``.

    For each cell, the dot product between the cell embedding and the query
    embedding is averaged with the normalised Damerau-Levenshtein similarity
    of the two strings. A row's score is the best score among its cells.

    Args:
        df: DataFrame to search.
        query: Query string.
        model: Object exposing ``encode`` as used by ``generate_embeddings``.
        threshold: Minimum row score required for a row to be reported.
            Defaults to 0.5.

    Returns:
        A list of ``(row_position, score)`` tuples, in row order, for the rows
        whose score is at least ``threshold``. ``row_position`` is the 0-based
        position of the row (suitable for ``DataFrame.iloc``), and ``score``
        is a Python float. An empty list is returned when ``df`` has no
        columns or no row reaches the threshold.
    """
    query_vector = (
        model.encode([query], convert_to_tensor=True).cpu().numpy().reshape(-1)
    )
    embeddings = generate_embeddings(model, df)

    # Best combined score seen so far for each row, addressed by position.
    # Positions (not index labels) are used because the embedding arrays are
    # positional; index labels only coincide with them for a default index.
    best_scores = [None] * len(df)

    for column in df.columns:
        column_embeddings = embeddings[column]
        # Use the same string form that generate_embeddings encodes, so the
        # semantic and the edit-distance halves score identical text.
        entries = df[column].astype(str).tolist()
        for position, entry in enumerate(entries):
            # NOTE(review): a plain dot product equals cosine similarity only
            # if the model emits unit-length vectors — confirm for the model
            # in use.
            semantic_similarity = float(
                np.dot(column_embeddings[position], query_vector)
            )
            damerau_levenshtein_similarity = (
                compute_damerau_levenshtein_similarity(query, entry)
            )
            combined_similarity = (
                semantic_similarity + damerau_levenshtein_similarity
            ) / 2
            current_best = best_scores[position]
            if current_best is None or combined_similarity > current_best:
                best_scores[position] = combined_similarity

    return [
        (position, score)
        for position, score in enumerate(best_scores)
        if score is not None and score >= threshold
    ]
def find_best_match(similarities):
    """Pick the entry with the highest score.

    Args:
        similarities: Iterable of ``(index, score)`` pairs.

    Returns:
        The pair with the largest score (the first one on ties), or ``None``
        when ``similarities`` is empty.
    """
    winner = None
    for candidate in similarities:
        # Strict ">" keeps the earliest entry among equal scores.
        if winner is None or candidate[1] > winner[1]:
            winner = candidate
    return winner
def compute_metrics(df, best_match):
    """Build a classification report for a single predicted row.

    The ``ground_truth`` column of ``df`` is used as the true labels; the
    prediction marks the row whose position equals ``best_match[0]`` with 1
    and every other row with 0.

    Args:
        df: DataFrame containing a ``ground_truth`` column.
        best_match: ``(index, score)`` pair identifying the predicted row.

    Returns:
        The ``classification_report`` output as a transposed DataFrame.
    """
    predicted_row = best_match[0]
    y_pred = []
    for position in range(len(df)):
        y_pred.append(1 if position == predicted_row else 0)
    y_true = df['ground_truth'].tolist()
    report_dict = classification_report(y_true, y_pred, output_dict=True)
    report_frame = pd.DataFrame(report_dict)
    return report_frame.transpose()
def process_data(file_path, query):
    """Load a table, find the row best matching ``query`` and evaluate it.

    Args:
        file_path: Path to the CSV/Excel file to search.
        query: Query string to match against the table cells.

    Returns:
        A tuple ``(best_match, df, metrics)`` where ``best_match`` is an
        ``(index, score)`` pair or ``None``, ``df`` is the lowercased table
        with an added ``ground_truth`` column, and ``metrics`` is the
        classification report DataFrame (empty when there is no match).
    """
    df = load_data(file_path)
    df = lowercase_all(df)
    model = SentenceTransformer('all-MiniLM-L6-v2', tokenizer_kwargs={'clean_up_tokenization_spaces': True})
    # The table was lowercased above, so lowercase the query as well;
    # otherwise the case-sensitive edit distance penalises capital letters.
    normalized_query = query.lower() if isinstance(query, str) else query
    # Search BEFORE adding the label column, so 'ground_truth' is not embedded
    # and scored as if it were a searchable field.
    similarities = calculate_semantic_similarities(df, normalized_query, model)
    best_match = find_best_match(similarities)
    # NOTE(review): the labels mark the first row as the only correct answer
    # regardless of the query — confirm this is the intended evaluation setup.
    df['ground_truth'] = [1 if i == 0 else 0 for i in range(len(df))]
    metrics = compute_metrics(df, best_match) if best_match is not None else pd.DataFrame()
    return best_match, df, metrics
def main():
    """Prompt for a file and a query, then print the best match and metrics.

    Reads the file path and the query from standard input, runs
    ``process_data`` and prints the inputs, the matching row (or a "no match"
    message), the elapsed time and the classification report.
    """
    file_path = input("Enter the path to the file: ")
    user_query = input("Enter the query to match: ")

    started_at = time.time()
    best_match, df, metrics = process_data(file_path, user_query)

    # Both outcomes start by echoing the inputs.
    print("File path:", file_path)
    print("Query given:", user_query)
    if best_match is None:
        print("Sorry, No match found with the input query, Please try again!")
    else:
        index, similarity = best_match
        print("Best Match Found at index:", index)
        print("Similarity Score:", similarity)
        # Hide the helper label column when showing the matched row.
        visible = df.drop(columns=['ground_truth'])
        print(visible.iloc[index])

    runtime = time.time() - started_at
    print(f"Runtime: {runtime} seconds")
    print("Classification Report:")
    print(metrics)
# Run the interactive matcher only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

File path: /Users/pranavvenkata/Downloads/table.csv
Query given: black netherlands the man in
Best Match Found at index: 6
Similarity Score: 0.6661016089575631
title                                 black man in the netherlands
author                                          francio guadeloupe
category                                              anthropology
publish_year                               first published in 2022
title_id                                        /works/ol25915217w
author_id                   /authors/ol8236241a/francio_guadeloupe
cover_url        //covers.openlibrary.org/b/olid/ol34977443m-m.jpg
book_stats       publish date 2022|publisher  university press ...
descriptions     this edition doesn't have a description yet. c...
reading_stats       0 want to read|0 currently reading|0 have read
Name: 6, dtype: object
Runtime: 5.0584001541137695 seconds
Classification Report:
              precision    recall  f1-score  support
0              0.989899  0.989899  