In [4]:
import pandas as pd
import numpy as np
import os
import time
from sentence_transformers import SentenceTransformer
def load_data(file_path):
    """Load the first 100 rows of a CSV or Excel file into a DataFrame.

    The reader is picked from the file extension. The extension is compared
    case-insensitively, so ``TABLE.CSV`` or ``report.XLSX`` are accepted just
    like their lower-case spellings.

    Args:
        file_path: Path to a ``.csv``, ``.xls`` or ``.xlsx`` file.

    Returns:
        A DataFrame holding at most the first 100 rows of the file.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    # splitext returns the extension exactly as typed, so normalise its case
    # before comparing against the lower-case names below.
    file_extension = os.path.splitext(file_path)[1].lower()
    if file_extension == '.csv':
        return pd.read_csv(file_path).head(100)
    if file_extension in ('.xls', '.xlsx'):
        return pd.read_excel(file_path).head(100)
    raise ValueError("Unsupported file format.")
def lowercase_all(df):
    """Return a copy of *df* with every string cell converted to lower case.

    Cells that are not ``str`` instances (numbers, missing values, ...) are
    passed through untouched. The input frame is not modified.

    Args:
        df: The DataFrame to normalise.

    Returns:
        A new DataFrame of the same shape with lower-cased string cells.
    """
    def _lower_if_str(value):
        return value.lower() if isinstance(value, str) else value

    # DataFrame.map only exists from pandas 2.1 on; earlier releases expose the
    # same element-wise operation as applymap. Look the method up on the class
    # so that a column that happens to be called "map" cannot shadow it.
    frame_type = type(df)
    elementwise = getattr(frame_type, "map", None)
    if elementwise is None:
        elementwise = frame_type.applymap
    return elementwise(df, _lower_if_str)
def generate_embeddings(model, df):
    """Encode every column of *df* and return the results keyed by column name.

    Each column is converted to strings with ``astype(str)`` and handed to
    ``model.encode`` as one list, so non-string cells are embedded via their
    string representation.

    Args:
        model: Object exposing ``encode(texts, convert_to_tensor=True)`` whose
            result supports ``.cpu().numpy()``.
        df: DataFrame whose columns are to be embedded.

    Returns:
        A dict mapping each column name to the NumPy array produced for that
        column (presumably one embedding row per DataFrame row - confirm
        against the model in use).
    """
    def _encode_column(name):
        texts = df[name].astype(str).tolist()
        encoded = model.encode(texts, convert_to_tensor=True)
        # Move off any accelerator before converting to NumPy.
        return encoded.cpu().numpy()

    return {name: _encode_column(name) for name in df.columns}
def calculate_semantic_similarities(df, query, model, threshold=0.5):
    """Score every row of *df* against *query* and keep the rows that match.

    Every cell is embedded (via ``generate_embeddings``) and compared with the
    query embedding using a dot product. A row's score is the highest score
    among its cells; rows scoring below *threshold* are dropped.

    NOTE(review): the score is a raw dot product. It equals cosine similarity
    only if the model returns unit-length vectors - confirm for the model in
    use before relying on the threshold as a cosine cutoff.

    Args:
        df: DataFrame to search.
        query: Query text to compare against each cell.
        model: Object exposing ``encode(texts, convert_to_tensor=True)`` whose
            result supports ``.cpu().numpy()``.
        threshold: Minimum row score required for a row to be returned.

    Returns:
        A list of ``(index_label, score)`` tuples in row order, one for each
        row whose best cell score is at least *threshold*. Empty when *df* has
        no rows or no columns.
    """
    # Nothing to score; also avoids encoding an empty batch.
    if df.empty:
        return []

    query_embedding = model.encode([query], convert_to_tensor=True).cpu().numpy()
    query_vector = query_embedding.reshape(-1)
    embeddings = generate_embeddings(model, df)

    # One column of scores per DataFrame column -> shape (n_rows, n_columns).
    # Rows are matched to their embeddings by position, so the result is
    # correct even when the DataFrame index is not 0..n-1.
    scores_by_column = np.column_stack(
        [embeddings[column] @ query_vector for column in df.columns]
    )
    best_per_row = scores_by_column.max(axis=1)

    return [
        (label, score)
        for label, score in zip(df.index, best_per_row)
        if score >= threshold
    ]
def find_best_match(similarities):
    """Pick the entry with the highest score from *similarities*.

    Args:
        similarities: Iterable of pairs whose second element is the score.

    Returns:
        The pair with the largest score (the earliest one on ties), or
        ``None`` when *similarities* is empty.
    """
    best_match = None
    for candidate in similarities:
        # Strict comparison keeps the first of several equally good entries.
        if best_match is None or candidate[1] > best_match[1]:
            best_match = candidate
    return best_match
def process_data(file_path, query):
    """Load a table, lower-case it and find the row that best matches *query*.

    Args:
        file_path: Path of the file passed to ``load_data``.
        query: Query text to match against the table cells.

    Returns:
        A ``(best_match, df)`` tuple: ``best_match`` is whatever
        ``find_best_match`` returns for the computed similarities, and ``df``
        is the lower-cased DataFrame that was searched.
    """
    df = lowercase_all(load_data(file_path))
    model = SentenceTransformer(
        'all-MiniLM-L6-v2',
        tokenizer_kwargs={'clean_up_tokenization_spaces': True},
    )
    best_match = find_best_match(
        calculate_semantic_similarities(df, query, model)
    )
    return best_match, df
def main():
    """Prompt for a file and a query, then print the best matching row.

    Reads the file path and the query from standard input, runs
    ``process_data`` and prints the file path, the query, the outcome (the
    matching row and its score, or a "no match" message) and the elapsed time.
    """
    file_path = input("Enter the path to the file: ")
    user_query = input("Enter the query to match: ")

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments while the search is running.
    start_time = time.perf_counter()
    best_match, df = process_data(file_path, user_query)

    # These two lines are printed for both outcomes.
    print("File path:", file_path)
    print("Query given:", user_query)

    if best_match is None:
        print("Sorry, No match found with the input query, Please try again!")
    else:
        index, similarity = best_match
        print("Best Match Found at index:", index)
        print("Similarity Score:", similarity)
        # The index in best_match is a row label, so look the row up by label
        # rather than by position.
        print(df.loc[index])

    runtime = time.perf_counter() - start_time
    print(f"Runtime: {runtime} seconds")
# Run the interactive search only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

File path: /Users/pranavvenkata/Downloads/table.csv
Query given: black netherlands man in the 
Best Match Found at index: 6
Similarity Score: 0.9612266
title                                 black man in the netherlands
author                                          francio guadeloupe
category                                              anthropology
publish_year                               first published in 2022
title_id                                        /works/ol25915217w
author_id                   /authors/ol8236241a/francio_guadeloupe
cover_url        //covers.openlibrary.org/b/olid/ol34977443m-m.jpg
book_stats       publish date 2022|publisher  university press ...
descriptions     this edition doesn't have a description yet. c...
reading_stats       0 want to read|0 currently reading|0 have read
Name: 6, dtype: object
Runtime: 6.689584970474243 seconds
