In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer


In [2]:
model = SentenceTransformer('all-MiniLM-L6-v2')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
exhibitors_df=pd.read_csv('/content/exhibitors_themes_top5_category.csv').drop(columns=["Unnamed: 0", "Unnamed: 0.1"], errors="ignore")
visitors_df=pd.read_csv('/content/visitors_themes.csv').drop(columns=["Unnamed: 0", "Unnamed: 0.1"], errors="ignore")

In [4]:
def compute_semantic_similarity(text1, text2, model):
    """
    Compute cosine similarity between the embeddings of two texts.
    """
    emb1 = model.encode(text1, normalize_embeddings=True)
    emb2 = model.encode(text2, normalize_embeddings=True)
    return cosine_similarity([emb1], [emb2])[0][0]

In [5]:
def extract_categories(text):
    if isinstance(text, pd.Series):
        text = " | ".join(text.astype(str))
    if not isinstance(text, str) or not text.strip():
        return set()

    # Split only on pipes, commas, or semicolons with optional spaces around them
    categories = re.split(r'\s*[|,;]\s*', text.lower().strip())
    # Remove empty strings after splitting
    return set(filter(None, categories))


In [18]:
def recommend_visitors_for_exhibitor(exhibitor_id, visitors_df, exhibitors_df, model, top_n=7, weight_semantic=0.6, weight_overlap=0.4):

    # 1. Retrieve Exhibitor Data
    exhibitor_rows = exhibitors_df[exhibitors_df['exhibitorid'] == exhibitor_id]

    combined_categories=" | ".join(exhibitor_rows['MainCategories'].unique())

    #Extract exhibitor categories
    exhibitor_extracted_categories = extract_categories(exhibitor_rows['MainCategories'])

    visitor_grouped_df = visitors_df.groupby('visitorId').agg({
    'email': 'first',
    'gender': 'first',
    'questionId': lambda x: " | ".join(x.astype(str)),
    'answer': lambda x: " | ".join(x.astype(str)),
    }).reset_index()


    # Extract visitor categories from the combined answer text.
    # visitor_grouped_df['visitor_extracted_Categories'] = visitor_grouped_df['answer'].apply(lambda ans: extract_categories(ans))

    # 4. Compute Overlap Score between Exhibitor and Visitor Categories
    def compute_category_similarity(exhibitor_categories,visitor_categories):
        #print(exhibitor_categories,visitor_categories)
        if not visitor_categories or not exhibitor_categories:
            return 0
        v_embeddings = model.encode(list(visitor_categories))
        e_embeddings = model.encode(list(exhibitor_categories))
        similarity_matrix = cosine_similarity(v_embeddings, e_embeddings)
        return np.max(similarity_matrix)

    visitor_grouped_df['OverlapScore'] = visitor_grouped_df['answer'].apply(
        lambda x: compute_category_similarity(exhibitor_extracted_categories, extract_categories(x))
    )
    #print(visitor_grouped_df['OverlapScore'])

    # 5. Rank Visitors by Final Score and Select the Top N
    top_visitors = visitor_grouped_df.sort_values(by='OverlapScore', ascending=False).head(top_n)
    print(top_visitors)
    # Return a dictionary mapping visitorId to FinalWeightedScore
    recommendations = dict(zip(top_visitors['visitorId'], top_visitors['OverlapScore']))
    return recommendations


In [19]:
example_exhibitor_id = 92462
example_recommendations = recommend_visitors_for_exhibitor(example_exhibitor_id, visitors_df, exhibitors_df, model, top_n=7)
example_recommendations

                   visitorId                                          email  \
0   0wcaegyyobblhhvfzwyibn0a             daniela.p+203_NfBj_lrIk@bss.com.mk   
23  67b5e0f7774d9e718c7541db                         3990147_SeNs@gmail.com   
25  67b5f1392d21f543a10965f1       aleksandar.dimkov+mitt10_V0iB@bss.com.mk   
4   3p80z1iocd67z0qvg8ju1cc0  aleksandar.dimkov+mitt10_V0iB_2bw2@bss.com.mk   
30  67b70e142d21f543a1096609                  daniela.p+203_NfBj@bss.com.mk   
21  67b47874197e604dd2722d6f                  daniela.p+201_Fwae@bss.com.mk   
74  nwgueloa1n8u9v2eegzvfamb             daniela.p+201_Fwae_HTYS@bss.com.mk   

    gender                                         questionId  \
0        1  5c8a78336d41a10da4f730fe | 5c8a78336d41a10da4f...   
23       1  5c8a78336d41a10da4f730fe | 5c8a78336d41a10da4f...   
25       1  5c8a78336d41a10da4f730fe | 5c8a78336d41a10da4f...   
4        1  5c8a78336d41a10da4f730fe | 5c8a78336d41a10da4f...   
30       0  5c8a78336d41a10da4f730fe | 5c8

{'0wcaegyyobblhhvfzwyibn0a': 0.7569289207458496,
 '67b5e0f7774d9e718c7541db': 0.7569289207458496,
 '67b5f1392d21f543a10965f1': 0.7569289207458496,
 '3p80z1iocd67z0qvg8ju1cc0': 0.7569289207458496,
 '67b70e142d21f543a1096609': 0.7569289207458496,
 '67b47874197e604dd2722d6f': 0.7569289207458496,
 'nwgueloa1n8u9v2eegzvfamb': 0.7569289207458496}