In [None]:
import re
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer


In [None]:
# Sentence-embedding model shared by the similarity helpers and the recommender below.
# NOTE(review): presumably fetched from the Hugging Face Hub on first use (see the
# authentication notice in this cell's output) — confirm network access when re-running.
model = SentenceTransformer('all-MiniLM-L6-v2')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Load the exhibitor and visitor tables; leftover index columns written by earlier
# CSV exports are dropped when present (errors="ignore" tolerates their absence).
exhibitors_df = (
    pd.read_csv('/content/exhibitors_themes_top5_category.csv')
    .drop(columns=["Unnamed: 0", "Unnamed: 0.1"], errors="ignore")
)
visitors_df = (
    pd.read_csv('/content/visitors_themes.csv')
    .drop(columns=["Unnamed: 0", "Unnamed: 0.1"], errors="ignore")
)

In [None]:
def compute_semantic_similarity(text1, text2, model):
    """
    Return the cosine similarity of the embeddings of two texts.

    Each text is encoded with ``model.encode(..., normalize_embeddings=True)``;
    the single entry of the resulting 1x1 similarity matrix is returned.
    """
    # Encode in argument order: text1 first, then text2.
    first_vec, second_vec = (
        model.encode(text, normalize_embeddings=True) for text in (text1, text2)
    )
    pairwise = cosine_similarity([first_vec], [second_vec])
    return pairwise[0][0]

In [None]:
def extract_categories(text):
    """
    Split delimited text into a set of lower-cased category names.

    Parameters
    ----------
    text : str or pandas.Series
        A string whose categories are separated by ``|``, ``,`` or ``;``, or a
        Series of such strings. Series entries are joined with " | " before
        splitting. Missing entries (NaN/None) are dropped first so they do not
        become the literal categories "nan"/"none".

    Returns
    -------
    set of str
        The distinct, non-empty, lower-cased categories. An empty set is
        returned for blank strings and for anything that is neither a string
        nor a Series.
    """
    if isinstance(text, pd.Series):
        # dropna() before astype(str): stringifying NaN would yield "nan".
        text = " | ".join(text.dropna().astype(str))
    if not isinstance(text, str) or not text.strip():
        return set()

    # Split only on pipes, commas, or semicolons, swallowing the whitespace around them.
    categories = re.split(r'\s*[|,;]\s*', text.lower().strip())
    # Consecutive delimiters produce empty strings; discard them.
    return {category for category in categories if category}


# Recommendation Process for Visitors for an Exhibitor

This function recommends the top visitors for a given exhibitor by scoring how closely the categories extracted from each visitor's combined answers match the exhibitor's main categories. The process is as follows:

1. **Retrieve Exhibitor Data:**  
   - Filter the exhibitors DataFrame to get all rows corresponding to the specified exhibitor.
   - Combine the unique `MainCategories` values into a single string (using a separator such as " | ").

2. **Extract Exhibitor Categories:**  
   - Use the `extract_categories` function to extract a set of keywords/categories from the exhibitor’s main categories.

3. **Group Visitor Data by Visitor ID:**  
   - Group the visitors DataFrame by `visitorId` and combine all answers for each visitor into one text string.
   - Other fields such as `email` and `gender` are aggregated by taking the first occurrence (assuming they are consistent per visitor).

4. **Compute Overlap Score:**  
   - For each visitor, extract categories from their combined answer text using the `extract_categories` function.
   - Compute an overlap score by comparing the visitor's extracted categories with the exhibitor's categories.
   - The comparison embeds every category with the embedding model and takes the maximum cosine similarity over all visitor-category/exhibitor-category pairs; the score is 0 when either side has no categories.

5. **Rank and Recommend:**  
   - Sort the visitors by their overlap score in descending order.
   - Return the top N visitors (set by the `top_n` parameter) as a dictionary that maps each `visitorId` to its overlap score.


In [None]:
def recommend_visitors_for_exhibitor(exhibitor_id, visitors_df, exhibitors_df, model, top_n=7, weight_semantic=0.6, weight_overlap=0.4):
    """
    Rank visitors for one exhibitor by category similarity.

    Parameters
    ----------
    exhibitor_id
        Value matched against ``exhibitors_df['exhibitorid']``.
    visitors_df : pandas.DataFrame
        One row per visitor answer; the columns ``visitorId``, ``email``,
        ``gender``, ``questionId`` and ``answer`` are read.
    exhibitors_df : pandas.DataFrame
        Exhibitor rows; the columns ``exhibitorid`` and ``MainCategories`` are read.
    model
        Object whose ``encode(list_of_str)`` returns one embedding per string.
    top_n : int
        Maximum number of visitors to return.
    weight_semantic, weight_overlap : float
        Accepted for backward compatibility; the current scoring does not use them.

    Returns
    -------
    dict
        ``visitorId`` -> score (Python float), in descending score order. The
        score is the maximum cosine similarity between any visitor category and
        any exhibitor category, or 0.0 when either side has no categories.
    """
    # 1. Retrieve exhibitor data
    exhibitor_rows = exhibitors_df[exhibitors_df['exhibitorid'] == exhibitor_id]

    # 2. Extract exhibitor categories (missing values are skipped so they do not
    #    become a literal "nan" category)
    exhibitor_extracted_categories = extract_categories(exhibitor_rows['MainCategories'].dropna())

    # The exhibitor side is identical for every visitor: encode it once up front.
    exhibitor_embeddings = (
        model.encode(list(exhibitor_extracted_categories))
        if exhibitor_extracted_categories
        else None
    )

    # 3. Group visitor data: one row per visitor with all answers joined into one string
    visitor_grouped_df = visitors_df.groupby('visitorId').agg({
        'email': 'first',
        'gender': 'first',
        'questionId': lambda x: " | ".join(x.astype(str)),
        # Skip missing answers; astype(str) would otherwise inject "nan" as a category.
        'answer': lambda x: " | ".join(x.dropna().astype(str)),
    }).reset_index()

    # 4. Compute the overlap score between exhibitor and visitor categories
    def compute_category_similarity(visitor_categories):
        if exhibitor_embeddings is None or not visitor_categories:
            return 0.0
        visitor_embeddings = model.encode(list(visitor_categories))
        similarity_matrix = cosine_similarity(visitor_embeddings, exhibitor_embeddings)
        # Best match over every visitor-category / exhibitor-category pair.
        return float(np.max(similarity_matrix))

    visitor_grouped_df['OverlapScore'] = visitor_grouped_df['answer'].apply(
        lambda answer: compute_category_similarity(extract_categories(answer))
    )

    # 5. Rank visitors by OverlapScore and keep the top N. A stable sort keeps
    #    tied visitors in their grouped order, so repeated runs pick the same ones.
    top_visitors = visitor_grouped_df.sort_values(
        by='OverlapScore', ascending=False, kind='stable'
    ).head(top_n)

    # Map each visitorId to its OverlapScore
    recommendations = dict(zip(top_visitors['visitorId'], top_visitors['OverlapScore']))
    return recommendations


In [None]:
# Example run: top-7 visitor recommendations for a single exhibitor id,
# using the real embedding model and the full visitor/exhibitor tables.
example_exhibitor_id = 92462
example_recommendations = recommend_visitors_for_exhibitor(example_exhibitor_id, visitors_df, exhibitors_df, model, top_n=7)
example_recommendations

                   visitorId                                          email  \
0   0wcaegyyobblhhvfzwyibn0a             daniela.p+203_NfBj_lrIk@bss.com.mk   
23  67b5e0f7774d9e718c7541db                         3990147_SeNs@gmail.com   
25  67b5f1392d21f543a10965f1       aleksandar.dimkov+mitt10_V0iB@bss.com.mk   
4   3p80z1iocd67z0qvg8ju1cc0  aleksandar.dimkov+mitt10_V0iB_2bw2@bss.com.mk   
30  67b70e142d21f543a1096609                  daniela.p+203_NfBj@bss.com.mk   
21  67b47874197e604dd2722d6f                  daniela.p+201_Fwae@bss.com.mk   
74  nwgueloa1n8u9v2eegzvfamb             daniela.p+201_Fwae_HTYS@bss.com.mk   

    gender                                         questionId  \
0        1  5c8a78336d41a10da4f730fe | 5c8a78336d41a10da4f...   
23       1  5c8a78336d41a10da4f730fe | 5c8a78336d41a10da4f...   
25       1  5c8a78336d41a10da4f730fe | 5c8a78336d41a10da4f...   
4        1  5c8a78336d41a10da4f730fe | 5c8a78336d41a10da4f...   
30       0  5c8a78336d41a10da4f730fe | 5c8

{'0wcaegyyobblhhvfzwyibn0a': 0.7569289207458496,
 '67b5e0f7774d9e718c7541db': 0.7569289207458496,
 '67b5f1392d21f543a10965f1': 0.7569289207458496,
 '3p80z1iocd67z0qvg8ju1cc0': 0.7569289207458496,
 '67b70e142d21f543a1096609': 0.7569289207458496,
 '67b47874197e604dd2722d6f': 0.7569289207458496,
 'nwgueloa1n8u9v2eegzvfamb': 0.7569289207458496}

**Test Cases**

In [None]:
import unittest
from io import StringIO

In [None]:
class DummyModel:
    """Deterministic stand-in for the embedding model, for use in tests."""

    def encode(self, texts, normalize_embeddings=False, **kwargs):
        """
        Convert each string into a 2-element vector:

            vector = [length of text, sum of character code points mod 10]

        Parameters
        ----------
        texts : str or iterable of str
            A single string yields one 1-D vector; an iterable yields a 2-D
            array with one row per string.
        normalize_embeddings : bool
            When True, each vector is scaled to unit L2 norm (all-zero vectors
            are left unchanged).
        **kwargs
            Ignored. Accepted so call sites that pass extra encode options to
            the real model also work with this stand-in.

        Returns
        -------
        numpy.ndarray
        """
        # A bare string must be treated as one text, not as an iterable of characters.
        single = isinstance(texts, str)
        batch = [texts] if single else texts

        vectors = np.array(
            [[len(text), sum(ord(c) for c in text) % 10] for text in batch]
        )

        if normalize_embeddings and vectors.size:
            vectors = vectors.astype(float)
            norms = np.linalg.norm(vectors, axis=1, keepdims=True)
            norms[norms == 0] = 1.0  # avoid dividing the empty-string vector by zero
            vectors = vectors / norms

        return vectors[0] if single else vectors

In [None]:
# Test fixture data, generated using an LLM
test_exhibitor_df = pd.read_csv('/content/test_exhibitor_df.csv')
test_visitors_df = pd.read_csv('/content/test_visitors_df.csv')

In [None]:
# Display the exhibitor test fixture.
test_exhibitor_df

Unnamed: 0.1,Unnamed: 0,exhibitorid,Name,MainCategories,Cluster,Cluster_Theme
0,0,92840,Global Horizons,hotel/hotel chain/inn,10,Hospitality Services
1,1,92841,Global Horizons,industrial tourism,8,Tourism Specialties
2,2,92842,Sunrise Expeditions,resort operator,10,Hospitality Services
3,3,92843,Sunrise Expeditions,conference center,14,Event Services
4,4,92844,TravelX Solutions,online travel agency,16,Travel Booking Solutions
5,5,92845,TravelX Solutions,destination management company,15,Travel Services
6,6,92846,Urban Escapes,car rental,2,Transport and Payments
7,7,92847,Urban Escapes,travel insurance,4,Financial Services
8,8,92848,Global Explorer,gastro tourism,8,Tourism Specialties
9,9,92849,Global Explorer,cruise line,2,Transport and Payments


In [None]:
# Preview the first rows of the visitor test fixture.
test_visitors_df.head()

Unnamed: 0.1,Unnamed: 0,email,gender,visitorId,stepId,questionId,answerValue,answerId,answerTypeId,question,answer,Cluster,Cluster_Theme
0,0,john.doe@example.com,M,V001,S001,Q001,,A001,AT001,What is your job?,Tour operator,2,Travel Industry Services
1,1,jane.smith@example.com,F,V002,S002,Q002,,A002,AT002,Why attend?,Travel Agent; Tour operator,2,Travel Industry Services
2,2,alex.brown@example.com,M,V003,S003,Q001,,A003,AT001,What is your job?,Independent Travel Consultant,3,Travel Services
3,3,emily.white@example.com,F,V004,S004,Q002,,A004,AT002,Why attend?,Industry Networking,1,Event Management
4,4,michael.johnson@example.com,M,V005,S005,Q001,,A005,AT001,What is your job?,Hotel Supplier,4,Hospitality Services


In [None]:
class TestRecommendationFunctions(unittest.TestCase):
    """Unit tests for ``extract_categories`` and ``recommend_visitors_for_exhibitor``."""

    def setUp(self):
        # Fixture frames loaded in an earlier cell, plus a deterministic stand-in model.
        self.visitors_df = test_visitors_df
        self.exhibitors_df = test_exhibitor_df

        self.model = DummyModel()

    def test_extract_categories_string(self):
        # Pipe-separated text is split on the pipes only; "/" stays inside a category.
        text = "apartments/residential hotel | mass market tour operators | independent travel agency"
        expected = {
            'apartments/residential hotel',
            'mass market tour operators',
            'independent travel agency'
        }
        result = extract_categories(text)
        self.assertEqual(result, expected)

    def test_extract_categories_series(self):
        # A pandas Series is flattened: every entry contributes its categories.
        series = pd.Series(["cat1 | cat2", "cat3"])
        expected = {'cat1', 'cat2', 'cat3'}
        result = extract_categories(series)
        self.assertEqual(result, expected)

    def test_recommend_visitors_for_exhibitor(self):
        top_n = 7
        recommendations = recommend_visitors_for_exhibitor(
            92848, self.visitors_df, self.exhibitors_df, self.model, top_n=top_n
        )
        self.assertIsInstance(recommendations, dict)

        # Never more recommendations than requested.
        self.assertLessEqual(len(recommendations), top_n)

        # Every recommended id must exist in the fixture, and every score must be a float.
        known_visitor_ids = set(self.visitors_df["visitorId"])
        for visitor_id, score in recommendations.items():
            self.assertIn(visitor_id, known_visitor_ids)
            self.assertIsInstance(score, float)

        # The dict preserves ranking order, so scores must be non-increasing.
        scores = list(recommendations.values())
        self.assertEqual(scores, sorted(scores, reverse=True))

# Run the tests inside the notebook: a placeholder argv stops unittest from parsing
# the kernel's own command-line arguments, and exit=False stops it from calling
# sys.exit() when the run finishes.
if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)

...
----------------------------------------------------------------------
Ran 3 tests in 0.131s

OK


Exhibitor data for 92848

    Unnamed: 0  exhibitorid             Name  MainCategories  Cluster  \
8           8        92848  Global Explorer  gastro tourism        8   

         Cluster_Theme  
8  Tourism Specialties  
   visitorId                        email gender questionId  \
17      V018         mia.hall@example.com      F       Q002   
7       V008      lisa.wilson@example.com      F       Q002   
10      V011     chris.thomas@example.com      M       Q001   
11      V012     sophia.brown@example.com      F       Q002   
3       V004      emily.white@example.com      F       Q002   
19      V020  charlotte.young@example.com      F       Q002   
4       V005  michael.johnson@example.com      M       Q001   

                            answer  OverlapScore  
17       Networking with hoteliers      0.999997  
7            Meet industry experts      0.999780  
10           Cruise Line Executive      0.999774  
11          Sourcing new suppliers      0.999389  
3              Ind