In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer


In [2]:
model = SentenceTransformer('all-MiniLM-L6-v2')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Recommendation Process Steps

1. **Retrieve Visitor Data**  
   - Get the visitor's cluster theme and answer text from the visitors DataFrame.
2. **Extract Visitor Categories**  
   - Use `extract_categories` to extract keywords/topics from the visitor's answer.
3. **Compute Semantic Similarity for Clusters**  
   - Compare the visitor's cluster theme with each exhibitor cluster theme to compute a semantic similarity score.
4. **Select Top Clusters**  
   - Sort clusters by semantic similarity and select the top 8 clusters.
5. **Filter Exhibitors**  
   - Keep only exhibitors belonging to the top clusters.
6. **Calculate Category Overlap Score**  
   - For each selected exhibitor, compute the overlap score by comparing visitor categories with exhibitor categories using embeddings.
7. **Aggregate Scores**  
   - Compute an average overlap score for each exhibitor and map the semantic score based on their cluster.
8. **Compute Weighted Scores**  
   - Multiply the semantic score by the number of categories to get a weighted cluster score.
   - Combine the weighted cluster score and average overlap score using the specified weights (e.g., 60% semantic, 40% overlap).
9. **Select and Return Top Exhibitors**  
   - Rank exhibitors by the final weighted score and return the top recommendations.


In [6]:
exhibitors_df=pd.read_csv('/content/exhibitors_themes_top5_category.csv').drop(columns=["Unnamed: 0", "Unnamed: 0.1"], errors="ignore")
visitors_df=pd.read_csv('/content/visitors_themes.csv').drop(columns=["Unnamed: 0", "Unnamed: 0.1"], errors="ignore")

In [3]:
def compute_semantic_similarity(text1, text2, model):
    """
    Compute cosine similarity between the embeddings of two texts.
    """
    emb1 = model.encode(text1, normalize_embeddings=True)
    emb2 = model.encode(text2, normalize_embeddings=True)
    return cosine_similarity([emb1], [emb2])[0][0]

In [70]:
def extract_categories(text):
    if not isinstance(text, str) or not text.strip():
        return set()
    # Assuming categories are separated by commas, pipes, or spaces (adjust as needed)
    categories = re.split(r'[|,;\s]+', text.lower().strip())
    # Remove empty strings after splitting
    return set(filter(None, categories))

In [93]:
def recommend_exhibitors_for_visitor(visitor_id, visitors_df, exhibitors_df, model, top_n=7, weight_semantic=0.6, weight_overlap=0.4):
    """
    For a given visitor, recommend the top exhibitors based on a two-step process:

      1. Semantic filtering using cluster themes.
      2. Scoring based on category overlap between the visitor's answer and exhibitor's categories.

    Returns a dictionary where keys are exhibitor names and values are the final ranking scores.
    """
    # Retrieve the visitor's row (assumes visitorId is unique)
    visitor_row = visitors_df[visitors_df['visitorId'] == visitor_id]
    if visitor_row.empty:
        return {}

    # Get the visitor's cluster theme and answer text
    visitor_cluster_theme = visitor_row.iloc[0]['Cluster_Theme']   # e.g., "Travel Industry Services"
    visitor_answer = "|".join(visitors_df[visitors_df["visitorId"] == visitor_id]["answer"].tolist())

    print("visitor_cluster_theme: ",visitor_cluster_theme)
    print("visitor_answer: ",visitor_answer)

    # Extract visitor answer categories
    visitor_categories = extract_categories(visitor_answer)
    # print("visitor_categories",len(visitor_categories))

    #For each exhibitor, compute semantic similarity between the visitor's cluster theme and the exhibitor's cluster theme.
    cluster_df = exhibitors_df[['Cluster', 'Cluster_Theme']].drop_duplicates()
    cluster_df['SemanticScore'] = cluster_df['Cluster_Theme'].apply(lambda x: compute_semantic_similarity(visitor_cluster_theme, x, model))

    #select top 8 cluster
    cluster_df = cluster_df.sort_values(by='SemanticScore', ascending=False).head(8)

    #select only the exhibitors from top 8 clusters
    selected_cluster_ids = cluster_df['Cluster'].tolist()
    selected_exhibitors_df = exhibitors_df[exhibitors_df['Cluster'].isin(selected_cluster_ids)]
    # print(selected_exhibitors_df)

    # # For these top exhibitors, compute category overlap between the visitor's answer and the exhibitor's MainCategories.
    def compute_category_similarity(visitor_categories, exhibitor_categories):
        if not visitor_categories or not exhibitor_categories:
            return 0
        v_embeddings = model.encode(list(visitor_categories))
        e_embeddings = model.encode(list(exhibitor_categories))
        similarity_matrix = cosine_similarity(v_embeddings, e_embeddings)
        return np.max(similarity_matrix)  # Get highest similarity score

    selected_exhibitors_df['OverlapScore'] = selected_exhibitors_df['MainCategories'].apply(
        lambda x: compute_category_similarity(visitor_categories, extract_categories(x))
    )
    average_overlap_df = selected_exhibitors_df.groupby('exhibitorid')['OverlapScore'].mean().reset_index()


    selected_exhibitors_df.to_csv('selected_exhibitors_df.csv')

    # print(selected_exhibitors_df)

    scoring_exhibitors_df = selected_exhibitors_df.groupby(['exhibitorid', 'Name', 'Cluster','OverlapScore'])['MainCategories'].nunique().reset_index()

    # print(scoring_exhibitors_df)
    scoring_exhibitors_df = scoring_exhibitors_df.rename(columns={'MainCategories': 'NumberOfCategories'})

    cluster_score_dict=cluster_df.set_index('Cluster')['SemanticScore'].to_dict()
    # print(cluster_score_dict)

    scoring_exhibitors_df['SemanticScore']=scoring_exhibitors_df['Cluster'].map(cluster_score_dict)


    scoring_exhibitors_df['weighted_cluster_score']=scoring_exhibitors_df['NumberOfCategories']*scoring_exhibitors_df['SemanticScore']
    #print(scoring_exhibitors_df)

    weighted_scores = scoring_exhibitors_df.groupby(['exhibitorid', 'Name']).agg({
    'weighted_cluster_score': 'mean',
    'OverlapScore': 'mean'
    }).reset_index()

    # Compute the final weighted score for each exhibitor.
    weighted_scores['FinalWeightedScore'] = (
        weight_semantic * weighted_scores['weighted_cluster_score'] +
        weight_overlap * weighted_scores['OverlapScore']
)

        # Step E: Rank exhibitors by FinalScore and select the top_n
    final_top_exhibitors = weighted_scores.sort_values(by='FinalWeightedScore', ascending=False).head(top_n)

    # Create a dictionary mapping exhibitor name to final score
    recommendations = dict(zip(final_top_exhibitors['Name'], final_top_exhibitors['FinalWeightedScore']))
    return recommendations





In [95]:
# Example call for a specific visitor:
example_visitor_id = "67b5f1392d21f543a10965f1"  # Replace with an actual visitorId
example_recommendations = recommend_exhibitors_for_visitor(example_visitor_id, visitors_df, exhibitors_df, model, top_n=7)
example_recommendations

visitor_cluster_theme:  Tourism Services
visitor_answer:  To source products and services|Visa support|Tour Operator|Joint responsibility|Up to 1 million rubles


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_exhibitors_df['OverlapScore'] = selected_exhibitors_df['MainCategories'].apply(


{'Prime Adventures Journeys': 0.8842186033725739,
 'Global Holidays Expeditions': 0.8446260988712311,
 'Prime Vacations Expeditions': 0.8378853797912598,
 'Royal Tours Expeditions': 0.766187459230423,
 'Dream Getaways Expeditions': 0.7609494268894195,
 'Exotic Tours Journeys': 0.7368350446224212,
 'Exotic Tours Tours': 0.7296145796775817}