In [None]:
#install libraries
!pip install --upgrade lightgbm -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.6/3.6 MB[0m [31m117.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [20]:
!pip install sentence-transformers -q

In [None]:
#Imports necessary Python libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import lightgbm as lgb
import joblib

In [None]:
#Mounts Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load Dataset
user_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/User_Preference.csv")
group_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Group_Dataset.csv")
interaction_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/User-Group_Interactions_Dataset.csv")

In [None]:
# Step 1: Preprocess multi-label fields
def split_multi(x):
    if pd.isna(x):
        return []
    return [i.strip() for i in x.split(',')] if isinstance(x, str) else []

user_df['User_Interest'] = user_df['User_Interest'].apply(split_multi)
user_df['Preferred_Destination'] = user_df['Preferred_Destination'].apply(split_multi)
group_df['Group_Interest'] = group_df['Group_Interest'].apply(split_multi)
group_df['Destinations_Planned'] = group_df['Destinations_Planned'].apply(split_multi)

In [None]:
#Step 2: Merge DataFrames into a single DataFrame
df = interaction_df.merge(user_df, on='User_ID').merge(group_df, on='Group_ID')

In [None]:
#Step 3: Ensure no missing values
f['Budget_x'] = df['Budget_x'].fillna('Unknown')
df['Travel_Style_x'] = df['Travel_Style_x'].fillna('Unknown')
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Current_Members'] = df['Current_Members'].fillna(df['Current_Members'].median())

In [None]:
#Step 4: Captures the semantic similarity
sbert = SentenceTransformer('all-MiniLM-L6-v2')

df['User_Interest_Text'] = df['User_Interest'].apply(lambda x: ', '.join(x))
df['Group_Interest_Text'] = df['Group_Interest'].apply(lambda x: ', '.join(x))
df['User_Destination_Text'] = df['Preferred_Destination'].apply(lambda x: ', '.join(x))
df['Group_Destination_Text'] = df['Destinations_Planned'].apply(lambda x: ', '.join(x))

user_interest_emb = sbert.encode(df['User_Interest_Text'].tolist(), convert_to_tensor=True)
group_interest_emb = sbert.encode(df['Group_Interest_Text'].tolist(), convert_to_tensor=True)
user_dest_emb = sbert.encode(df['User_Destination_Text'].tolist(), convert_to_tensor=True)
group_dest_emb = sbert.encode(df['Group_Destination_Text'].tolist(), convert_to_tensor=True)

df['Interest_Similarity'] = [
    float(np.dot(u.cpu().numpy(), g.cpu().numpy()) /
          (np.linalg.norm(u.cpu().numpy()) * np.linalg.norm(g.cpu().numpy())))
    for u, g in zip(user_interest_emb, group_interest_emb)
]

df['Destination_Similarity'] = [
    float(np.dot(u.cpu().numpy(), g.cpu().numpy()) /
          (np.linalg.norm(u.cpu().numpy()) * np.linalg.norm(g.cpu().numpy())))
    for u, g in zip(user_dest_emb, group_dest_emb)
]


In [None]:
#Step 5: Encodes categorical features
onehot_budget = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
onehot_style = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
budget_encoded = onehot_budget.fit_transform(df[['Budget_x']])
style_encoded = onehot_style.fit_transform(df[['Travel_Style_x']])

scaler = MinMaxScaler()
age_scaled = scaler.fit_transform(df[['Age']])
members_scaled = scaler.fit_transform(df[['Current_Members']])

# Save encoders + scalers for inference
joblib.dump(onehot_budget, '/content/drive/MyDrive/budget_encoder.pkl')
joblib.dump(onehot_style, '/content/drive/MyDrive/style_encoder.pkl')
joblib.dump(scaler, '/content/drive/MyDrive/feature_scaler.pkl')

['/content/drive/MyDrive/feature_scaler.pkl']

In [None]:
#Step 6: Prepares data for the ranking model
X = np.hstack([
    age_scaled,
    budget_encoded,
    style_encoded,
    df[['Interest_Similarity', 'Destination_Similarity']].values,
    members_scaled
])
y = (df['Joined'] == 'Yes').astype(int).values
user_ids = df['User_ID'].values

In [None]:
#Step 6: Prepares data for evaluation and training
train_users, test_users = train_test_split(np.unique(user_ids), test_size=0.2, random_state=42)
train_mask = df['User_ID'].isin(train_users)

X_train, X_test = X[train_mask], X[~train_mask]
y_train, y_test = y[train_mask], y[~train_mask]
train_group = df[train_mask].groupby('User_ID').size().tolist()
test_group = df[~train_mask].groupby('User_ID').size().tolist()
test_user_ids = user_ids[~train_mask]


In [None]:
#Step 7:  Builds a model that ranks groups
train_data = lgb.Dataset(X_train, label=y_train, group=train_group)
valid_data = lgb.Dataset(X_test, label=y_test, group=test_group)

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'learning_rate': 0.1,
    'max_depth': 7,
    'verbosity': -1,
    'random_state': 42
}

callbacks = [
    lgb.early_stopping(stopping_rounds=10),
    lgb.log_evaluation(period=10)
]

ranker = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=150,
    valid_sets=[valid_data],
    valid_names=['validation'],
    callbacks=callbacks
)

Training until validation scores don't improve for 10 rounds
[10]	validation's ndcg@1: 0.66	validation's ndcg@2: 0.735712	validation's ndcg@3: 0.853632	validation's ndcg@4: 0.853632	validation's ndcg@5: 0.853632
Early stopping, best iteration is:
[1]	validation's ndcg@1: 0.73	validation's ndcg@2: 0.775178	validation's ndcg@3: 0.883427	validation's ndcg@4: 0.883427	validation's ndcg@5: 0.883427


In [None]:
#Step 8: Assesses the effectiveness of the ranking model 
from sklearn.metrics import ndcg_score

def group_by_user(y_true, y_pred, user_ids):
    grouped_true, grouped_pred = [], []
    for uid in np.unique(user_ids):
        mask = user_ids == uid
        grouped_true.append(y_true[mask])
        grouped_pred.append(y_pred[mask])
    return grouped_true, grouped_pred

y_pred = ranker.predict(X_test)
true_groups, pred_groups = group_by_user(y_test, y_pred, test_user_ids)

ndcg_at_3 = np.mean([
    ndcg_score([true], [pred], k=3)
    for true, pred in zip(true_groups, pred_groups)
    if len(true) >= 3
])
print("✅ NDCG@3 Score:", ndcg_at_3)

✅ NDCG@3 Score: 0.6430903102557878


In [None]:
#save the model
model_path = "/content/drive/MyDrive/trained_lgbm_ranker_semantic.txt"
ranker.save_model(model_path)
print("✅ Model saved to:", model_path)

✅ Model saved to: /content/drive/MyDrive/trained_lgbm_ranker_semantic.txt


In [35]:
import joblib
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import lightgbm as lgb

# Load saved components
model = lgb.Booster(model_file="/content/drive/MyDrive/trained_lgbm_ranker_semantic.txt")
budget_encoder = joblib.load("/content/drive/MyDrive/budget_encoder.pkl")
style_encoder = joblib.load("/content/drive/MyDrive/style_encoder.pkl")
scaler = joblib.load("/content/drive/MyDrive/feature_scaler.pkl")
sbert = SentenceTransformer('all-MiniLM-L6-v2')

# Load group dataset
group_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Group_Dataset.csv")
group_df['Group_Interest'] = group_df['Group_Interest'].fillna('').apply(lambda x: [i.strip() for i in x.split(',')])
group_df['Destinations_Planned'] = group_df['Destinations_Planned'].fillna('').apply(lambda x: [i.strip() for i in x.split(',')])
def recommend_groups_for_user(age, budget, interests, preferred_destinations=None, travel_style="Unknown"):
    # Clean up destinations list
    preferred_destinations = preferred_destinations if preferred_destinations else []

    # Compute user embeddings
    user_interest_text = ", ".join(interests)
    user_interest_emb = sbert.encode([user_interest_text])[0]

    if preferred_destinations:
        user_dest_text = ", ".join(preferred_destinations)
        user_dest_emb = sbert.encode([user_dest_text])[0]
    else:
        user_dest_emb = np.zeros(384)  # Fallback for destination

    # Group embeddings
    group_interest_embs = sbert.encode(group_df['Group_Interest'].apply(lambda x: ', '.join(x)).tolist())
    group_dest_embs = sbert.encode(group_df['Destinations_Planned'].apply(lambda x: ', '.join(x)).tolist())

    # Similarity scores
    interest_sims = np.dot(group_interest_embs, user_interest_emb) / (
        np.linalg.norm(group_interest_embs, axis=1) * np.linalg.norm(user_interest_emb)
    )

    dest_sims = np.dot(group_dest_embs, user_dest_emb) / (
        np.linalg.norm(group_dest_embs, axis=1) * np.linalg.norm(user_dest_emb)
    ) if preferred_destinations else np.zeros(len(group_df))

    # Encode other features
    budget_encoded = budget_encoder.transform([[budget]])
    style_encoded = style_encoder.transform([[travel_style]])  # No travel style from user
    age_scaled = np.repeat(scaler.transform([[age]])[:, 0], len(group_df))
    members_scaled = scaler.transform(group_df[['Current_Members']])[:, 0]

    # Final feature matrix
    X_new = np.hstack([
        age_scaled.reshape(-1, 1),
        np.repeat(budget_encoded, len(group_df), axis=0),
        np.repeat(style_encoded, len(group_df), axis=0),
        interest_sims.reshape(-1, 1),
        dest_sims.reshape(-1, 1),
        members_scaled.reshape(-1, 1)
    ])

    # Predict scores
    scores = model.predict(X_new)
    group_df['Score'] = scores
    top_3 = group_df.sort_values(by='Score', ascending=False).head(3)

    return top_3[['Group_ID', 'Group_Interest', 'Destinations_Planned', 'Current_Members', 'Score']]


In [47]:
recommend_groups_for_user(
    age=23,
    budget="High",
    interests=["Hiking", "Nature"],
    travel_style="Backpacking"
)




Unnamed: 0,Group_ID,Group_Interest,Destinations_Planned,Current_Members,Score
19,G0020,"[Nature Walks, Surfing]","[Jaffna Fort, Polonnaruwa, Dagoba of Thuparama...",8,0.151932
118,G0119,"[Sightseeing, Wildlife]","[Mount Lavinia Beach, National Zoological Gard...",9,0.151932
74,G0075,"[Meditation, Hiking]","[Negombo Beach, Abhayagiri Dagaba, Mount Lavin...",6,0.151932
