# Quality-Focused Influencer Recommender System

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
print("Libraries loaded!")

Libraries loaded!


In [2]:
# Load ALL data including group filters
import os

# Directory holding the CSV exports. Set the FJ_DATA_PATH environment variable to
# point somewhere else; the original location stays as the fallback so existing
# setups keep working unchanged.
DATA_PATH = os.environ.get('FJ_DATA_PATH', '/home/vlad/Work/fj-recommendations/Mydata/data/')
# File names are appended directly to DATA_PATH below, so it must end with a separator.
if not DATA_PATH.endswith(('/', os.sep)):
    DATA_PATH += os.sep

# Core data
campaigns = pd.read_csv(f'{DATA_PATH}Campaigns.csv')
briefs = pd.read_csv(f'{DATA_PATH}Briefs.csv')
influencers = pd.read_csv(f'{DATA_PATH}Influencers.csv')
interactions = pd.read_csv(f'{DATA_PATH}Interactions.csv')

# Group filter data (targeting rules attached to campaign groups, plus the
# user attributes those rules are matched against)
groups = pd.read_csv(f'{DATA_PATH}Groups.csv')
user_info = pd.read_csv(f'{DATA_PATH}User_Info.csv')
group_locations = pd.read_csv(f'{DATA_PATH}Group_Locations.csv')
group_categories = pd.read_csv(f'{DATA_PATH}Group_Categories.csv')
group_creator_prefs = pd.read_csv(f'{DATA_PATH}Group_Creator_Preferences.csv')
creator_prefs = pd.read_csv(f'{DATA_PATH}Creator_Preferences.csv')
user_categories = pd.read_csv(f'{DATA_PATH}User_Categories.csv')

print(f"Loaded: {len(campaigns)} campaigns, {len(influencers)} influencers, {len(interactions)} interactions")
print(f"Groups: {len(groups)}, User_Info: {len(user_info)}, User_Categories: {len(user_categories)}")

Loaded: 2383 campaigns, 63932 influencers, 366352 interactions
Groups: 2614, User_Info: 53433, User_Categories: 133947


## Calculate Quality Score

In [3]:
# Calculate quality score for each influencer
def calculate_quality_score(df):
    """
    Add a 0-100 `quality_score` column, plus the intermediate columns it is built from.

    The score is a weighted sum of four components, each scaled to [0, 1] by
    the 99th percentile of its own distribution:
      - engagement            (weight 40)
      - log1p(followers)      (weight 25)
      - log1p(avg_likes)      (weight 20)
      - log1p(avg_comments)   (weight 15)

    Parameters
    ----------
    df : DataFrame with numeric `engagement`, `followers`, `avg_likes` and
         `avg_comments` columns. Missing values are allowed and count as 0.

    Returns
    -------
    A copy of `df` (the input is not modified) with the added columns
    eng_norm, foll_log, foll_norm, likes_log, likes_norm, comments_log,
    comments_norm and quality_score. quality_score is never NaN.
    """
    result = df.copy()

    def _scaled(values, cap):
        # A zero or undefined cap means the metric has no spread to scale by
        # (e.g. every follower count is 0). Dividing would give 0/0 = NaN and
        # poison quality_score, so the component contributes nothing instead.
        if pd.isna(cap) or cap <= 0:
            return pd.Series(0.0, index=values.index)
        return (values / cap).clip(0, 1).fillna(0)

    def _log_count(column):
        # Counts cannot be negative; clipping keeps log1p away from its
        # undefined region (values <= -1) if bad rows slip through.
        return np.log1p(result[column].fillna(0).clip(lower=0))

    # Engagement (40%)
    eng_cap = result['engagement'].quantile(0.99)
    result['eng_norm'] = _scaled(result['engagement'].clip(lower=0), eng_cap)

    # Followers (25%) - log scale
    result['foll_log'] = _log_count('followers')
    result['foll_norm'] = _scaled(result['foll_log'], result['foll_log'].quantile(0.99))

    # Avg likes (20%) - log scale
    result['likes_log'] = _log_count('avg_likes')
    result['likes_norm'] = _scaled(result['likes_log'], result['likes_log'].quantile(0.99))

    # Avg comments (15%) - log scale
    result['comments_log'] = _log_count('avg_comments')
    result['comments_norm'] = _scaled(result['comments_log'], result['comments_log'].quantile(0.99))

    # Weighted score (0-100)
    result['quality_score'] = (
        result['eng_norm'] * 40 +
        result['foll_norm'] * 25 +
        result['likes_norm'] * 20 +
        result['comments_norm'] * 15
    )
    return result

# Score every influencer once; later cells read `influencers_scored`.
influencers_scored = influencers.pipe(calculate_quality_score)
print("Quality score calculated")
print(f"Influencers with user_id: {influencers_scored['user_id'].notna().sum()}")

Quality score calculated
Influencers with user_id: 38101


In [4]:
# Define group filter function
import json
from datetime import datetime

def _parse_age_range(label):
    """
    Turn an age label such as '18-24' or '55+' into (min_age, max_age).

    max_age is None for open-ended 'N+' labels. Returns None when the label is
    not a string or does not follow either pattern.
    """
    if not isinstance(label, str):
        return None
    text = label.strip()
    try:
        if text.endswith('+'):
            return int(text[:-1]), None
        if '-' in text:
            low, high = (int(part) for part in text.split('-'))
            return low, high
    except ValueError:
        # Non-numeric bounds or more than two parts.
        return None
    return None


def _users_with_age():
    """Return the user_info rows that have a birthday, with an `age` column (whole years) added."""
    dated = user_info[user_info['birthday'].notna()].copy()
    dated['birthday'] = pd.to_datetime(dated['birthday'], errors='coerce')
    # Years are approximated as 365-day blocks; unparseable birthdays become
    # NaT, get a NaN age, and therefore never match an age range.
    dated['age'] = (datetime.now() - dated['birthday']).dt.days // 365
    return dated


def get_eligible_influencers(campaign_id):
    """
    Get influencers that pass all group filters for a campaign.

    A campaign may have several groups. An influencer is eligible when they
    pass every filter of at least one group (filters are AND-ed inside a
    group, groups are OR-ed). The filters, applied in order, are: tier,
    country, location (region / city), category, gender and age.

    Parameters
    ----------
    campaign_id : id of the campaign whose groups define the filters.

    Returns
    -------
    List of eligible influencer ids, in no particular order. When the campaign
    has no groups, every influencer id is returned. Otherwise only influencers
    linked to a user (non-null `user_id`) can be eligible.
    """
    camp_groups = groups[groups['campaign_id'] == campaign_id]

    # No groups means no targeting rules: everyone is eligible.
    if len(camp_groups) == 0:
        return influencers_scored['id'].tolist()

    # Most filters are keyed on user_id, so only linked influencers are considered.
    linked = influencers_scored[influencers_scored['user_id'].notna()]
    gender_map = {'Women': 2, 'Men': 1, 'Other': 0}
    users_with_age = None  # built lazily, at most once per call
    eligible_ids = set()

    for _, group in camp_groups.iterrows():
        group_id = group['id']
        candidates = linked

        # 1. TIER FILTER
        if pd.notna(group['creators_tiers']):
            try:
                tiers = json.loads(group['creators_tiers'])
                candidates = candidates[candidates['tier_level'].isin(tiers)]
            except (TypeError, ValueError):
                # Malformed JSON (JSONDecodeError is a ValueError) or a value
                # that is not a list: treat the group as having no tier
                # restriction rather than failing the whole lookup.
                pass

        if len(candidates) == 0:
            continue

        # 2. COUNTRY FILTER
        if pd.notna(group['country_id']):
            country_users = set(user_info[user_info['country_id'] == group['country_id']]['user_id'])
            candidates = candidates[candidates['user_id'].isin(country_users)]

        if len(candidates) == 0:
            continue

        # 3. LOCATION FILTER - rows without a region are ignored
        grp_locations = group_locations[group_locations['group_id'] == group_id]
        valid_locations = grp_locations[grp_locations['region_id'].notna()]
        if len(valid_locations) > 0:
            location_users = set()
            for _, loc in valid_locations.iterrows():
                in_location = user_info['region_id'] == loc['region_id']
                # A missing city means "anywhere in the region".
                if pd.notna(loc['city_id']):
                    in_location &= user_info['city_id'] == loc['city_id']
                location_users.update(user_info.loc[in_location, 'user_id'])
            candidates = candidates[candidates['user_id'].isin(location_users)]

        if len(candidates) == 0:
            continue

        # 4. CATEGORY FILTER
        grp_cats = group_categories[group_categories['group_id'] == group_id]['category_id'].tolist()
        if len(grp_cats) > 0:
            cat_users = set(user_categories[user_categories['category_id'].isin(grp_cats)]['user_id'])
            candidates = candidates[candidates['user_id'].isin(cat_users)]

        if len(candidates) == 0:
            continue

        # Creator preferences attached to this group; type 2 rows are gender
        # preferences and type 1 rows are age ranges.
        grp_prefs = group_creator_prefs[group_creator_prefs['group_id'] == group_id]
        pref_ids = grp_prefs['creator_preference_id'].tolist()
        group_prefs = creator_prefs[creator_prefs['id'].isin(pref_ids)]

        # 5. GENDER FILTER
        gender_prefs = group_prefs.loc[group_prefs['creator_preference_type_id'] == 2, 'name'].tolist()
        gender_values = [gender_map[g] for g in gender_prefs if g in gender_map]
        if gender_values:
            gender_users = set(user_info[user_info['gender'].isin(gender_values)]['user_id'])
            candidates = candidates[candidates['user_id'].isin(gender_users)]

        if len(candidates) == 0:
            continue

        # 6. AGE FILTER
        age_prefs = group_prefs.loc[group_prefs['creator_preference_type_id'] == 1, 'name'].tolist()
        if len(age_prefs) > 0:
            if users_with_age is None:
                users_with_age = _users_with_age()

            age_users = set()
            any_range_recognised = False
            for age_range in age_prefs:
                bounds = _parse_age_range(age_range)
                if bounds is None:
                    continue
                any_range_recognised = True
                min_age, max_age = bounds
                in_range = users_with_age['age'] >= min_age
                if max_age is not None:
                    in_range &= users_with_age['age'] <= max_age
                age_users.update(users_with_age.loc[in_range, 'user_id'])

            # Apply the filter whenever at least one range was understood, even
            # if nobody matched it: a restriction nobody satisfies must yield
            # no candidates, not all of them. Labels that cannot be parsed are
            # the only case where the filter is skipped.
            if any_range_recognised:
                candidates = candidates[candidates['user_id'].isin(age_users)]

        eligible_ids.update(candidates['id'].tolist())

    return list(eligible_ids)

# Smoke test: run the group filters for one known campaign
eligible = get_eligible_influencers(campaign_id=2817)
print(f"get_eligible_influencers defined. Test campaign 2817: {len(eligible)} eligible")

get_eligible_influencers defined. Test campaign 2817: 7476 eligible


## Prepare Training Data

In [5]:
# Binary target: an interaction counts as accepted when its status equals 2
interactions['accepted'] = interactions['status'].eq(2).astype(int)

# Influencer attributes carried onto every interaction row
influencer_feature_cols = [
    'id', 'network_id', 'followers', 'follows', 'engagement',
    'avg_likes', 'avg_comments', 'posts', 'reach', 'impressions',
    'tier_level', 'quality_score',
]

# Inner join: interactions whose creator is missing from the influencer table are dropped
df = pd.merge(
    interactions,
    influencers_scored[influencer_feature_cols],
    how='inner',
    left_on='creator_id',
    right_on='id',
)
print(f"Interactions with influencers: {len(df)}")

Interactions with influencers: 366352


In [6]:
# Campaign attributes, renamed so they cannot collide with influencer / brief columns
camp_cols = (
    campaigns[['id', 'type_id', 'description', 'private', 'pre_approve']]
    .rename(columns={'id': 'camp_id', 'description': 'campaign_description'})
)
df = df.merge(camp_cols, how='left', left_on='campaign_id', right_on='camp_id')

# Brief text, joined the same way
brief_cols = (
    briefs[['id', 'description']]
    .rename(columns={'id': 'b_id', 'description': 'brief_description'})
)
df = df.merge(brief_cols, how='left', left_on='brief_id', right_on='b_id')

print(f"After merging: {len(df)} rows")

After merging: 366352 rows


In [7]:
# Feature groups; both lists are reused by later cells
numeric_cols = ['followers', 'follows', 'engagement', 'avg_likes', 'avg_comments',
                'posts', 'reach', 'impressions', 'quality_score']
cat_cols = ['network_id', 'type_id', 'private', 'pre_approve', 'tier_level']

# Numeric gaps become 0
for name in [c for c in numeric_cols if c in df.columns]:
    df[name] = df[name].fillna(0)

# Categorical gaps become the sentinel -1, then the column is cast to int
for name in [c for c in cat_cols if c in df.columns]:
    df[name] = df[name].fillna(-1).astype(int)

print("Missing values filled")

Missing values filled


In [8]:
# TF-IDF on unique briefs (memory efficient): vectorise each brief once, then
# join the vectors back onto the interaction rows.
df['brief_description'] = df['brief_description'].fillna('')
df['campaign_description'] = df['campaign_description'].fillna('')

unique_briefs = df[['brief_id', 'campaign_description', 'brief_description']].drop_duplicates('brief_id').copy()
unique_briefs['text'] = unique_briefs['campaign_description'] + ' ' + unique_briefs['brief_description']

tfidf = TfidfVectorizer(max_features=50, stop_words='english')
tfidf_matrix = tfidf.fit_transform(unique_briefs['text'])

# max_features is only an upper bound: a small vocabulary yields fewer columns,
# so the column names are derived from the matrix instead of assuming 50.
tfidf_cols = [f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])]
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_cols)
tfidf_df['brief_id'] = unique_briefs['brief_id'].values

# Drop TF-IDF columns left over from a previous run of this cell; merging again
# would otherwise create tfidf_*_x / tfidf_*_y duplicates.
stale_tfidf_cols = [c for c in df.columns if str(c).startswith('tfidf_')]
df = df.drop(columns=stale_tfidf_cols).merge(tfidf_df, on='brief_id', how='left')
for col in tfidf_cols:
    df[col] = df[col].fillna(0)
print("TF-IDF features added")

TF-IDF features added


In [9]:
# Historical stats
# NOTE(review): both tables are aggregated over the full `interactions` frame and
# merged back onto those same rows, so each row's stats include its own `accepted`
# label. Treat metrics computed from these features with that in mind.
creator_stats = (
    interactions.groupby('creator_id')['accepted']
    .agg(total_accepted='sum', total_interactions='count', acceptance_rate='mean')
    .reset_index()
)

campaign_stats = (
    interactions.groupby('campaign_id')['accepted']
    .agg(camp_accepted='sum', camp_interactions='count', camp_acceptance_rate='mean')
    .reset_index()
)

df = df.merge(creator_stats, how='left', on='creator_id')
df = df.merge(campaign_stats, how='left', on='campaign_id')

history_cols = ['acceptance_rate', 'total_accepted', 'total_interactions',
                'camp_acceptance_rate', 'camp_accepted', 'camp_interactions']
for name in history_cols:
    df[name] = df[name].fillna(0)
print(f"Dataset ready: {df.shape}")

Dataset ready: (366352, 87)


## Train Model

In [10]:
# Assemble the feature list and make a stratified random 80/20 split
tfidf_feats = [f'tfidf_{i}' for i in range(50)]
hist_feats = ['acceptance_rate', 'total_accepted', 'total_interactions',
              'camp_acceptance_rate', 'camp_accepted', 'camp_interactions']

# Keep only the requested features that actually exist in the frame
requested_features = numeric_cols + cat_cols + tfidf_feats + hist_feats
all_features = [name for name in requested_features if name in df.columns]

X = df[all_features].fillna(0)
y = df['accepted']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"Train: {len(X_train)}, Test: {len(X_test)}")

Train: 293081, Test: 73271


## Campaign-Aware Recommendation Functions

Key insight: Different campaigns target different networks (Instagram vs TikTok) and have zero overlap in successful influencers!

## Proper Evaluation: Train on Old Campaigns, Test on New

We'll:
1. Train on campaigns created before 1 April 2024
2. Test on campaigns created on or after 1 April 2024
3. For each test campaign, recommend top N influencers
4. Measure how many of our recommendations match the actual accepted influencers

In [14]:
# Temporal split: campaigns created before SPLIT_DATE train, the rest test
campaigns['created_at'] = pd.to_datetime(campaigns['created_at'])
SPLIT_DATE = '2024-04-01'

created_before_split = campaigns['created_at'] < SPLIT_DATE
created_from_split = campaigns['created_at'] >= SPLIT_DATE
train_campaigns = campaigns.loc[created_before_split, 'id'].tolist()
test_campaigns = campaigns.loc[created_from_split, 'id'].tolist()

# Interactions follow the campaign they belong to
in_train = interactions['campaign_id'].isin(train_campaigns)
in_test = interactions['campaign_id'].isin(test_campaigns)
train_interactions = interactions[in_train].copy()
test_interactions = interactions[in_test].copy()

print(f"Training campaigns: {len(train_campaigns)}")
print(f"Test campaigns: {len(test_campaigns)}")
print(f"Training interactions: {len(train_interactions)}")
print(f"Test interactions: {len(test_interactions)}")

# Evaluate only on test campaigns that have at least 5 accepted influencers
accepted_test_rows = test_interactions[test_interactions['accepted'] == 1]
test_accepted = accepted_test_rows.groupby('campaign_id').size()
test_camps_with_accepted = test_accepted[test_accepted >= 5].index.tolist()
print(f"\nTest campaigns with 5+ accepted influencers: {len(test_camps_with_accepted)}")

Training campaigns: 1029
Test campaigns: 1354
Training interactions: 139192
Test interactions: 227160

Test campaigns with 5+ accepted influencers: 677


In [15]:
# Build the training frame from training-period campaigns only.
# The historical stats are aggregated over train_interactions, not the full log.
train_creator_stats = (
    train_interactions.groupby('creator_id')['accepted']
    .agg(total_accepted='sum', total_interactions='count', acceptance_rate='mean')
    .reset_index()
)

train_campaign_stats = (
    train_interactions.groupby('campaign_id')['accepted']
    .agg(camp_accepted='sum', camp_interactions='count', camp_acceptance_rate='mean')
    .reset_index()
)

# Influencer attributes come from influencers_scored
train_influencer_cols = [
    'id', 'network_id', 'followers', 'follows', 'engagement',
    'avg_likes', 'avg_comments', 'posts', 'reach', 'impressions',
    'tier_level', 'quality_score', 'biography',
]
train_df = pd.merge(
    train_interactions,
    influencers_scored[train_influencer_cols],
    how='inner',
    left_on='creator_id',
    right_on='id',
)

# Campaign attributes; the campaign's own network is kept apart as `camp_network`
camp_cols = (
    campaigns[['id', 'type_id', 'description', 'private', 'pre_approve', 'network_id']]
    .rename(columns={'id': 'camp_id', 'description': 'campaign_description', 'network_id': 'camp_network'})
)
train_df = train_df.merge(camp_cols, how='left', left_on='campaign_id', right_on='camp_id')

# Brief text
brief_cols = (
    briefs[['id', 'description']]
    .rename(columns={'id': 'b_id', 'description': 'brief_description'})
)
train_df = train_df.merge(brief_cols, how='left', left_on='brief_id', right_on='b_id')

print(f"Training data: {len(train_df)} rows")

Training data: 139192 rows


In [16]:
# Prepare features for training
train_df['brief_description'] = train_df['brief_description'].fillna('')
train_df['campaign_description'] = train_df['campaign_description'].fillna('')

# Fill numeric/categorical
for col in numeric_cols:
    if col in train_df.columns:
        train_df[col] = train_df[col].fillna(0)
for col in cat_cols:
    if col in train_df.columns:
        train_df[col] = train_df[col].fillna(-1).astype(int)

# TF-IDF on training briefs only, so the vocabulary is learned without test-period text
train_unique_briefs = train_df[['brief_id', 'campaign_description', 'brief_description']].drop_duplicates('brief_id').copy()
train_unique_briefs['text'] = train_unique_briefs['campaign_description'] + ' ' + train_unique_briefs['brief_description']

train_tfidf = TfidfVectorizer(max_features=50, stop_words='english')
train_tfidf_matrix = train_tfidf.fit_transform(train_unique_briefs['text'])

# max_features is only an upper bound: a small vocabulary yields fewer columns,
# so the column names are derived from the matrix instead of assuming 50.
train_tfidf_cols = [f'tfidf_{i}' for i in range(train_tfidf_matrix.shape[1])]
tfidf_df = pd.DataFrame(train_tfidf_matrix.toarray(), columns=train_tfidf_cols)
tfidf_df['brief_id'] = train_unique_briefs['brief_id'].values

# Columns this cell adds. Any copies left from a previous run are dropped first;
# merging twice would otherwise create *_x / *_y duplicates and break the
# fillna loops below.
train_hist_cols = ['acceptance_rate', 'total_accepted', 'total_interactions',
                   'camp_acceptance_rate', 'camp_accepted', 'camp_interactions']
stale_cols = [c for c in train_df.columns
              if str(c).startswith('tfidf_') or c in train_hist_cols]
train_df = train_df.drop(columns=stale_cols)

train_df = train_df.merge(tfidf_df, on='brief_id', how='left')
for col in train_tfidf_cols:
    train_df[col] = train_df[col].fillna(0)

# Add historical stats
train_df = train_df.merge(train_creator_stats, on='creator_id', how='left')
train_df = train_df.merge(train_campaign_stats, on='campaign_id', how='left')
for col in train_hist_cols:
    train_df[col] = train_df[col].fillna(0)

print(f"Training features prepared: {train_df.shape}")

Training features prepared: (139192, 89)


In [17]:
# Fit the LightGBM classifier on the training-period frame only
tfidf_feats = [f'tfidf_{i}' for i in range(50)]
hist_feats = ['acceptance_rate', 'total_accepted', 'total_interactions',
              'camp_acceptance_rate', 'camp_accepted', 'camp_interactions']

# Keep only the requested features that exist in train_df
requested_train_features = numeric_cols + cat_cols + tfidf_feats + hist_feats
train_features = [name for name in requested_train_features if name in train_df.columns]

X_train = train_df[train_features].fillna(0)
y_train = train_df['accepted']

print(f"Training on {len(X_train)} samples with {len(train_features)} features")

# Binary objective, 200 boosting rounds
params = dict(
    objective='binary',
    metric='auc',
    verbosity=-1,
    num_leaves=31,
    learning_rate=0.05,
    is_unbalance=True,
)
train_data = lgb.Dataset(X_train, label=y_train)

eval_model = lgb.train(params, train_data, num_boost_round=200)
print("Model trained on historical data only!")

Training on 139192 samples with 70 features
Model trained on historical data only!


In [18]:
# IMPROVED: History-based recommendations with recency weighting
# This approach achieves ~17-26% recall (vs 3% for the LightGBM model)

# Recency is measured against the newest interaction in the training window
train_interactions['created_at'] = pd.to_datetime(train_interactions['created_at'])
max_train_date = train_interactions['created_at'].max()

# Exponential decay with a 180-day time constant: an interaction 180 days older
# than the newest one weighs 1/e (about 0.37). The corresponding half-life is
# 180 * ln(2), roughly 125 days.
interaction_age_days = (max_train_date - train_interactions['created_at']).dt.days
train_interactions['days_ago'] = interaction_age_days
train_interactions['recency_weight'] = np.exp(-interaction_age_days / 180)

# Per creator: sum of the recency weights of their accepted interactions
train_accepted = train_interactions[train_interactions['accepted'] == 1].copy()
recency_scores = (
    train_accepted.groupby('creator_id')['recency_weight'].sum()
    .rename('recency_score')
    .reset_index()
)

# Plain counts alongside the weighted score; creators never accepted get score 0
creator_history = (
    train_interactions.groupby('creator_id')['accepted']
    .agg(times_accepted='sum', times_applied='count', acceptance_rate='mean')
    .reset_index()
    .merge(recency_scores, how='left', on='creator_id')
)
creator_history['recency_score'] = creator_history['recency_score'].fillna(0)

print(f"Creator history calculated: {len(creator_history)} creators")
print(f"Creators with 1+ acceptance: {(creator_history['times_accepted'] > 0).sum()}")

def get_recommendations_history(campaign_id, top_n=50):
    """
    History-based recommendations using recency-weighted acceptance count.
    Key insight: Past acceptance is the strongest predictor of future acceptance.

    Parameters
    ----------
    campaign_id : id of the campaign to recommend for.
    top_n : maximum number of influencer ids to return.

    Returns
    -------
    Up to `top_n` influencer ids, highest `recency_score` first, restricted to
    influencers that pass the campaign's group filters and are on the
    campaign's network. Returns None when the campaign id is unknown or the
    filtered pool is empty.
    """
    matching_campaigns = campaigns[campaigns['id'] == campaign_id]
    if len(matching_campaigns) == 0:
        # Unknown campaign: use the same "nothing to recommend" signal as an
        # empty pool instead of failing with an IndexError on .iloc[0].
        return None
    network = matching_campaigns.iloc[0]['network_id']

    # Get eligible influencers from group filters
    eligible = get_eligible_influencers(campaign_id)

    # Filter to eligible + correct network
    pool = influencers_scored[
        (influencers_scored['id'].isin(eligible)) &
        (influencers_scored['network_id'] == network)
    ]

    if len(pool) == 0:
        return None

    # Attach only the score that is ranked on; influencers with no training
    # history get 0.
    pool = pool[['id']].merge(
        creator_history[['creator_id', 'recency_score']],
        left_on='id', right_on='creator_id', how='left'
    )
    pool['score'] = pool['recency_score'].fillna(0)

    return pool.nlargest(top_n, 'score')['id'].tolist()

print("History-based recommendation function defined!")

Creator history calculated: 3548 creators
Creators with 1+ acceptance: 2045
History-based recommendation function defined!


## Feature Importance Analysis & Prioritized Model Training

Based on data analysis, here's what actually predicts acceptance:

| Feature | Correlation | Insight |
|---------|-------------|---------|
| **acceptance_rate** | +0.77 | BY FAR the strongest predictor |
| **times_accepted** | +0.23 | Past success predicts future |
| **impressions** | +0.04 | Weak positive |
| **times_applied** | -0.16 | Negative! Spammy applicants rejected |
| **followers** | -0.03 | Slightly negative (surprising!) |
| **engagement** | ~0 | No correlation |

## Collaborative Filtering: Similar Campaigns Approach

**Key Idea**: Find campaigns similar to the target campaign, then recommend influencers who were accepted in those similar campaigns.

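This addresses the cold-start problem for new campaigns, which have no interaction history of their own, by leveraging the "wisdom of similar campaigns" rather than individual influencer history. It can only surface influencers who were already accepted in at least one similar campaign.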

In [None]:
# Build campaign similarity matrix using multiple signals
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import scipy.sparse as sp

print("Building campaign similarity index...")

# 1. Get campaign features for similarity
camp_features = campaigns[['id', 'network_id', 'type_id', 'description']].copy()
camp_features['description'] = camp_features['description'].fillna('')

# 2. Add group-based features (what kind of influencers they target)
camp_group_features = []
for camp_id in campaigns['id']:
    camp_groups_df = groups[groups['campaign_id'] == camp_id]

    # Extract targeting info across all of the campaign's groups
    tiers = []
    country_ids = []
    for _, g in camp_groups_df.iterrows():
        if pd.notna(g['creators_tiers']):
            try:
                tiers.extend(json.loads(g['creators_tiers']))
            except (TypeError, ValueError):
                # Malformed JSON (JSONDecodeError is a ValueError) or a value
                # that is not a list: this group contributes no tier info.
                pass
        if pd.notna(g['country_id']):
            country_ids.append(g['country_id'])

    # Get categories for this campaign's groups
    group_ids = camp_groups_df['id'].tolist()
    cats = group_categories[group_categories['group_id'].isin(group_ids)]['category_id'].tolist()

    camp_group_features.append({
        'campaign_id': camp_id,
        'has_tier_1': 1 in tiers,
        'has_tier_2': 2 in tiers,
        'has_tier_3': 3 in tiers,
        'has_tier_4': 4 in tiers,
        'num_categories': len(set(cats)),
        # First country seen among the groups; -1 when no group sets one
        'primary_country': country_ids[0] if country_ids else -1
    })

camp_group_df = pd.DataFrame(camp_group_features)
camp_features = camp_features.merge(camp_group_df, left_on='id', right_on='campaign_id', how='left')

# 3. TF-IDF on campaign descriptions
camp_tfidf = TfidfVectorizer(max_features=100, stop_words='english')
camp_tfidf_matrix = camp_tfidf.fit_transform(camp_features['description'])

# 4. Combine: categorical features + text similarity
cat_features = camp_features[['network_id', 'type_id', 'has_tier_1', 'has_tier_2',
                               'has_tier_3', 'has_tier_4', 'num_categories', 'primary_country']].fillna(0)

# One-hot encode categorical, then standardise every column
cat_encoded = pd.get_dummies(cat_features, columns=['network_id', 'type_id', 'primary_country'])
scaler = StandardScaler()
cat_scaled = scaler.fit_transform(cat_encoded)

# Combine text + categorical (weight text more heavily for content similarity)
combined_features = sp.hstack([
    camp_tfidf_matrix * 2.0,  # Text features weighted 2x
    sp.csr_matrix(cat_scaled)
])

# Compute similarity matrix (one row and one column per campaign)
print("Computing campaign similarity matrix...")
campaign_similarity = cosine_similarity(combined_features)

# Create lookup: campaign_id -> row index in the matrix, and the reverse
camp_id_to_idx = {cid: idx for idx, cid in enumerate(camp_features['id'])}
idx_to_camp_id = {idx: cid for cid, idx in camp_id_to_idx.items()}

print(f"Campaign similarity matrix: {campaign_similarity.shape}")
print("Done!")

Building campaign similarity index...


In [None]:
# Collaborative filtering recommendation function
def get_similar_campaigns(campaign_id, top_k=20, exclude_future=True, reference_date=None):
    """
    Get most similar campaigns to the target campaign.

    Parameters
    ----------
    campaign_id : id of the target campaign.
    top_k : maximum number of similar campaigns to return.
    exclude_future : when True and `reference_date` is given, skip campaigns
        created on or after `reference_date` (for fair evaluation).
    reference_date : cut-off compared against each campaign's `created_at`.

    Returns
    -------
    List of {'campaign_id': ..., 'similarity': ...} dicts, most similar first,
    never containing the target itself. Empty when the campaign is not in the
    similarity index.
    """
    if campaign_id not in camp_id_to_idx:
        return []

    target_idx = camp_id_to_idx[campaign_id]
    similarities = np.asarray(campaign_similarity[target_idx])

    filter_by_date = exclude_future and reference_date is not None
    if filter_by_date:
        # One id -> created_at lookup per call instead of a frame scan per candidate.
        first_rows = campaigns.drop_duplicates('id')
        created_at = dict(zip(first_rows['id'], first_rows['created_at']))

    similar_camps = []
    # Walk every campaign from most to least similar. A fixed-size window could
    # be used up entirely by future campaigns and miss valid past ones; the
    # stable sort keeps tied campaigns in a deterministic order.
    for sim_idx in np.argsort(-similarities, kind='stable'):
        if len(similar_camps) >= top_k:
            break

        # Skip the target explicitly. With tied similarities (e.g. a duplicate
        # campaign) the target is not guaranteed to sort first.
        if sim_idx == target_idx:
            continue

        sim_camp_id = idx_to_camp_id[sim_idx]

        # Optionally exclude future campaigns (for fair evaluation)
        if filter_by_date and created_at[sim_camp_id] >= reference_date:
            continue

        similar_camps.append({
            'campaign_id': sim_camp_id,
            'similarity': similarities[sim_idx]
        })

    return similar_camps


def get_recommendations_collaborative(campaign_id, top_n=100, num_similar_camps=15):
    """
    Collaborative filtering: recommend influencers accepted in similar campaigns.

    Algorithm:
    1. Find top K similar campaigns created before the target campaign
    2. Get influencers accepted in those campaigns
    3. Score each influencer by the sum of the similarities of the similar
       campaigns they were accepted in
    4. Keep influencers eligible for the target campaign and on its network
    5. Return top N

    Parameters
    ----------
    campaign_id : id of the campaign to recommend for.
    top_n : maximum number of influencer ids to return.
    num_similar_camps : how many similar campaigns to draw acceptances from.

    Returns
    -------
    Up to `top_n` influencer ids, highest score first, or None when the
    campaign id is unknown, no similar campaign exists, nobody was accepted in
    the similar campaigns, or no scored influencer is eligible.
    """
    matching_campaigns = campaigns[campaigns['id'] == campaign_id]
    if len(matching_campaigns) == 0:
        # Unknown campaign: use the same "nothing to recommend" signal as the
        # other empty cases instead of failing with an IndexError on .iloc[0].
        return None
    camp = matching_campaigns.iloc[0]
    network = camp['network_id']
    camp_date = camp['created_at']

    # Step 1: Find similar campaigns (only past campaigns for fair eval)
    similar = get_similar_campaigns(campaign_id, top_k=num_similar_camps,
                                     exclude_future=True, reference_date=camp_date)

    if not similar:
        return None

    similarity_lookup = {s['campaign_id']: s['similarity'] for s in similar}

    # Step 2: Get influencers accepted in similar campaigns
    similar_accepted = interactions[
        (interactions['campaign_id'].isin(list(similarity_lookup))) &
        (interactions['accepted'] == 1)
    ]

    if len(similar_accepted) == 0:
        return None

    # Step 3: Score influencers by similarity-weighted acceptance count
    similarity_weight = similar_accepted['campaign_id'].map(similarity_lookup)
    influencer_scores = (
        similarity_weight.groupby(similar_accepted['creator_id']).sum()
        .rename('collab_score')
        .reset_index()
    )

    # Step 4: Filter by eligibility, network, and having a score at all
    eligible = get_eligible_influencers(campaign_id)

    pool = influencers_scored[
        (influencers_scored['id'].isin(eligible)) &
        (influencers_scored['network_id'] == network) &
        (influencers_scored['id'].isin(influencer_scores['creator_id']))
    ]

    if len(pool) == 0:
        return None

    # Step 5: attach the scores and rank
    pool = pool[['id']].merge(influencer_scores, left_on='id', right_on='creator_id', how='left')
    pool['collab_score'] = pool['collab_score'].fillna(0)

    return pool.nlargest(top_n, 'collab_score')['id'].tolist()


def get_recommendations_hybrid(campaign_id, top_n=100,
                                history_weight=0.5, collab_weight=0.5):
    """
    Hybrid approach: combine history-based + collaborative filtering.

    Each eligible influencer on the campaign's network gets two scores, both
    divided by the maximum within the pool:
    - History: recency-weighted acceptance score from `creator_history`
    - Collab: sum of similarities of the (past) similar campaigns the
      influencer was accepted in
    The final score is history_norm * history_weight + collab_norm * collab_weight.

    Parameters
    ----------
    campaign_id : id of the campaign to recommend for.
    top_n : maximum number of influencer ids to return.
    history_weight, collab_weight : weights of the two normalised scores.

    Returns
    -------
    Up to `top_n` influencer ids, highest hybrid score first, or None when the
    campaign id is unknown or the eligible pool is empty.
    """
    matching_campaigns = campaigns[campaigns['id'] == campaign_id]
    if len(matching_campaigns) == 0:
        # Unknown campaign: use the same "nothing to recommend" signal as an
        # empty pool instead of failing with an IndexError on .iloc[0].
        return None
    camp = matching_campaigns.iloc[0]
    network = camp['network_id']
    camp_date = camp['created_at']

    # Get eligible pool
    eligible = get_eligible_influencers(campaign_id)
    pool = influencers_scored[
        (influencers_scored['id'].isin(eligible)) &
        (influencers_scored['network_id'] == network)
    ]

    if len(pool) == 0:
        return None

    # Add history scores (0 for influencers without training history)
    pool = pool[['id']].merge(
        creator_history[['creator_id', 'recency_score']],
        left_on='id', right_on='creator_id', how='left'
    ).drop(columns='creator_id')
    pool['recency_score'] = pool['recency_score'].fillna(0)

    # Add collaborative scores. The column starts at 0 and is only replaced
    # when similar campaigns with acceptances exist.
    pool['collab_score'] = 0.0
    similar = get_similar_campaigns(campaign_id, top_k=15,
                                     exclude_future=True, reference_date=camp_date)

    if similar:
        similarity_lookup = {s['campaign_id']: s['similarity'] for s in similar}

        similar_accepted = interactions[
            (interactions['campaign_id'].isin(list(similarity_lookup))) &
            (interactions['accepted'] == 1)
        ]

        if len(similar_accepted) > 0:
            sim_weight = similar_accepted['campaign_id'].map(similarity_lookup)
            collab_by_creator = sim_weight.groupby(similar_accepted['creator_id']).sum()
            # Look scores up by influencer id, so the result does not depend
            # on how the pool happens to be indexed.
            pool['collab_score'] = pool['id'].map(collab_by_creator).fillna(0)

    def _max_normalised(scores):
        # Scale to 0-1 by the pool maximum; an all-zero column stays all zero.
        peak = scores.max()
        if peak > 0:
            return scores / peak
        return pd.Series(0.0, index=scores.index)

    pool['history_norm'] = _max_normalised(pool['recency_score'])
    pool['collab_norm'] = _max_normalised(pool['collab_score'])

    # Hybrid score
    pool['hybrid_score'] = (
        pool['history_norm'] * history_weight +
        pool['collab_norm'] * collab_weight
    )

    return pool.nlargest(top_n, 'hybrid_score')['id'].tolist()

print("Collaborative filtering functions defined!")

## LightFM: Hybrid Collaborative Filtering

LightFM is a hybrid matrix factorization model that:
1. **Learns latent embeddings** for campaigns and influencers
2. **Incorporates side features** (network, tier, followers, engagement) for cold-start
3. **Uses WARP loss** optimized for ranking (top-N recommendations)

**Note**: Requires Python 3.11 or earlier. Use: `conda create -n recommender python=3.11`

In [None]:
from lightfm import LightFM
from lightfm.data import Dataset
from scipy.sparse import csr_matrix

print("Building LightFM dataset...")

# Positive feedback only: accepted rows of the training split.
# (train_interactions is defined earlier in the notebook; the original note
# describes it as the pre-April-2024 window.)
accepted_mask = train_interactions['accepted'] == 1
train_accepted = train_interactions[accepted_mask].copy()

# ID universes registered with LightFM
all_campaigns = campaigns['id'].unique().tolist()
all_influencers = influencers_scored['id'].unique().tolist()

# In LightFM terminology campaigns play the "user" role and influencers the "item" role
dataset = Dataset()
dataset.fit(users=all_campaigns, items=all_influencers)

# mapping() yields (user ids, user features, item ids, item features)
id_mappings = dataset.mapping()
campaign_id_map, campaign_feature_map, influencer_id_map, influencer_feature_map = id_mappings

print(f"Campaigns: {len(campaign_id_map)}")
print(f"Influencers: {len(influencer_id_map)}")

# (campaign, influencer) pairs for every accepted training interaction
train_interactions_list = list(
    zip(train_accepted['campaign_id'], train_accepted['creator_id'])
)

# Drop pairs that reference a campaign or influencer the dataset does not know
valid_interactions = [
    pair for pair in train_interactions_list
    if pair[0] in campaign_id_map and pair[1] in influencer_id_map
]

print(f"Training interactions: {len(valid_interactions)}")

# Sparse interaction matrix plus the matching weights matrix
interactions_matrix, weights = dataset.build_interactions(valid_interactions)

print(f"Interactions matrix shape: {interactions_matrix.shape}")
print(f"Non-zero entries: {interactions_matrix.nnz}")

In [None]:
# Build ITEM (influencer) features for cold-start handling
print("Building influencer features...")


def _quantile_bin(values, n_bins):
    """Clip `values` to [0, 99th percentile] and return integer quantile-bin codes.

    Rows that end up without a bin (NaN after qcut) are assigned to bin 0.
    """
    upper_cap = values.quantile(0.99)
    codes = pd.qcut(values.clip(0, upper_cap), q=n_bins, labels=False, duplicates='drop')
    return codes.fillna(0).astype(int)


# Columns copied from the scored influencer table
feature_source_columns = [
    'id', 'network_id', 'tier_level', 'followers',
    'engagement', 'avg_likes', 'avg_comments',
]
inf_features = influencers_scored[feature_source_columns].copy()

# Continuous features become coarse ordinal bins so they can be used as tags
inf_features['followers_bin'] = _quantile_bin(inf_features['followers'], 10)
inf_features['engagement_bin'] = _quantile_bin(inf_features['engagement'], 5)

# Create feature tags
def get_influencer_features(row):
    """Build the list of categorical feature tags for one influencer row.

    Parameters
    ----------
    row : mapping-like (e.g. a row passed by ``DataFrame.apply(axis=1)``)
        Must provide ``network_id``, ``tier_level``, ``followers_bin`` and
        ``engagement_bin``.

    Returns
    -------
    list of str
        Four tags in fixed order: ``network:<n>``, ``tier:<n>``,
        ``followers_bin:<n>``, ``engagement_bin:<n>``. A missing (NaN/None)
        value yields ``<prefix>:unknown`` instead of raising.
    """
    def _tag(prefix, value):
        # Values may arrive as floats (e.g. 3.0) when the row has been upcast,
        # so cast to int to keep tag names stable; missing values get a
        # dedicated "unknown" tag rather than crashing int().
        if pd.notna(value):
            return f"{prefix}:{int(value)}"
        return f"{prefix}:unknown"

    return [
        _tag("network", row['network_id']),
        _tag("tier", row['tier_level']),
        _tag("followers_bin", row['followers_bin']),
        _tag("engagement_bin", row['engagement_bin']),
    ]

# One list of feature tags per influencer
inf_features['feature_list'] = inf_features.apply(get_influencer_features, axis=1)

# Vocabulary of every distinct tag that occurs
all_inf_feature_names = {
    tag
    for tag_list in inf_features['feature_list']
    for tag in tag_list
}

print(f"Unique influencer features: {len(all_inf_feature_names)}")

# A second Dataset that also registers the item-feature vocabulary
dataset_with_features = Dataset()
dataset_with_features.fit(
    users=all_campaigns,
    items=all_influencers,
    item_features=all_inf_feature_names,
)

# Same training pairs, rebuilt against the feature-aware dataset
interactions_matrix_f, weights_f = dataset_with_features.build_interactions(valid_interactions)

# (influencer id, tag list) for every influencer present in the id mapping
item_features_list = [
    (inf_id, tag_list)
    for inf_id, tag_list in zip(inf_features['id'], inf_features['feature_list'])
    if inf_id in influencer_id_map
]

item_features_matrix = dataset_with_features.build_item_features(item_features_list)

print(f"Item features matrix shape: {item_features_matrix.shape}")

In [None]:
# Train LightFM model
print("Training LightFM model...")
print("Using WARP loss (optimized for ranking)")

# Hybrid mode: item features are supplied at fit time below
lightfm_model = LightFM(
    loss='warp',           # Weighted Approximate-Rank Pairwise
    no_components=64,      # embedding dimension
    learning_rate=0.05,
    random_state=42,
)

# One fit_partial call per epoch so progress can be reported along the way
NUM_EPOCHS = 30

for epoch in range(NUM_EPOCHS):
    lightfm_model.fit_partial(
        interactions_matrix_f,
        item_features=item_features_matrix,
        epochs=1,
        num_threads=4,
    )
    epochs_done = epoch + 1
    if not epochs_done % 10:
        print(f"  Epoch {epochs_done}/{NUM_EPOCHS} complete")

print("LightFM model trained!")

# Id mappings of the feature-aware dataset: positions 0 and 2 of mapping()
# are the campaign ("user") and influencer ("item") id -> internal index maps
lfm_mappings = dataset_with_features.mapping()
lfm_campaign_map = lfm_mappings[0]
lfm_influencer_map = lfm_mappings[2]

# Internal index -> influencer id
lfm_influencer_reverse = dict(
    zip(lfm_influencer_map.values(), lfm_influencer_map.keys())
)

In [None]:
# LightFM recommendation function
def get_recommendations_lightfm(campaign_id, top_n=100):
    """
    Rank a campaign's eligible influencers with the trained LightFM model.

    Candidates are the influencers returned by get_eligible_influencers()
    that are on the campaign's network and known to the LightFM id mapping.
    They are scored in a single predict() call (item features included) and
    the ids of the `top_n` highest-scoring ones are returned, best first.

    Returns None when the campaign is not in the LightFM mapping, when no
    influencer is eligible on the campaign's network, or when none of the
    candidates has a LightFM index.
    """
    if campaign_id not in lfm_campaign_map:
        return None

    # Network of the campaign (first matching row)
    network = campaigns[campaigns['id'] == campaign_id].iloc[0]['network_id']

    # Candidate pool: eligible AND on the campaign's network
    eligible = get_eligible_influencers(campaign_id)
    in_pool = (
        influencers_scored['id'].isin(eligible)
        & (influencers_scored['network_id'] == network)
    )
    candidate_ids = influencers_scored.loc[in_pool, 'id']
    if len(candidate_ids) == 0:
        return None

    # Keep only candidates the model has an internal index for
    known_ids = [inf_id for inf_id in candidate_ids if inf_id in lfm_influencer_map]
    if not known_ids:
        return None
    item_indices = np.array([lfm_influencer_map[inf_id] for inf_id in known_ids])

    # Score every candidate for this campaign in one call
    scores = lightfm_model.predict(
        user_ids=lfm_campaign_map[campaign_id],
        item_ids=item_indices,
        item_features=item_features_matrix,
    )

    # Highest score first, truncated to top_n
    best_positions = np.argsort(scores)[::-1][:top_n]
    return [known_ids[pos] for pos in best_positions]

# Smoke test on a single campaign; the function may return None
test_recs = get_recommendations_lightfm(2993, top_n=10)
test_rec_count = len(test_recs or [])
print(f"LightFM recommendations for campaign 2993: {test_rec_count} influencers")

In [None]:
# Evaluate LightFM vs all other approaches
print("=" * 70)
print("EVALUATION: ALL METHODS COMPARISON")
print("=" * 70)

# Recommenders under comparison, keyed by the short name used in results_all.
# Each one is called as f(campaign_id, top_n) and returns a list of
# influencer ids, or a falsy value when it has nothing to recommend.
recommenders = {
    'history': get_recommendations_history,
    'collab': get_recommendations_collaborative,
    'hybrid': get_recommendations_hybrid,
    'lightfm': get_recommendations_lightfm,
}

# Cut-offs evaluated (the summary table header below lists the same three)
EVAL_TOP_NS = [50, 100, 200]

results_all = {method: [] for method in recommenders}

n_test_camps = len(newest_test_camps)
for i, camp_id in enumerate(newest_test_camps):
    if (i + 1) % 25 == 0:
        # Report against the real number of test campaigns, not a fixed 100
        print(f"  Processing campaign {i+1}/{n_test_camps}...")

    actual_accepted = test_interactions[
        (test_interactions['campaign_id'] == camp_id) &
        (test_interactions['accepted'] == 1)
    ]['creator_id'].unique()

    # Campaigns with fewer than 5 accepted creators are not evaluated
    if len(actual_accepted) < 5:
        continue

    actual_set = set(actual_accepted)

    for top_n in EVAL_TOP_NS:
        for method, recommend in recommenders.items():
            rec = recommend(camp_id, top_n)
            # NOTE: a campaign is skipped for a method that returns nothing,
            # so the recall denominators can differ between methods.
            if rec:
                hits = len(set(rec) & actual_set)
                results_all[method].append(
                    {'top_n': top_n, 'hits': hits, 'actual': len(actual_set)}
                )


def _recall_by_top_n(records, top_ns):
    """Pool per-campaign hit records into recall per cut-off.

    Returns {top_n: (recall_fraction, total_hits)}. Cut-offs without any
    record are omitted. An empty `records` list yields an empty dict instead
    of failing on a missing 'top_n' column.
    """
    df = pd.DataFrame(records, columns=['top_n', 'hits', 'actual'])
    summary = {}
    for top_n in top_ns:
        sub = df[df['top_n'] == top_n]
        if len(sub) > 0:
            total_hits = sub['hits'].sum()
            summary[top_n] = (total_hits / sub['actual'].sum(), total_hits)
    return summary


# Print results
print("\n" + "=" * 70)
print("RESULTS COMPARISON - ALL METHODS")
print("=" * 70)

method_names = {
    'history': 'HISTORY (recency-weighted)',
    'collab': 'COLLABORATIVE (campaign similarity)',
    'hybrid': 'HYBRID (history + collab)',
    'lightfm': 'LIGHTFM (matrix factorization)'
}

for method, name in method_names.items():
    print(f"\n{name}:")
    method_summary = _recall_by_top_n(results_all[method], EVAL_TOP_NS)
    for top_n, (recall, total_hits) in method_summary.items():
        print(f"  Top {top_n}: {recall*100:.1f}% recall ({total_hits} hits)")

# Summary table
print("\n" + "=" * 70)
print("SUMMARY TABLE (Recall %)")
print("=" * 70)
print(f"{'Method':<35} {'Top 50':>10} {'Top 100':>10} {'Top 200':>10}")
print("-" * 70)

for method, name in method_names.items():
    method_summary = _recall_by_top_n(results_all[method], EVAL_TOP_NS)
    recalls = [
        f"{method_summary[top_n][0] * 100:.1f}%" if top_n in method_summary else "N/A"
        for top_n in EVAL_TOP_NS
    ]
    print(f"{name:<35} {recalls[0]:>10} {recalls[1]:>10} {recalls[2]:>10}")

In [None]:
# FINAL PRODUCTION FUNCTION - Best of all approaches
def recommend_influencers_v3(campaign_id, top_n=100, method='ensemble'):
    """
    Production-ready influencer recommendation with all strategies.

    Methods:
    - 'history': Rank by past acceptance history
    - 'collaborative': Rank by acceptance in similar campaigns
    - 'lightfm': LightFM matrix factorization model
    - 'ensemble': Combine all signals (recommended); any other value of
      `method` is also treated as 'ensemble'

    Parameters:
    - campaign_id: id of the campaign (looked up in `campaigns`)
    - top_n: maximum number of influencers to return
    - method: which score drives the ranking (see above)

    Scoring:
    - history and collaborative scores are scaled by their maximum
    - LightFM scores can be negative, so they are min-max scaled over the
      influencers the model can score; influencers without a LightFM index
      get 0

    Returns a DataFrame with the recommended influencers and their scores
    (best first), or None when the campaign id is unknown or no influencer
    is eligible on the campaign's network. Progress information is printed.
    """
    camp_rows = campaigns[campaigns['id'] == campaign_id]
    if camp_rows.empty:
        # Same "nothing to recommend" contract as the empty-pool case below,
        # instead of an IndexError from .iloc[0]
        print(f"Campaign {campaign_id} not found!")
        return None
    camp = camp_rows.iloc[0]
    network = camp['network_id']
    network_name = 'Instagram' if network == 1 else 'TikTok' if network == 8 else 'Other'

    print(f"Campaign: {camp['name']}")
    print(f"Network: {network_name}")
    print(f"Method: {method}")

    # Get eligible pool
    eligible = get_eligible_influencers(campaign_id)
    print(f"Eligible (after group filters): {len(eligible)}")

    pool = influencers_scored[
        (influencers_scored['id'].isin(eligible)) &
        (influencers_scored['network_id'] == network)
    ].copy()
    print(f"On {network_name}: {len(pool)}")

    if len(pool) == 0:
        print("No eligible influencers!")
        return None

    # Exclude already contacted
    already = interactions[interactions['campaign_id'] == campaign_id]['creator_id'].unique()
    pool = pool[~pool['id'].isin(already)]
    print(f"After excluding contacted: {len(pool)}")

    # === SCORE 1: History ===
    pool = pool.merge(full_creator_history, left_on='id', right_on='creator_id', how='left')
    pool['history_score'] = pool['recency_score'].fillna(0)

    # === SCORE 2: Collaborative ===
    similar = get_similar_campaigns(campaign_id, top_k=20, exclude_future=False)
    pool['collab_score'] = 0.0

    if similar:
        similar_camp_ids = [s['campaign_id'] for s in similar]
        similarity_lookup = {s['campaign_id']: s['similarity'] for s in similar}

        similar_accepted = interactions[
            (interactions['campaign_id'].isin(similar_camp_ids)) &
            (interactions['accepted'] == 1)
        ].copy()

        if len(similar_accepted) > 0:
            # Each acceptance counts with the similarity of its campaign
            similar_accepted['sim_weight'] = similar_accepted['campaign_id'].map(similarity_lookup)
            collab_scores = similar_accepted.groupby('creator_id')['sim_weight'].sum().reset_index()
            collab_scores.columns = ['id', 'collab_score_new']
            pool = pool.merge(collab_scores, on='id', how='left')
            pool['collab_score'] = pool['collab_score_new'].fillna(0)
            pool = pool.drop(columns=['collab_score_new'], errors='ignore')

    # === SCORE 3: LightFM ===
    pool['lightfm_score'] = 0.0
    # Marks the rows that actually received a model score
    has_lfm = pd.Series(False, index=pool.index)

    if campaign_id in lfm_campaign_map and len(pool) > 0:
        item_idx = pool['id'].map(lfm_influencer_map)
        has_lfm = item_idx.notna()
        if has_lfm.any():
            # One batched predict() call instead of one call per row
            scores = lightfm_model.predict(
                user_ids=lfm_campaign_map[campaign_id],
                item_ids=item_idx[has_lfm].astype(np.int32).to_numpy(),
                item_features=item_features_matrix
            )
            pool.loc[has_lfm, 'lightfm_score'] = scores

    # Normalize the non-negative scores to 0-1 by their maximum
    for col in ['history_score', 'collab_score']:
        max_val = pool[col].max()
        if max_val > 0:
            pool[f'{col}_norm'] = pool[col] / max_val
        else:
            pool[f'{col}_norm'] = 0

    # LightFM scores may be negative: dividing by the max would either zero
    # the signal (max <= 0) or produce large negative values that swamp the
    # ensemble. Min-max scaling over the scored rows keeps the ranking and
    # maps it onto 0-1; unscored rows stay at 0.
    pool['lightfm_score_norm'] = 0.0
    if has_lfm.any():
        scored = pool.loc[has_lfm, 'lightfm_score']
        lo, hi = scored.min(), scored.max()
        if hi > lo:
            pool.loc[has_lfm, 'lightfm_score_norm'] = (scored - lo) / (hi - lo)

    # Calculate final score based on method
    if method == 'history':
        pool['final_score'] = pool['history_score_norm']
    elif method == 'collaborative':
        pool['final_score'] = pool['collab_score_norm']
    elif method == 'lightfm':
        pool['final_score'] = pool['lightfm_score_norm']
    else:  # ensemble
        pool['final_score'] = (
            pool['history_score_norm'] * 0.4 +
            pool['collab_score_norm'] * 0.3 +
            pool['lightfm_score_norm'] * 0.3
        )

    # Return top N
    result = pool.nlargest(top_n, 'final_score')[
        ['id', 'account', 'name', 'followers', 'engagement',
         'times_accepted', 'history_score', 'collab_score', 'lightfm_score', 'final_score']
    ].copy()

    result = result.rename(columns={
        'times_accepted': 'past_acceptances'
    })

    return result

# Example: ensemble recommendations for one campaign, printed as a plain-text table
example_rule = "=" * 70
print("\n" + example_rule)
print("EXAMPLE: Ensemble recommendations for campaign 2993")
print(example_rule)
recs = recommend_influencers_v3(2993, top_n=15, method='ensemble')
if recs is not None:
    print(recs.to_string(index=False))