# Quality-Focused Influencer Recommender System

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
print("Libraries loaded!")

Libraries loaded!


In [2]:
# Load data
DATA_PATH = '/home/vlad/Work/recommenders/Mydata/'
campaigns = pd.read_csv(f'{DATA_PATH}Campaigns.csv')
briefs = pd.read_csv(f'{DATA_PATH}Briefs.csv')
influencers = pd.read_csv(f'{DATA_PATH}Influencers.csv')
interactions = pd.read_csv(f'{DATA_PATH}Interactions.csv')
print(f"Loaded: {len(campaigns)} campaigns, {len(influencers)} influencers, {len(interactions)} interactions")

Loaded: 2383 campaigns, 63932 influencers, 366352 interactions


## Calculate Quality Score

In [3]:
# Calculate quality score for each influencer
def calculate_quality_score(df):
    result = df.copy()
    
    # Engagement (40%)
    eng_cap = result['engagement'].quantile(0.99)
    result['eng_norm'] = (result['engagement'].clip(0, eng_cap) / eng_cap).fillna(0)
    
    # Followers (25%) - log scale
    result['foll_log'] = np.log1p(result['followers'].fillna(0))
    foll_max = result['foll_log'].quantile(0.99)
    result['foll_norm'] = (result['foll_log'] / foll_max).clip(0, 1)
    
    # Avg likes (20%) - log scale
    result['likes_log'] = np.log1p(result['avg_likes'].fillna(0))
    likes_max = result['likes_log'].quantile(0.99)
    result['likes_norm'] = (result['likes_log'] / likes_max).clip(0, 1)
    
    # Avg comments (15%) - log scale
    result['comments_log'] = np.log1p(result['avg_comments'].fillna(0))
    comments_max = result['comments_log'].quantile(0.99)
    result['comments_norm'] = (result['comments_log'] / comments_max).clip(0, 1)
    
    # Weighted score (0-100)
    result['quality_score'] = (
        result['eng_norm'] * 40 +
        result['foll_norm'] * 25 +
        result['likes_norm'] * 20 +
        result['comments_norm'] * 15
    )
    return result

influencers_scored = calculate_quality_score(influencers)
print("Quality score distribution:")
print(influencers_scored['quality_score'].describe())
print("\nTop 10 quality influencers:")
print(influencers_scored.nlargest(10, 'quality_score')[['account', 'followers', 'engagement', 'avg_likes', 'quality_score']].to_string())

Quality score distribution:
count    63932.000000
mean        22.627401
std         12.603990
min          0.000000
25%         14.507045
50%         21.758675
75%         29.273299
max         95.994725
Name: quality_score, dtype: float64

Top 10 quality influencers:
                   account  followers  engagement  avg_likes  quality_score
34297          marcelasmin    99338.0      367.05   362113.0      95.994725
44618             maadaaaa    47523.0      282.80   134048.0      94.649420
44987          dumpnitrish    32343.0      300.80    97091.8      93.947291
28612  asawani_ngongomelon    25618.0      467.94   118832.0      93.521980
47941           xoxo.grldn    21790.0      322.94    70262.8      93.226687
30504  sean.marcus.leonida    20730.0      652.94   134476.0      93.135698
47202         isisalencary    18504.0      427.48    78317.8      92.928440
49276      urprettygalriri    15867.0     1928.44   305271.0      92.647928
32635           sadidevera    13701.0      515.

In [4]:
# Filter to quality influencers only
MIN_FOLLOWERS = 1000
MIN_ENGAGEMENT = 0.5

quality_influencers = influencers_scored[
    (influencers_scored['followers'] >= MIN_FOLLOWERS) &
    (influencers_scored['engagement'] >= MIN_ENGAGEMENT)
].copy()

print(f"Total: {len(influencers_scored)}, Quality: {len(quality_influencers)}, Filtered: {len(influencers_scored) - len(quality_influencers)}")

Total: 63932, Quality: 20349, Filtered: 43583


## Prepare Training Data

In [5]:
# Create accepted column and merge with quality influencers
interactions['accepted'] = (interactions['status'] == 2).astype(int)

df = interactions.merge(
    quality_influencers[['id', 'network_id', 'followers', 'follows', 'engagement',
                         'avg_likes', 'avg_comments', 'posts', 'reach', 'impressions',
                         'tier_level', 'quality_score']],
    left_on='creator_id', right_on='id', how='inner'
)
print(f"Interactions with quality influencers: {len(df)}")

Interactions with quality influencers: 239186


In [6]:
# Merge campaign data
camp_cols = campaigns[['id', 'type_id', 'description', 'private', 'pre_approve']].copy()
camp_cols = camp_cols.rename(columns={'id': 'camp_id', 'description': 'campaign_description'})
df = df.merge(camp_cols, left_on='campaign_id', right_on='camp_id', how='left')

# Merge brief data
brief_cols = briefs[['id', 'description']].copy()
brief_cols = brief_cols.rename(columns={'id': 'b_id', 'description': 'brief_description'})
df = df.merge(brief_cols, left_on='brief_id', right_on='b_id', how='left')

print(f"After merging: {len(df)} rows")

After merging: 239186 rows


In [7]:
# Fill missing values
numeric_cols = ['followers', 'follows', 'engagement', 'avg_likes', 'avg_comments',
                'posts', 'reach', 'impressions', 'quality_score']
cat_cols = ['network_id', 'type_id', 'private', 'pre_approve', 'tier_level']

for col in numeric_cols:
    if col in df.columns:
        df[col] = df[col].fillna(0)
for col in cat_cols:
    if col in df.columns:
        df[col] = df[col].fillna(-1).astype(int)
print("Missing values filled")

Missing values filled


In [8]:
# TF-IDF on unique briefs (memory efficient)
df['brief_description'] = df['brief_description'].fillna('')
df['campaign_description'] = df['campaign_description'].fillna('')

unique_briefs = df[['brief_id', 'campaign_description', 'brief_description']].drop_duplicates('brief_id')
unique_briefs['text'] = unique_briefs['campaign_description'] + ' ' + unique_briefs['brief_description']

tfidf = TfidfVectorizer(max_features=50, stop_words='english')
tfidf_matrix = tfidf.fit_transform(unique_briefs['text'])

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=[f'tfidf_{i}' for i in range(50)])
tfidf_df['brief_id'] = unique_briefs['brief_id'].values

df = df.merge(tfidf_df, on='brief_id', how='left')
for i in range(50):
    df[f'tfidf_{i}'] = df[f'tfidf_{i}'].fillna(0)
print("TF-IDF features added")

TF-IDF features added


In [9]:
# Historical stats
creator_stats = interactions.groupby('creator_id')['accepted'].agg(['sum', 'count', 'mean']).reset_index()
creator_stats.columns = ['creator_id', 'total_accepted', 'total_interactions', 'acceptance_rate']

campaign_stats = interactions.groupby('campaign_id')['accepted'].agg(['sum', 'count', 'mean']).reset_index()
campaign_stats.columns = ['campaign_id', 'camp_accepted', 'camp_interactions', 'camp_acceptance_rate']

df = df.merge(creator_stats, on='creator_id', how='left')
df = df.merge(campaign_stats, on='campaign_id', how='left')

for col in ['acceptance_rate', 'total_accepted', 'total_interactions', 'camp_acceptance_rate', 'camp_accepted', 'camp_interactions']:
    df[col] = df[col].fillna(0)
print(f"Dataset ready: {df.shape}")

Dataset ready: (239186, 87)


## Train Model

In [10]:
# Define features and train
tfidf_feats = [f'tfidf_{i}' for i in range(50)]
hist_feats = ['acceptance_rate', 'total_accepted', 'total_interactions',
              'camp_acceptance_rate', 'camp_accepted', 'camp_interactions']

all_features = numeric_cols + cat_cols + tfidf_feats + hist_feats
all_features = [f for f in all_features if f in df.columns]

X = df[all_features].fillna(0)
y = df['accepted']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Train: {len(X_train)}, Test: {len(X_test)}")

Train: 191348, Test: 47838


In [11]:
# Train LightGBM
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

params = {'objective': 'binary', 'metric': 'auc', 'verbosity': -1,
          'num_leaves': 31, 'learning_rate': 0.05, 'is_unbalance': True}

model = lgb.train(params, train_data, 300, valid_sets=[test_data],
                  callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)])

y_pred = model.predict(X_test)
print(f"\nROC-AUC: {roc_auc_score(y_test, y_pred):.4f}")

# Show network mapping
print("\n=== NETWORK MAPPING ===")
print("Network 1: Instagram")
print("Network 8: TikTok")
print("Network 9: Other")

Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.96337
[200]	valid_0's auc: 0.964519
[300]	valid_0's auc: 0.964936
Did not meet early stopping. Best iteration is:
[300]	valid_0's auc: 0.964936

ROC-AUC: 0.9649

=== NETWORK MAPPING ===
Network 1: Instagram
Network 8: TikTok
Network 9: Other


## Campaign-Aware Recommendation Functions

Key insight: Different campaigns target different networks (Instagram vs TikTok) and have zero overlap in successful influencers!

In [12]:
def recommend_for_campaign(campaign_id, brief_id, top_n=10,
                           min_followers=1000, min_engagement=1.0,
                           quality_weight=0.7, acceptance_weight=0.3):
    """
    Campaign-aware recommendations that:
    1. Filter by campaign's network (Instagram/TikTok)
    2. Use text similarity between influencer bio and campaign description
    3. Weight quality + acceptance probability
    """
    camp_info = campaigns[campaigns['id'] == campaign_id].iloc[0]
    brief_info = briefs[briefs['id'] == brief_id].iloc[0]
    
    campaign_network = camp_info['network_id']
    campaign_text = str(camp_info.get('description', '')) + ' ' + str(brief_info.get('description', ''))
    
    print(f"Campaign: {camp_info['name']}")
    print(f"Network: {campaign_network} ({'Instagram' if campaign_network == 1 else 'TikTok' if campaign_network == 8 else 'Other'})") 
    
    # Start with quality influencers
    pred_df = influencers_scored[
        (influencers_scored['followers'] >= min_followers) &
        (influencers_scored['engagement'] >= min_engagement)
    ].copy()
    
    # IMPORTANT: Filter by campaign network!
    pred_df = pred_df[pred_df['network_id'] == campaign_network]
    print(f"Influencers on this network: {len(pred_df)}")
    
    # Exclude already interacted
    already = interactions[interactions['campaign_id'] == campaign_id]['creator_id'].unique()
    pred_df = pred_df[~pred_df['id'].isin(already)]
    print(f"After excluding already contacted: {len(pred_df)}")
    
    if len(pred_df) == 0:
        print("No influencers available!")
        return None
    
    # Add campaign features for model
    pred_df['campaign_id'] = campaign_id
    pred_df['type_id'] = camp_info.get('type_id', -1)
    pred_df['private'] = camp_info.get('private', 0)
    pred_df['pre_approve'] = camp_info.get('pre_approve', 0)
    
    # TF-IDF features from campaign text
    tfidf_arr = tfidf.transform([campaign_text]).toarray()[0]
    for i in range(50):
        pred_df[f'tfidf_{i}'] = tfidf_arr[i]
    
    # Calculate text similarity between influencer bio and campaign
    pred_df['biography'] = pred_df['biography'].fillna('')
    bio_tfidf = tfidf.transform(pred_df['biography'].tolist())
    campaign_tfidf = tfidf.transform([campaign_text])
    
    # Cosine similarity
    from sklearn.metrics.pairwise import cosine_similarity
    similarities = cosine_similarity(bio_tfidf, campaign_tfidf).flatten()
    pred_df['bio_similarity'] = similarities
    
    # Add historical features
    pred_df = pred_df.merge(creator_stats, left_on='id', right_on='creator_id', how='left')
    pred_df = pred_df.merge(campaign_stats, on='campaign_id', how='left')
    
    # Fill missing features
    for col in all_features:
        if col not in pred_df.columns:
            pred_df[col] = 0
        pred_df[col] = pred_df[col].fillna(0)
    
    # Predict acceptance probability
    pred_df['acceptance_prob'] = model.predict(pred_df[all_features])
    
    # Normalize scores
    pred_df['quality_norm'] = pred_df['quality_score'] / pred_df['quality_score'].max()
    pred_df['similarity_norm'] = pred_df['bio_similarity'] / (pred_df['bio_similarity'].max() + 0.001)
    
    # Combined score: quality + acceptance + bio similarity
    pred_df['final_score'] = (
        pred_df['quality_norm'] * quality_weight * 0.6 +
        pred_df['acceptance_prob'] * acceptance_weight +
        pred_df['similarity_norm'] * quality_weight * 0.4  # Bio match is part of quality
    )
    
    result = pred_df.nlargest(top_n, 'final_score')[
        ['id', 'account', 'name', 'followers', 'engagement', 'avg_likes', 
         'quality_score', 'bio_similarity', 'acceptance_prob', 'final_score']
    ].copy()
    
    return result

In [13]:
# Remove old function - we'll use the new campaign-aware one
pass

## Proper Evaluation: Train on Old Campaigns, Test on New

We'll:
1. Train on campaigns before April 2024
2. Test on campaigns after April 2024
3. For each test campaign, recommend top N influencers
4. Measure how many of our recommendations match the actual accepted influencers

In [14]:
# Split campaigns by date
campaigns['created_at'] = pd.to_datetime(campaigns['created_at'])
SPLIT_DATE = '2024-04-01'

train_campaigns = campaigns[campaigns['created_at'] < SPLIT_DATE]['id'].tolist()
test_campaigns = campaigns[campaigns['created_at'] >= SPLIT_DATE]['id'].tolist()

# Get interactions for train/test
train_interactions = interactions[interactions['campaign_id'].isin(train_campaigns)].copy()
test_interactions = interactions[interactions['campaign_id'].isin(test_campaigns)].copy()

print(f"Training campaigns: {len(train_campaigns)}")
print(f"Test campaigns: {len(test_campaigns)}")
print(f"Training interactions: {len(train_interactions)}")
print(f"Test interactions: {len(test_interactions)}")

# Filter test to campaigns with at least 5 accepted
test_accepted = test_interactions[test_interactions['accepted'] == 1].groupby('campaign_id').size()
test_camps_with_accepted = test_accepted[test_accepted >= 5].index.tolist()
print(f"\nTest campaigns with 5+ accepted influencers: {len(test_camps_with_accepted)}")

Training campaigns: 1029
Test campaigns: 1354
Training interactions: 139192
Test interactions: 227160

Test campaigns with 5+ accepted influencers: 677


In [15]:
# Build training data using ONLY training campaigns
# Historical stats from training data only
train_creator_stats = train_interactions.groupby('creator_id')['accepted'].agg(['sum', 'count', 'mean']).reset_index()
train_creator_stats.columns = ['creator_id', 'total_accepted', 'total_interactions', 'acceptance_rate']

train_campaign_stats = train_interactions.groupby('campaign_id')['accepted'].agg(['sum', 'count', 'mean']).reset_index()
train_campaign_stats.columns = ['campaign_id', 'camp_accepted', 'camp_interactions', 'camp_acceptance_rate']

# Build training dataset
train_df = train_interactions.merge(
    quality_influencers[['id', 'network_id', 'followers', 'follows', 'engagement',
                         'avg_likes', 'avg_comments', 'posts', 'reach', 'impressions',
                         'tier_level', 'quality_score', 'biography']],
    left_on='creator_id', right_on='id', how='inner'
)

# Merge campaign data
camp_cols = campaigns[['id', 'type_id', 'description', 'private', 'pre_approve', 'network_id']].copy()
camp_cols = camp_cols.rename(columns={'id': 'camp_id', 'description': 'campaign_description', 'network_id': 'camp_network'})
train_df = train_df.merge(camp_cols, left_on='campaign_id', right_on='camp_id', how='left')

# Merge brief data
brief_cols = briefs[['id', 'description']].copy()
brief_cols = brief_cols.rename(columns={'id': 'b_id', 'description': 'brief_description'})
train_df = train_df.merge(brief_cols, left_on='brief_id', right_on='b_id', how='left')

print(f"Training data: {len(train_df)} rows")

Training data: 103565 rows


In [16]:
# Prepare features for training
train_df['brief_description'] = train_df['brief_description'].fillna('')
train_df['campaign_description'] = train_df['campaign_description'].fillna('')

# Fill numeric/categorical
for col in numeric_cols:
    if col in train_df.columns:
        train_df[col] = train_df[col].fillna(0)
for col in cat_cols:
    if col in train_df.columns:
        train_df[col] = train_df[col].fillna(-1).astype(int)

# TF-IDF on training briefs only
train_unique_briefs = train_df[['brief_id', 'campaign_description', 'brief_description']].drop_duplicates('brief_id')
train_unique_briefs['text'] = train_unique_briefs['campaign_description'] + ' ' + train_unique_briefs['brief_description']

train_tfidf = TfidfVectorizer(max_features=50, stop_words='english')
train_tfidf_matrix = train_tfidf.fit_transform(train_unique_briefs['text'])

tfidf_df = pd.DataFrame(train_tfidf_matrix.toarray(), columns=[f'tfidf_{i}' for i in range(50)])
tfidf_df['brief_id'] = train_unique_briefs['brief_id'].values

train_df = train_df.merge(tfidf_df, on='brief_id', how='left')
for i in range(50):
    train_df[f'tfidf_{i}'] = train_df[f'tfidf_{i}'].fillna(0)

# Add historical stats
train_df = train_df.merge(train_creator_stats, on='creator_id', how='left')
train_df = train_df.merge(train_campaign_stats, on='campaign_id', how='left')
for col in ['acceptance_rate', 'total_accepted', 'total_interactions', 'camp_acceptance_rate', 'camp_accepted', 'camp_interactions']:
    train_df[col] = train_df[col].fillna(0)

print(f"Training features prepared: {train_df.shape}")

Training features prepared: (103565, 89)


In [17]:
# Train model on training data only
tfidf_feats = [f'tfidf_{i}' for i in range(50)]
hist_feats = ['acceptance_rate', 'total_accepted', 'total_interactions',
              'camp_acceptance_rate', 'camp_accepted', 'camp_interactions']

train_features = numeric_cols + cat_cols + tfidf_feats + hist_feats
train_features = [f for f in train_features if f in train_df.columns]

X_train = train_df[train_features].fillna(0)
y_train = train_df['accepted']

print(f"Training on {len(X_train)} samples with {len(train_features)} features")

# Train
train_data = lgb.Dataset(X_train, label=y_train)
params = {'objective': 'binary', 'metric': 'auc', 'verbosity': -1,
          'num_leaves': 31, 'learning_rate': 0.05, 'is_unbalance': True}

eval_model = lgb.train(params, train_data, 200)
print("Model trained on historical data only!")

Training on 103565 samples with 70 features
Model trained on historical data only!


In [18]:
# Evaluation function: For a test campaign, recommend top N and check overlap with actual accepted
from sklearn.metrics.pairwise import cosine_similarity

def get_recommendations_for_eval(campaign_id, brief_id, top_n=50):
    """Get top N recommendations for a campaign using the trained model."""
    camp_info = campaigns[campaigns['id'] == campaign_id].iloc[0]
    brief_info = briefs[briefs['id'] == brief_id]
    if len(brief_info) == 0:
        return None
    brief_info = brief_info.iloc[0]
    
    campaign_network = camp_info['network_id']
    campaign_text = str(camp_info.get('description', '')) + ' ' + str(brief_info.get('description', ''))
    
    # Get quality influencers on same network
    pred_df = influencers_scored[
        (influencers_scored['followers'] >= 1000) &
        (influencers_scored['engagement'] >= 0.5) &
        (influencers_scored['network_id'] == campaign_network)
    ].copy()
    
    if len(pred_df) == 0:
        return None
    
    # Add features
    pred_df['campaign_id'] = campaign_id
    pred_df['type_id'] = camp_info.get('type_id', -1)
    pred_df['private'] = camp_info.get('private', 0)
    pred_df['pre_approve'] = camp_info.get('pre_approve', 0)
    
    # TF-IDF
    try:
        tfidf_arr = train_tfidf.transform([campaign_text]).toarray()[0]
        for i in range(50):
            pred_df[f'tfidf_{i}'] = tfidf_arr[i]
    except:
        for i in range(50):
            pred_df[f'tfidf_{i}'] = 0
    
    # Historical stats (from training data only!)
    pred_df = pred_df.merge(train_creator_stats, left_on='id', right_on='creator_id', how='left')
    pred_df = pred_df.merge(train_campaign_stats, on='campaign_id', how='left')
    
    # Fill missing
    for col in train_features:
        if col not in pred_df.columns:
            pred_df[col] = 0
        pred_df[col] = pred_df[col].fillna(0)
    
    # Predict
    pred_df['score'] = eval_model.predict(pred_df[train_features])
    
    # Add quality score to final ranking
    pred_df['quality_norm'] = pred_df['quality_score'] / pred_df['quality_score'].max()
    pred_df['final_score'] = pred_df['score'] * 0.5 + pred_df['quality_norm'] * 0.5
    
    return pred_df.nlargest(top_n, 'final_score')['id'].tolist()

In [19]:
# Evaluate on test campaigns
results = []

for camp_id in test_camps_with_accepted[:100]:  # Test on first 100 campaigns
    # Get actual accepted influencers
    actual_accepted = test_interactions[
        (test_interactions['campaign_id'] == camp_id) & 
        (test_interactions['accepted'] == 1)
    ]['creator_id'].unique()
    
    if len(actual_accepted) < 5:
        continue
    
    # Get brief
    camp_briefs = briefs[briefs['campaign_id'] == camp_id]
    if len(camp_briefs) == 0:
        continue
    brief_id = camp_briefs.iloc[0]['id']
    
    # Get recommendations
    for top_n in [10, 25, 50, 100]:
        recommended = get_recommendations_for_eval(camp_id, brief_id, top_n=top_n)
        if recommended is None:
            continue
        
        # Calculate metrics
        recommended_set = set(recommended)
        actual_set = set(actual_accepted)
        
        hits = len(recommended_set & actual_set)
        precision = hits / len(recommended_set) if len(recommended_set) > 0 else 0
        recall = hits / len(actual_set) if len(actual_set) > 0 else 0
        
        results.append({
            'campaign_id': camp_id,
            'top_n': top_n,
            'actual_accepted': len(actual_set),
            'recommended': len(recommended_set),
            'hits': hits,
            'precision': precision,
            'recall': recall
        })

results_df = pd.DataFrame(results)
print(f"Evaluated {results_df['campaign_id'].nunique()} test campaigns")

Evaluated 99 test campaigns


In [20]:
# Show evaluation results
print("=" * 70)
print("EVALUATION RESULTS: Model trained on old campaigns, tested on new")
print("=" * 70)

# Aggregate by top_n
summary = results_df.groupby('top_n').agg({
    'hits': 'sum',
    'actual_accepted': 'sum',
    'recommended': 'sum',
    'precision': 'mean',
    'recall': 'mean',
    'campaign_id': 'count'
}).rename(columns={'campaign_id': 'num_campaigns'})

summary['overall_precision'] = summary['hits'] / summary['recommended']
summary['overall_recall'] = summary['hits'] / summary['actual_accepted']

print("\n=== SUMMARY BY TOP_N ===")
print(summary[['num_campaigns', 'hits', 'actual_accepted', 'overall_precision', 'overall_recall']].to_string())

print("\n\n=== INTERPRETATION ===")
for top_n in [10, 25, 50, 100]:
    row = summary.loc[top_n]
    print(f"Top {top_n}: {row['overall_recall']*100:.1f}% of actual accepted influencers found in recommendations")

EVALUATION RESULTS: Model trained on old campaigns, tested on new

=== SUMMARY BY TOP_N ===
       num_campaigns  hits  actual_accepted  overall_precision  overall_recall
top_n                                                                         
10                99     0             2024           0.000000        0.000000
25                99     1             2024           0.000404        0.000494
50                99     1             2024           0.000202        0.000494
100               99     2             2024           0.000202        0.000988


=== INTERPRETATION ===
Top 10: 0.0% of actual accepted influencers found in recommendations
Top 25: 0.0% of actual accepted influencers found in recommendations
Top 50: 0.0% of actual accepted influencers found in recommendations
Top 100: 0.1% of actual accepted influencers found in recommendations


In [21]:
# Cell 26: Diagnose why recommendations don't match
sample_camp = test_camps_with_accepted[0]
camp_briefs = briefs[briefs['campaign_id'] == sample_camp]
brief_id = camp_briefs.iloc[0]['id']

# Who actually accepted
actual = test_interactions[
  (test_interactions['campaign_id'] == sample_camp) &
  (test_interactions['accepted'] == 1)
]['creator_id'].unique()

print(f"Campaign {sample_camp}")
print(f"Actual accepted: {len(actual)}")

# Check if actual accepted are in quality_influencers
actual_in_quality = influencers_scored[influencers_scored['id'].isin(actual)]
print(f"Accepted influencers that meet quality threshold: {len(actual_in_quality)}")

# Check their quality scores
if len(actual_in_quality) > 0:
  print(f"\nActual accepted influencers stats:")
  print(f"  Followers range: {actual_in_quality['followers'].min():.0f} - {actual_in_quality['followers'].max():.0f}")
  print(f"  Engagement range: {actual_in_quality['engagement'].min():.2f} - {actual_in_quality['engagement'].max():.2f}")
  print(f"  Quality score range: {actual_in_quality['quality_score'].min():.1f} - {actual_in_quality['quality_score'].max():.1f}")

# How many pass our filters?
filtered = actual_in_quality[
  (actual_in_quality['followers'] >= 1000) &
  (actual_in_quality['engagement'] >= 0.5)
]
print(f"  Pass our filters (1000+ followers, 0.5+ engagement): {len(filtered)}")

# Network check
camp_network = campaigns[campaigns['id'] == sample_camp]['network_id'].values[0]
print(f"\nCampaign network: {camp_network}")
print(f"Accepted influencers on this network: {len(actual_in_quality[actual_in_quality['network_id'] == camp_network])}")


IndentationError: unexpected indent (1602957946.py, line 2)