In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read the four JSON exports (paths are relative to the notebook's working
# directory) into one DataFrame each, in a fixed order.
df_activities, df_likes, df_views, df_comments = (
    pd.read_json(json_path)
    for json_path in ("activities.json", "likes.json", "views.json", "comments.json")
)

# Activities without a caption get an empty string instead of a missing value.
df_activities = df_activities.assign(
    caption=lambda frame: frame["caption"].fillna("")
)

# Quick sanity check on how many rows each export contains.
print("✅ Data loaded successfully!")
print(f"Activities: {len(df_activities)}")
print(f"Likes: {len(df_likes)}, Views: {len(df_views)}, Comments: {len(df_comments)}")

✅ Data loaded successfully!
Activities: 408
Likes: 36, Views: 353, Comments: 23


In [3]:
# Calculate engagement scores: every interaction adds its weight to the
# running total of the activity it refers to (Views=1, Likes=2, Comments=3).
engagement_scores = {}

for interactions, weight in ((df_views, 1), (df_likes, 2), (df_comments, 3)):
    # An empty export may have no columns at all, so skip it before touching
    # the 'activity_id' column.
    if interactions.empty:
        continue
    # Walk the column itself instead of DataFrame.iterrows(): iterrows builds
    # a Series for every row (slow) and may coerce values to a common row dtype.
    for activity_id in interactions['activity_id']:
        engagement_scores[activity_id] = engagement_scores.get(activity_id, 0) + weight

# Find activities user hasn't seen: everything in the catalogue that has no
# view, like or comment recorded against it.
engaged_activity_ids = set(engagement_scores)
all_activity_ids = set(df_activities['id'])
unseen_activities = all_activity_ids - engaged_activity_ids

print(f"✅ User engaged with: {len(engaged_activity_ids)} activities")
print(f"✅ Unseen activities to recommend: {len(unseen_activities)}")

✅ User engaged with: 358 activities
✅ Unseen activities to recommend: 50


In [4]:
# Create features for each activity
# We'll use: owner_id (encoded), is_public, and activity age

# Convert created_at to datetime
df_activities['created_at'] = pd.to_datetime(df_activities['created_at'])

# Calculate activity age in whole days, measured against the newest activity
# in the dataset (not against today's date), so the newest activity has age 0.
latest_date = df_activities['created_at'].max()
df_activities['age_days'] = (latest_date - df_activities['created_at']).dt.days

# Encode owner_id as numeric (one-hot would be too sparse)
# NOTE(review): pd.factorize hands out integer codes in order of first
# appearance, so the numeric distance between two owner codes has no meaning
# of its own - worth revisiting if owner similarity should matter.
df_activities['owner_encoded'] = pd.factorize(df_activities['owner_id'])[0]

# Convert is_public to numeric (True -> 1, False -> 0)
df_activities['is_public_num'] = df_activities['is_public'].astype(int)

# Create feature matrix: one row per activity, one column per feature
feature_columns = ['owner_encoded', 'is_public_num', 'age_days']
X = df_activities[feature_columns].values

print("✅ Feature matrix created!")
# Report the real dimensions instead of hard-coding the size of one dataset.
print(f"Shape: {X.shape} ({X.shape[0]} activities × {X.shape[1]} features)")
print("\nFeature preview for first activity:")
print(f"  Owner: {X[0][0]}, Public: {X[0][1]}, Age: {X[0][2]} days")

✅ Feature matrix created!
Shape: (408, 3) (408 activities × 3 features)

Feature preview for first activity:
  Owner: 0, Public: 0, Age: 2 days


In [5]:
# Standardize every feature column so that no feature dominates the cosine
# similarity purely because of the scale it is measured on.
scaler = StandardScaler()
X_normalized = scaler.fit(X).transform(X)

# Cosine similarity of every activity's feature vector with every other one.
similarity_matrix = cosine_similarity(X_normalized)

# Short report: matrix dimensions, how to read the values, one sample pair.
print(
    "✅ ML Model trained!",
    f"Similarity matrix shape: {similarity_matrix.shape}",
    "\nThis matrix contains similarity scores between every pair of activities",
    "Values range from -1 (opposite) to 1 (identical)",
    f"\nExample: Activity 0 vs Activity 1 similarity = {similarity_matrix[0, 1]:.4f}",
    sep="\n",
)

✅ ML Model trained!
Similarity matrix shape: (408, 408)

This matrix contains similarity scores between every pair of activities
Values range from -1 (opposite) to 1 (identical)

Example: Activity 0 vs Activity 1 similarity = 0.9833


In [6]:
# Build user profile: average of activities they engaged with (weighted by score)

# Map every activity id to its row position in df_activities (first occurrence
# wins). X_normalized is a plain NumPy array and must be indexed by position;
# DataFrame index labels only coincide with positions while the frame keeps
# its default 0..n-1 index. One pass here also replaces a full boolean scan of
# the frame for every engaged activity.
position_by_id = {}
for position, known_id in enumerate(df_activities['id']):
    position_by_id.setdefault(known_id, position)

user_liked_indices = []
weights = []

for activity_id, score in engagement_scores.items():
    position = position_by_id.get(activity_id)
    if position is not None:  # interactions with unknown activities are ignored
        user_liked_indices.append(position)
        weights.append(score)

# Create weighted user profile vector (one value per feature column)
user_profile = np.average(X_normalized[user_liked_indices], axis=0, weights=weights)

# Calculate similarity between user profile and ALL activities
user_similarities = cosine_similarity([user_profile], X_normalized)[0]

# Add similarity scores to dataframe (assigned row by row, in matrix order)
df_activities['similarity_score'] = user_similarities

# Filter to unseen activities and sort by similarity, best match first
recommendations = df_activities[df_activities['id'].isin(unseen_activities)].copy()
recommendations = recommendations.sort_values('similarity_score', ascending=False)

print("✅ Recommendations generated using ML!")
print(f"\nTop 10 Personalized Recommendations:")
print(recommendations[['id', 'owner_id', 'age_days', 'similarity_score']].head(10))

✅ Recommendations generated using ML!

Top 10 Personalized Recommendations:
                           id                              owner_id  age_days  \
334  deb1210e19bf1abc6dbcb7b9  aa026a56-e1e4-470f-a489-6d13ee9f157b        12   
126  64b8da8303157bd2578388ab  63b5e5c9-751d-490b-ac2f-5d0f868f5871        12   
112  542111b3e6e6a45205c5d5fc  ccffb4b6-aedc-472b-a044-0a22675d0bc5         1   
221  ada33abdedbf73c7af28cb18  aa026a56-e1e4-470f-a489-6d13ee9f157b         2   
215  aae5c2fb23b9f96de8dfaf3e  63b5e5c9-751d-490b-ac2f-5d0f868f5871        16   
273  c9c49cf69dc16adbd21ae5cd  f45d3ce2-617a-4ccb-a10d-1c02aff5acac        18   
285  cd6d1d6aabbbda926fd0f09a  f64590e0-e860-420c-b8f5-b4fd793be28c        21   
404  ff9c3ffdfade33bdbb0592c1  7ef8fb58-87cd-4c43-aff6-b0ad215e88d3         0   
194  a1ff436b6defb124e5933fc2  06f14842-74cc-4b4e-9b08-042d232b0c84        24   
262  c68cbbc22eb30f9cd28abef0  676be20d-406f-48a2-a9d3-49f3acb92db4        14   

     similarity_score  
334     

In [7]:
def get_recommendations(user_id, top_n=10):
    """
    Return the first ``top_n`` rows of the precomputed recommendations table.

    The rows come from the notebook-level ``recommendations`` DataFrame, in
    the order that table already has, reduced to the id, owner_id, age_days,
    similarity_score and is_public columns.

    ``user_id`` is accepted for interface compatibility but is not used yet;
    a production version would select that user's interactions from a
    database first.

    Returns: DataFrame with recommended activities and similarity scores
    """
    output_columns = ['id', 'owner_id', 'age_days', 'similarity_score', 'is_public']
    top_rows = recommendations.head(top_n)
    return top_rows[output_columns]

# Smoke test: request the five best matches (this version ignores user_id)
print("Testing recommendation function:")
result = get_recommendations("63b5e5c9-751d-490b-ac2f-5d0f868f5871", top_n=5)
print(result)

Testing recommendation function:
                           id                              owner_id  age_days  \
334  deb1210e19bf1abc6dbcb7b9  aa026a56-e1e4-470f-a489-6d13ee9f157b        12   
126  64b8da8303157bd2578388ab  63b5e5c9-751d-490b-ac2f-5d0f868f5871        12   
112  542111b3e6e6a45205c5d5fc  ccffb4b6-aedc-472b-a044-0a22675d0bc5         1   
221  ada33abdedbf73c7af28cb18  aa026a56-e1e4-470f-a489-6d13ee9f157b         2   
215  aae5c2fb23b9f96de8dfaf3e  63b5e5c9-751d-490b-ac2f-5d0f868f5871        16   

     similarity_score  is_public  
334          0.965643       True  
126          0.797390       True  
112          0.788086       True  
221          0.781951       True  
215          0.776394       True  


In [8]:
def get_recommendations_v2(df_activities, df_likes, df_views, df_comments, user_id, top_n=10):
    """
    Get personalized recommendations for a user (content-based filtering).

    Every activity is described by three standardized features (encoded
    owner, public flag, age in days). The user profile is the
    engagement-weighted average of the feature vectors of the activities the
    user interacted with; activities without any interaction are then ranked
    by cosine similarity to that profile.

    Parameters:
    - df_activities: DataFrame of ALL activities in the system; must provide
      the columns 'id', 'owner_id', 'is_public' and 'created_at'. It may carry
      any index and is not modified.
    - df_likes: DataFrame of user's likes (column 'activity_id'; may be empty)
    - df_views: DataFrame of user's views (column 'activity_id'; may be empty)
    - df_comments: DataFrame of user's comments (column 'activity_id'; may be empty)
    - user_id: The user to generate recommendations for. Currently unused:
      the three interaction frames must already be restricted to this user.
    - top_n: Number of recommendations to return

    Returns: DataFrame with top N recommended activities, best match first,
    with the columns 'id', 'owner_id', 'age_days', 'similarity_score' and
    'is_public'. When no interaction refers to a known activity, the newest
    activities are returned instead: all columns of the prepared activity
    frame plus a 'similarity_score' of 0.0 (no profile to compare against).
    """

    # 1. Calculate engagement scores (view=1, like=2, comment=3 per event)
    engagement_scores = {}
    for interactions, weight in ((df_views, 1), (df_likes, 2), (df_comments, 3)):
        # An empty frame may have no columns at all; skip it before indexing.
        if interactions.empty:
            continue
        # Iterate the column directly: iterrows() builds a Series per row and
        # may coerce values to a common row dtype.
        for activity_id in interactions['activity_id']:
            engagement_scores[activity_id] = engagement_scores.get(activity_id, 0) + weight

    # 2. Prepare activity features (on a copy, the caller's frame is untouched)
    df_act = df_activities.copy()
    df_act['created_at'] = pd.to_datetime(df_act['created_at'])
    # Age is measured against the newest activity, not against the current date.
    latest_date = df_act['created_at'].max()
    df_act['age_days'] = (latest_date - df_act['created_at']).dt.days
    df_act['owner_encoded'] = pd.factorize(df_act['owner_id'])[0]
    df_act['is_public_num'] = df_act['is_public'].astype(int)

    # 3. Create feature matrix and normalize
    X = df_act[['owner_encoded', 'is_public_num', 'age_days']].values
    scaler = StandardScaler()
    X_normalized = scaler.fit_transform(X)

    # 4. Build user profile
    # X_normalized is a NumPy array, so rows must be addressed by position.
    # Index labels are only equal to positions for a default RangeIndex, so
    # build an explicit id -> position map (first occurrence wins) instead of
    # reading df_act.index. This is also a single pass rather than one full
    # scan of the frame per engaged activity.
    position_by_id = {}
    for position, known_id in enumerate(df_act['id']):
        position_by_id.setdefault(known_id, position)

    user_liked_indices = []
    weights = []
    for activity_id, score in engagement_scores.items():
        position = position_by_id.get(activity_id)
        if position is not None:  # ignore interactions with unknown activities
            user_liked_indices.append(position)
            weights.append(score)

    if not user_liked_indices:
        # User has no usable interactions, return newest activities. The score
        # column is added so callers can read it on either return path.
        newest = df_act.sort_values('created_at', ascending=False).head(top_n)
        return newest.assign(similarity_score=0.0)

    user_profile = np.average(X_normalized[user_liked_indices], axis=0, weights=weights)

    # 5. Calculate similarities between the profile and every activity
    user_similarities = cosine_similarity([user_profile], X_normalized)[0]
    df_act['similarity_score'] = user_similarities

    # 6. Filter unseen activities and return top N
    engaged_ids = set(engagement_scores)
    unseen = df_act[~df_act['id'].isin(engaged_ids)]
    recommendations = unseen.sort_values('similarity_score', ascending=False).head(top_n)

    return recommendations[['id', 'owner_id', 'age_days', 'similarity_score', 'is_public']]

# Smoke test: run the self-contained version on the frames loaded above
print("Testing new flexible function:")
result = get_recommendations_v2(
    df_activities=df_activities,
    df_likes=df_likes,
    df_views=df_views,
    df_comments=df_comments,
    user_id="63b5e5c9-751d-490b-ac2f-5d0f868f5871",
    top_n=5,
)
print(result)

Testing new flexible function:
                           id                              owner_id  age_days  \
334  deb1210e19bf1abc6dbcb7b9  aa026a56-e1e4-470f-a489-6d13ee9f157b        12   
126  64b8da8303157bd2578388ab  63b5e5c9-751d-490b-ac2f-5d0f868f5871        12   
112  542111b3e6e6a45205c5d5fc  ccffb4b6-aedc-472b-a044-0a22675d0bc5         1   
221  ada33abdedbf73c7af28cb18  aa026a56-e1e4-470f-a489-6d13ee9f157b         2   
215  aae5c2fb23b9f96de8dfaf3e  63b5e5c9-751d-490b-ac2f-5d0f868f5871        16   

     similarity_score  is_public  
334          0.965643       True  
126          0.797390       True  
112          0.788086       True  
221          0.781951       True  
215          0.776394       True  


In [9]:
# Save the function to a Python file
# The text below is the complete source of recommender.py. Compared with the
# exploratory cells, the exported function addresses matrix rows by position
# (so df_activities may carry any index), skips empty interaction frames, and
# always returns a 'similarity_score' column.
code = '''
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

def get_recommendations(df_activities, df_likes, df_views, df_comments, user_id, top_n=10):
    """
    Get personalized recommendations for a user using Content-Based Filtering

    Parameters:
    - df_activities: DataFrame of ALL activities in the system; must provide
      the columns 'id', 'owner_id', 'is_public' and 'created_at'. It may carry
      any index and is not modified.
    - df_likes: DataFrame of user's likes (column 'activity_id'; may be empty)
    - df_views: DataFrame of user's views (column 'activity_id'; may be empty)
    - df_comments: DataFrame of user's comments (column 'activity_id'; may be empty)
    - user_id: The user to generate recommendations for. Currently unused:
      the three interaction frames must already be restricted to this user.
    - top_n: Number of recommendations to return

    Returns: DataFrame with top N recommended activities, best match first,
    with the columns 'id', 'owner_id', 'age_days', 'similarity_score' and
    'is_public'. When no interaction refers to a known activity, the newest
    activities are returned instead: all columns of the prepared activity
    frame plus a 'similarity_score' of 0.0 (no profile to compare against).
    """

    # 1. Calculate engagement scores (view=1, like=2, comment=3 per event)
    engagement_scores = {}
    for interactions, weight in ((df_views, 1), (df_likes, 2), (df_comments, 3)):
        # An empty frame may have no columns at all; skip it before indexing.
        if interactions.empty:
            continue
        # Iterate the column directly: iterrows() builds a Series per row and
        # may coerce values to a common row dtype.
        for activity_id in interactions['activity_id']:
            engagement_scores[activity_id] = engagement_scores.get(activity_id, 0) + weight

    # 2. Prepare activity features (on a copy, the caller's frame is untouched)
    df_act = df_activities.copy()
    df_act['created_at'] = pd.to_datetime(df_act['created_at'])
    # Age is measured against the newest activity, not against the current date.
    latest_date = df_act['created_at'].max()
    df_act['age_days'] = (latest_date - df_act['created_at']).dt.days
    df_act['owner_encoded'] = pd.factorize(df_act['owner_id'])[0]
    df_act['is_public_num'] = df_act['is_public'].astype(int)

    # 3. Create feature matrix and normalize
    X = df_act[['owner_encoded', 'is_public_num', 'age_days']].values
    scaler = StandardScaler()
    X_normalized = scaler.fit_transform(X)

    # 4. Build user profile
    # X_normalized is a NumPy array, so rows must be addressed by position.
    # Index labels are only equal to positions for a default RangeIndex, so
    # build an explicit id -> position map (first occurrence wins).
    position_by_id = {}
    for position, known_id in enumerate(df_act['id']):
        position_by_id.setdefault(known_id, position)

    user_liked_indices = []
    weights = []
    for activity_id, score in engagement_scores.items():
        position = position_by_id.get(activity_id)
        if position is not None:  # ignore interactions with unknown activities
            user_liked_indices.append(position)
            weights.append(score)

    if not user_liked_indices:
        # User has no usable interactions, return newest activities. The score
        # column is added so callers can read it on either return path.
        newest = df_act.sort_values('created_at', ascending=False).head(top_n)
        return newest.assign(similarity_score=0.0)

    user_profile = np.average(X_normalized[user_liked_indices], axis=0, weights=weights)

    # 5. Calculate similarities between the profile and every activity
    user_similarities = cosine_similarity([user_profile], X_normalized)[0]
    df_act['similarity_score'] = user_similarities

    # 6. Filter unseen activities and return top N
    engaged_ids = set(engagement_scores)
    unseen = df_act[~df_act['id'].isin(engaged_ids)]
    recommendations = unseen.sort_values('similarity_score', ascending=False).head(top_n)

    return recommendations[['id', 'owner_id', 'age_days', 'similarity_score', 'is_public']]
'''

# Save to file (explicit encoding so the result does not depend on the platform default)
with open('recommender.py', 'w', encoding='utf-8') as f:
    f.write(code)

print("✅ Saved to recommender.py")
print("You can now import this in FastAPI: from recommender import get_recommendations")

✅ Saved to recommender.py
You can now import this in FastAPI: from recommender import get_recommendations
