
# AMBRIX â€” Interest-based Content Recommendation (Runnable Notebook)
This notebook loads `Users.csv`, `Posts.csv`, and `Engagements.csv` and implements a simple content+engagement based recommender.
It includes:
- Data loading & inspection
- A content-based scoring function
- Top-3 recommendations per user (excluding already seen posts)
- Simple offline evaluation: Precision@3 and Recall@3 (approximate)
- Saving outputs (recommendations CSV and a short PDF report)

**Run this notebook in Colab or locally.** Ensure the three CSVs are in the same folder as the notebook or update paths below.


In [1]:

# Imports
import pandas as pd
import numpy as np
from pathlib import Path
print("pandas version:", pd.__version__)
DATA_DIR = Path(".")
users = pd.read_csv(DATA_DIR / "Users.csv")
posts = pd.read_csv(DATA_DIR / "Posts.csv")
engagements = pd.read_csv(DATA_DIR / "Engagements.csv")
print("Loaded: users={}, posts={}, engagements={}".format(len(users), len(posts), len(engagements)))
users.head(), posts.head(), engagements.head()


pandas version: 2.1.2
Loaded: users=50, posts=100, engagements=1000


(  user_id  age gender          top_3_interests  past_engagement_score
 0      U1   24      F      sports, art, gaming                   0.61
 1      U2   32      F    travel, food, fashion                   0.93
 2      U3   28  Other  sports, travel, fashion                   0.40
 3      U4   25      M     fashion, music, tech                   0.53
 4      U5   24      M   fashion, food, fitness                   0.80,
   post_id creator_id content_type            tags
 0      P1        U44        video    sports, food
 1      P2        U26        video   music, travel
 2      P3        U32         text  sports, travel
 3      P4         U6        image   music, gaming
 4      P5        U32        image   food, fashion,
   user_id post_id  engagement
 0      U1     P52           1
 1      U1     P44           0
 2      U1      P1           1
 3      U1      P4           1
 4      U1     P65           0)

In [2]:

# Preprocessing: normalize tag strings and interests
def split_and_clean(s):
    if pd.isna(s):
        return []
    return [t.strip().lower() for t in str(s).split(",") if t.strip()!='']

users['interest_list'] = users['top_3_interests'].apply(split_and_clean)
posts['tag_list'] = posts['tags'].apply(split_and_clean)
# Ensure post_id and user_id are strings
users['user_id'] = users['user_id'].astype(str)
posts['post_id'] = posts['post_id'].astype(str)
engagements['user_id'] = engagements['user_id'].astype(str)
engagements['post_id'] = engagements['post_id'].astype(str)
users.head(2)


Unnamed: 0,user_id,age,gender,top_3_interests,past_engagement_score,interest_list
0,U1,24,F,"sports, art, gaming",0.61,"[sports, art, gaming]"
1,U2,32,F,"travel, food, fashion",0.93,"[travel, food, fashion]"


In [3]:

# Recommendation function (content-based + content-type weight)
def recommend_for_user(user_row, posts_df, engagements_df, top_k=3):
    user_interests = set(user_row['interest_list'])
    # exclude already engaged posts
    seen = set(engagements_df[engagements_df['user_id']==user_row['user_id']]['post_id'])
    candidates = posts_df[~posts_df['post_id'].isin(seen)].copy()
    # compute interest match count
    candidates['interest_matches'] = candidates['tag_list'].apply(lambda tags: len(set(tags) & user_interests))
    # content type score mapping
    ctype_weight = {'video':1.0, 'image':0.8, 'text':0.6}
    candidates['ctype_score'] = candidates['content_type'].map(lambda x: ctype_weight.get(str(x).lower(),0.5))
    # final score
    candidates['score'] = candidates['interest_matches'] + 0.2 * candidates['ctype_score']
    # sort and return top_k
    return candidates.sort_values('score', ascending=False).head(top_k)[['post_id','tags','content_type','score']]
# Example
recommend_for_user(users.iloc[0], posts, engagements).head()


Unnamed: 0,post_id,tags,content_type,score
77,P78,"sports, art",video,2.2
21,P22,"sports, art",audio,2.1
45,P46,"food, sports",video,1.2


In [4]:

# Generate recommendations for all users and save to CSV
all_recs = []
for _, u in users.iterrows():
    recs = recommend_for_user(u, posts, engagements, top_k=3)
    for rank, (_, r) in enumerate(recs.iterrows(), start=1):
        all_recs.append({'user_id': u['user_id'], 'rank': rank, 'post_id': r['post_id'], 'tags': r['tags'], 'content_type': r['content_type'], 'score': r['score']})

recs_df = pd.DataFrame(all_recs)
recs_df.to_csv("AMBRIX_recommendations.csv", index=False)
print("Saved recommendations to AMBRIX_recommendations.csv; sample:")
recs_df.head(10)


Saved recommendations to AMBRIX_recommendations.csv; sample:


Unnamed: 0,user_id,rank,post_id,tags,content_type,score
0,U1,1,P78,"sports, art",video,2.2
1,U1,2,P22,"sports, art",audio,2.1
2,U1,3,P46,"food, sports",video,1.2
3,U2,1,P80,"travel, food",video,2.2
4,U2,2,P5,"food, fashion",image,2.16
5,U2,3,P69,"fashion, food",text,2.12
6,U3,1,P34,"travel, sports",video,2.2
7,U3,2,P39,"travel, sports",video,2.2
8,U3,3,P97,"travel, sports",video,2.2
9,U4,1,P96,"music, fashion",video,2.2


In [5]:

# Simple offline evaluation (Precision@3)
# We'll approximate by checking if any recommended post for a user was engaged in a later time period.
# Since we don't have timestamps, we do a simple holdout: for users with >=4 engagements, holdout last engagement as test.
from collections import defaultdict
# Build user -> engagements list in order given (assumes file order approximates time)
user_to_posts = engagements.groupby('user_id')['post_id'].apply(list).to_dict()
train_eng = []
test_eng = []
for u, posts_list in user_to_posts.items():
    if len(posts_list) >= 4:
        train_eng.extend([(u,p) for p in posts_list[:-1]])
        test_eng.append((u, posts_list[-1]))
    else:
        train_eng.extend([(u,p) for p in posts_list])
# Create DataFrame from train engagements and recompute recommendations on that train set
train_df = pd.DataFrame(train_eng, columns=['user_id','post_id'])
test_df = pd.DataFrame(test_eng, columns=['user_id','post_id'])

# Generate recommendations using train_df as seen engagements
all_recs_train = []
for _, u in users.iterrows():
    recs = recommend_for_user(u, posts, train_df, top_k=3)
    all_recs_train.append((u['user_id'], recs['post_id'].tolist()))

# Compute Precision@3 on test set
hits = 0
total = len(test_df)
for _, row in test_df.iterrows():
    user = row['user_id']
    true_post = row['post_id']
    recs = next((r for (uid,r) in all_recs_train if uid==user), [])
    if true_post in recs:
        hits += 1
precision_at_3 = hits / total if total>0 else np.nan
precision_at_3


0.12

In [6]:

# Save final outputs and provide quick summary
print("Recommendations saved to AMBRIX_recommendations.csv")
print("Precision@3 (approx) calculated above. If NaN, not enough test samples in dataset.")

# Display top recommendations for first 5 users
display(recs_df[recs_df['user_id'].isin(users['user_id'].head(5))].sort_values(['user_id','rank']))


Recommendations saved to AMBRIX_recommendations.csv
Precision@3 (approx) calculated above. If NaN, not enough test samples in dataset.


Unnamed: 0,user_id,rank,post_id,tags,content_type,score
0,U1,1,P78,"sports, art",video,2.2
1,U1,2,P22,"sports, art",audio,2.1
2,U1,3,P46,"food, sports",video,1.2
3,U2,1,P80,"travel, food",video,2.2
4,U2,2,P5,"food, fashion",image,2.16
5,U2,3,P69,"fashion, food",text,2.12
6,U3,1,P34,"travel, sports",video,2.2
7,U3,2,P39,"travel, sports",video,2.2
8,U3,3,P97,"travel, sports",video,2.2
9,U4,1,P96,"music, fashion",video,2.2
