In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from collections import defaultdict

# Import datasets

In [2]:
# Load the three input tables; the file names are relative paths.
users, posts, engagements = (
    pd.read_csv(csv_name)
    for csv_name in ("Users.csv", "Posts.csv", "Engagements.csv")
)

# Preview the first rows of each table, one after another.
print(users.head(), posts.head(), engagements.head(), sep="\n")

  user_id  age gender          top_3_interests  past_engagement_score
0      U1   24      F      sports, art, gaming                   0.61
1      U2   32      F    travel, food, fashion                   0.93
2      U3   28  Other  sports, travel, fashion                   0.40
3      U4   25      M     fashion, music, tech                   0.53
4      U5   24      M   fashion, food, fitness                   0.80
  post_id creator_id content_type            tags
0      P1        U44        video    sports, food
1      P2        U26        video   music, travel
2      P3        U32         text  sports, travel
3      P4         U6        image   music, gaming
4      P5        U32        image   food, fashion
  user_id post_id  engagement
0      U1     P52           1
1      U1     P44           0
2      U1      P1           1
3      U1      P4           1
4      U1     P65           0


In [3]:
# (rows, columns) of the users table.
users.shape

(50, 5)

# Cleaning and Preprocessing

In [4]:
# Fill missing values in the text fields and create unified text columns for modeling.
def build_text_fields(users, posts):
    """Return copies of ``users`` and ``posts`` with an added ``text`` column.

    Missing values in ``top_3_interests``, ``tags`` and ``content_type`` are
    replaced by empty strings and the columns are cast to ``str``. Every comma
    is turned into a space, so ``"a, b"`` becomes ``"a  b"`` (two spaces).

    Args:
        users: DataFrame with a ``top_3_interests`` column.
        posts: DataFrame with ``tags`` and ``content_type`` columns.

    Returns:
        ``(users, posts)`` as new DataFrames; the inputs are not modified.
        ``users['text']`` is the interests string, ``posts['text']`` is the
        tags followed by a space and the content type.
    """
    users_out = users.copy()
    interests = users_out['top_3_interests'].fillna('').astype(str)
    users_out['top_3_interests'] = interests
    users_out['text'] = interests.str.replace(',', ' ')

    posts_out = posts.copy()
    for column in ('tags', 'content_type'):
        posts_out[column] = posts_out[column].fillna('').astype(str)
    tags_and_type = posts_out['tags'] + ' ' + posts_out['content_type']
    posts_out['text'] = tags_and_type.str.replace(',', ' ')

    return users_out, posts_out



# Swap the raw frames for their cleaned copies, then preview the new text column.
users, posts = build_text_fields(users, posts)

print(
    users[['user_id', 'text']].head(),
    posts[['post_id', 'text']].head(),
    sep="\n",
)


  user_id                     text
0      U1      sports  art  gaming
1      U2    travel  food  fashion
2      U3  sports  travel  fashion
3      U4     fashion  music  tech
4      U5   fashion  food  fitness
  post_id                 text
0      P1   sports  food video
1      P2  music  travel video
2      P3  sports  travel text
3      P4  music  gaming image
4      P5  food  fashion image


In [5]:
# NOTE(review): this cell repeats the preview already printed at the end of the
# previous cell; it shows nothing new and can be removed.
print(users[['user_id','text']].head())
print(posts[['post_id','text']].head())

  user_id                     text
0      U1      sports  art  gaming
1      U2    travel  food  fashion
2      U3  sports  travel  fashion
3      U4     fashion  music  tech
4      U5   fashion  food  fitness
  post_id                 text
0      P1   sports  food video
1      P2  music  travel video
2      P3  sports  travel text
3      P4  music  gaming image
4      P5  food  fashion image


# Build embeddings and compute similarity scores

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Build TF-IDF embeddings
def build_tfidf_embeddings(users, posts, max_features=5000):
    """Fit one TF-IDF vocabulary on post and user text and vectorise both.

    Posts and users are vectorised in a single corpus so that their rows share
    the same feature columns and can be compared directly. Unigrams and
    bigrams are used as features.

    Args:
        users: DataFrame with a ``text`` column.
        posts: DataFrame with a ``text`` column.
        max_features: Passed to ``TfidfVectorizer`` as ``max_features``.

    Returns:
        ``(vec, posts_tfidf, users_tfidf)``: the fitted vectorizer, the TF-IDF
        rows for the posts and the TF-IDF rows for the users, each in the row
        order of its input frame.
    """
    n_posts = len(posts)
    post_docs = posts['text'].astype(str).tolist()
    user_docs = users['text'].astype(str).tolist()

    vec = TfidfVectorizer(max_features=max_features, ngram_range=(1, 2))
    # Posts come first in the corpus, so the first n_posts rows belong to them.
    tfidf = vec.fit_transform(post_docs + user_docs)

    return vec, tfidf[:n_posts, :], tfidf[n_posts:, :]

# Fit the shared vocabulary and vectorise every post and user.
vec, posts_tfidf, users_tfidf = build_tfidf_embeddings(users, posts)

# Example: similarity between U1 and all posts
user_idx = 0  # U1
user_vec = users_tfidf[user_idx]
# One cosine-similarity score per post, in the row order of `posts`.
content_scores = cosine_similarity(user_vec, posts_tfidf).flatten()

print("Scores for U1:", content_scores)


Scores for U1: [0.08415727 0.         0.08205357 0.1070333  0.         0.12442529
 0.         0.         0.         0.16795545 0.         0.
 0.15718159 0.         0.         0.         0.09784576 0.
 0.         0.         0.12040041 0.5132872  0.         0.1339772
 0.15538247 0.         0.         0.10241839 0.07829219 0.07949901
 0.         0.         0.15538247 0.09309931 0.         0.
 0.         0.         0.09309931 0.08075762 0.         0.
 0.10108838 0.         0.         0.09784576 0.13201982 0.08540065
 0.10867938 0.12442529 0.         0.09393258 0.         0.
 0.15654304 0.07962646 0.08381579 0.09784576 0.15336104 0.09393258
 0.12336389 0.15336104 0.         0.         0.12442529 0.11890016
 0.12442529 0.         0.         0.12505337 0.         0.08372652
 0.         0.         0.15538247 0.         0.         0.53576966
 0.         0.         0.11474597 0.11013459 0.11885273 0.
 0.11116192 0.         0.15654304 0.         0.         0.
 0.         0.         0.         0. 

# Content-based recommendation function

In [7]:
def recommend_top_posts(user_idx, users_tfidf, posts_tfidf, posts, top_n=3):
    """Rank posts for one user by content similarity alone.

    Args:
        user_idx: Row position of the user in ``users_tfidf``.
        users_tfidf: TF-IDF matrix with one row per user.
        posts_tfidf: TF-IDF matrix with one row per post; must follow the row
            order of ``posts`` because results are looked up by position.
        posts: DataFrame with ``post_id`` and ``text`` columns.
        top_n: Number of posts to return.

    Returns:
        ``(rec_posts, rec_scores)``: the ``post_id``/``text`` rows of the
        best-scoring posts and their cosine similarities, highest first.
    """
    similarity = cosine_similarity(users_tfidf[user_idx], posts_tfidf).flatten()
    ranking = np.argsort(similarity)[::-1]  # ascending argsort, reversed
    best = ranking[:top_n]
    return posts.iloc[best][['post_id', 'text']], similarity[best]

# Example: Recommendations for U1 (row 0 of users_tfidf), using the default top_n=3
rec_posts, rec_scores = recommend_top_posts(0, users_tfidf, posts_tfidf, posts)
print("Recommended posts for U1:\n", rec_posts)
print("Scores:", rec_scores)

Recommended posts for U1:
    post_id               text
77     P78  sports  art video
21     P22  sports  art audio
9      P10          art video
Scores: [0.53576966 0.5132872  0.16795545]


# Engagement Matrix

In [8]:
def build_engagement_matrix(users, posts, engagements):
    """Build a dense posts x users matrix of engagement values.

    Args:
        users: DataFrame with a ``user_id`` column; its row order sets the
            column order of the matrix.
        posts: DataFrame with a ``post_id`` column; its row order sets the
            row order of the matrix.
        engagements: DataFrame with ``user_id``, ``post_id`` and
            ``engagement`` columns, one row per interaction.

    Returns:
        ``(M, post_index, user_index)``. ``M[p, u]`` holds the engagement of
        user ``u`` with post ``p`` and is 0 where no interaction is recorded;
        the two dicts map each id to its row / column position in ``M``.
        Interactions naming an unknown post or user are ignored, and when a
        (post, user) pair occurs more than once the last row wins.
    """
    post_ids = posts['post_id'].tolist()
    user_ids = users['user_id'].tolist()
    post_index = dict(zip(post_ids, range(len(post_ids))))
    user_index = dict(zip(user_ids, range(len(user_ids))))

    M = np.zeros((len(post_ids), len(user_ids)))  # posts x users
    for record in engagements.to_dict('records'):
        pid, uid, eng = record['post_id'], record['user_id'], record['engagement']
        # Skip interactions that reference a post or user we do not know.
        if pid not in post_index or uid not in user_index:
            continue
        M[post_index[pid], user_index[uid]] = eng
    return M, post_index, user_index

# Rows are posts and columns are users, so the shape is (n_posts, n_users).
M, post_index, user_index = build_engagement_matrix(users, posts, engagements)
print("Engagement matrix shape:", M.shape)

Engagement matrix shape: (100, 50)


In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Each row of M is one post's engagement vector across users, so row-wise
# cosine similarity compares posts by which users engaged with them.
item_sim = cosine_similarity(M)  # posts x posts similarity
print("Item-item similarity matrix shape:", item_sim.shape)

Item-item similarity matrix shape: (100, 100)


In [10]:
def post_popularity(posts, engagements):
    """Min-max normalised popularity of every post, aligned with ``posts``.

    Popularity is the sum of ``engagement`` per ``post_id``; posts without any
    engagement row get 0 before normalisation.

    Args:
        posts: DataFrame with a ``post_id`` column.
        engagements: DataFrame with ``post_id`` and ``engagement`` columns.

    Returns:
        Series indexed by ``posts['post_id']`` (same order) with values in
        [0, 1]. When every post has the same popularity there is nothing to
        rank on, so the result is all zeros.
    """
    pop = (
        engagements.groupby('post_id')['engagement'].sum()
        .reindex(posts['post_id'])
        .fillna(0)
    )
    lowest = pop.min()
    spread = pop.max() - lowest
    # Guard on the spread rather than on the maximum: if all posts share the
    # same positive popularity, max > 0 holds but max - min is 0, and dividing
    # by it would turn every score into NaN. A NaN spread (no posts at all)
    # also fails this test and takes the constant branch.
    if spread > 0:
        return (pop - lowest) / spread
    return pd.Series(0.0, index=pop.index, name=pop.name)

# Popularity per post, indexed by post_id in the row order of `posts`.
pop_scores = post_popularity(posts, engagements)
print(pop_scores.head())


post_id
P1    1.000000
P2    0.166667
P3    0.333333
P4    0.500000
P5    0.750000
Name: engagement, dtype: float64


# Hybrid recommendation system

In [11]:
def recommend_hybrid(user_idx, users, posts, posts_tfidf, users_tfidf,
                     item_sim, pop_scores, top_k=3,
                     w_content=0.6, w_collab=0.3, w_pop=0.1,
                     engagements=None):
    """Rank posts for one user with a weighted blend of three signals.

    ``final = w_content * content + w_collab * collab + w_pop * popularity``

    * content: cosine similarity between the user's and each post's TF-IDF row.
    * collab: mean item-item similarity to the posts the user engaged with
      (``engagement > 0``); all zeros when there are none.
    * popularity: ``pop_scores`` as given.

    Args:
        user_idx: Row position of the user in ``users`` and ``users_tfidf``.
        users: DataFrame with a ``user_id`` column.
        posts: DataFrame with ``post_id`` and ``text`` columns. ``posts_tfidf``,
            ``item_sim`` and ``pop_scores`` must follow its row order, because
            everything is combined and looked up by position.
        posts_tfidf: TF-IDF matrix with one row per post.
        users_tfidf: TF-IDF matrix with one row per user.
        item_sim: Square posts x posts similarity matrix.
        pop_scores: One popularity value per post (Series or array-like).
        top_k: Number of posts to return.
        w_content, w_collab, w_pop: Weights of the three signals.
        engagements: DataFrame with ``user_id``, ``post_id`` and ``engagement``
            columns. When omitted, the module-level ``engagements`` frame is
            used, which is what callers that predate this parameter rely on.

    Returns:
        ``(rec_posts, rec_scores)``: the ``post_id``/``text`` rows of the
        ``top_k`` best posts and their hybrid scores, highest first.

    Raises:
        NameError: If ``engagements`` is not passed and no module-level
            ``engagements`` exists.
    """
    if engagements is None:
        try:
            engagements = globals()['engagements']
        except KeyError:
            raise NameError(
                "recommend_hybrid needs an `engagements` DataFrame: pass it "
                "explicitly or define it at module level"
            ) from None

    user_vec = users_tfidf[user_idx]
    content_scores = cosine_similarity(user_vec, posts_tfidf).flatten()

    # Derive post positions from `posts` itself so the lookup always matches
    # the frame the scores are aligned with, instead of a separate global.
    row_of_post = {pid: pos for pos, pid in enumerate(posts['post_id'])}

    # Collaborative score: average similarity with posts the user already engaged.
    user_id = users.iloc[user_idx]['user_id']
    engaged_mask = (engagements['user_id'] == user_id) & (engagements['engagement'] > 0)
    engaged_rows = [
        row_of_post[pid]
        for pid in engagements.loc[engaged_mask, 'post_id']
        if pid in row_of_post
    ]
    collab_scores = np.zeros(len(posts))
    for row in engaged_rows:
        collab_scores += item_sim[row]
    # Divide by the number of posts that actually contributed; engaged posts
    # missing from `posts` add nothing to the sum and must not dilute the mean.
    if engaged_rows:
        collab_scores /= len(engaged_rows)

    # NOTE(review): posts the user has already engaged with are not filtered
    # out, and each is maximally similar to itself in item_sim, so they can
    # reappear among the recommendations. Confirm whether that is intended.

    # Final hybrid score
    final_score = (
        w_content * content_scores
        + w_collab * collab_scores
        + w_pop * np.asarray(pop_scores)
    )

    # Top-k posts, best first
    top_idx = np.argsort(final_score)[::-1][:top_k]
    rec_posts = posts.iloc[top_idx][['post_id', 'text']]
    rec_scores = final_score[top_idx]
    return rec_posts, rec_scores


In [12]:
# Hybrid ranking for U1 (row 0) with the default weights:
# 0.6 content, 0.3 collaborative, 0.1 popularity.
rec_posts, rec_scores = recommend_hybrid(0, users, posts, posts_tfidf, users_tfidf, item_sim, pop_scores)
print("Hybrid Top-3 Recommendations for U1:")
print(rec_posts)
print("Scores:", rec_scores)


Hybrid Top-3 Recommendations for U1:
   post_id                text
77     P78   sports  art video
21     P22   sports  art audio
0       P1  sports  food video
Scores: [0.38818993 0.38601534 0.23754012]


In [13]:
# Collect the hybrid recommendations for every user and export them to CSV.
results = []
for idx in range(len(users)):
    rec_posts, rec_scores = recommend_hybrid(idx, users, posts, posts_tfidf, users_tfidf, item_sim, pop_scores)
    results.append({
        'user_id': users.iloc[idx]['user_id'],
        'recommended_posts': rec_posts['post_id'].tolist(),
        # .tolist() yields plain Python floats. list(rec_scores) would keep
        # NumPy scalars, which NumPy >= 2 renders as "np.float64(...)" when
        # the list is stringified into the CSV cell.
        'scores': np.asarray(rec_scores).tolist()
    })

final_df = pd.DataFrame(results)
final_df.to_csv("top3_recommendations.csv", index=False)
print(final_df.head())


  user_id recommended_posts                                             scores
0      U1    [P78, P22, P1]  [0.38818992927601115, 0.38601533602027516, 0.2...
1      U2    [P5, P80, P46]  [0.42718843593319855, 0.35999032290531136, 0.2...
2      U3    [P42, P3, P39]  [0.5194141549434416, 0.35162111295680704, 0.23...
3      U4   [P37, P16, P53]  [0.2841452453718329, 0.23519409610758035, 0.23...
4      U5    [P7, P26, P69]  [0.4611984689453777, 0.41811938804383586, 0.37...


In [20]:
# import matplotlib.pyplot as plt
# import networkx as nx

# # Define graph
# G = nx.DiGraph()

# # Nodes
# nodes = [
#     "Users.csv", "Posts.csv", "Engagements.csv",
#     "Preprocessing & Cleaning",
#     "TF-IDF Vectorization (Content-based)",
#     "Engagement Matrix (Collaborative)",
#     "Popularity Score Calculation",
#     "Weighted Hybrid Scoring",
#     "Top-3 Recommendations"
# ]
# G.add_nodes_from(nodes)

# # Edges
# edges = [
#     ("Users.csv", "Preprocessing & Cleaning"),
#     ("Posts.csv", "Preprocessing & Cleaning"),
#     ("Engagements.csv", "Preprocessing & Cleaning"),
#     ("Preprocessing & Cleaning", "TF-IDF Vectorization (Content-based)"),
#     ("Preprocessing & Cleaning", "Engagement Matrix (Collaborative)"),
#     ("Preprocessing & Cleaning", "Popularity Score Calculation"),
#     ("TF-IDF Vectorization (Content-based)", "Weighted Hybrid Scoring"),
#     ("Engagement Matrix (Collaborative)", "Weighted Hybrid Scoring"),
#     ("Popularity Score Calculation", "Weighted Hybrid Scoring"),
#     ("Weighted Hybrid Scoring", "Top-3 Recommendations")
# ]
# G.add_edges_from(edges)

# # Draw
# plt.figure(figsize=(14,8))
# pos = nx.spring_layout(G, seed=42)

# nx.draw_networkx_nodes(G, pos, node_size=8000, node_color="lightblue", edgecolors="black")
# nx.draw_networkx_edges(G, pos, arrowstyle="->", arrowsize=20, edge_color="gray")
# nx.draw_networkx_labels(G, pos, font_size=6, font_weight="bold")

# plt.title("Hybrid Recommendation System Architecture", fontsize=14, fontweight="bold")
# plt.axis("off")
# plt.show()
