In [3]:
# lib import
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.preprocessing import MinMaxScaler
import scipy.sparse as sp


In [5]:
# Data load and explore
# Directory that holds the assessment CSVs. "/content" is where the original
# notebook read them from; change this one constant to run elsewhere.
DATA_DIR = "/content"

# Raw user table and raw post table, one row per user / post.
users = pd.read_csv(f"{DATA_DIR}/Assessment data - users.csv")
posts = pd.read_csv(f"{DATA_DIR}/Assessment data - posts.csv")

In [6]:
# Replace missing values in the text-like columns with empty strings so the
# later string handling (encoding, splitting on ',') never receives NaN.
users['interested_in'] = users['interested_in'].fillna('')
for text_col in ('content', 'topics', 'like_user_ids'):
    posts[text_col] = posts[text_col].fillna('')

In [7]:
# Text that gets embedded for each side: a user's stated interests and a
# post's content.
# NOTE(review): `topics` is filled above but is not part of `post_text` in the
# visible cells - confirm whether it was meant to be included.
users['user_text'] = users['interested_in'].copy()
posts['post_text'] = posts['content'].copy()

In [8]:
# embeddings
# One shared encoder for both sides so users and posts live in the same space.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Same encode() options for both calls: tensor outputs, normalised embeddings.
encode_options = dict(convert_to_tensor=True, normalize_embeddings=True)

post_sentences = posts['post_text'].tolist()
user_sentences = users['user_text'].tolist()
post_embeddings = model.encode(post_sentences, **encode_options)
user_embeddings = model.encode(user_sentences, **encode_options)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
import torch  # no longer needed by this cell; kept so `torch` stays in scope for other cells

# The embeddings are already torch tensors (encode(..., convert_to_tensor=True)),
# so passing them through torch.tensor() again only triggers PyTorch's
# copy-construct UserWarning. detach().clone() is the documented equivalent and
# still leaves user_emb_tensor / post_emb_tensor as independent copies.
user_emb_tensor = user_embeddings.detach().clone()
post_emb_tensor = post_embeddings.detach().clone()

# Pairwise cosine similarity: one row per user, one column per post, moved to
# host memory as a NumPy array for the later score arithmetic.
content_sim_matrix = util.cos_sim(user_emb_tensor, post_emb_tensor).cpu().numpy()

  user_emb_tensor = torch.tensor(user_embeddings)
  post_emb_tensor = torch.tensor(post_embeddings)


In [10]:
# Collaborative filtering
# Build COO triples (user row, post column, 1) from each post's comma-separated
# list of liking user ids.
rows, cols, data = [], [], []

# Map every user_id to the position of its first occurrence once, instead of
# scanning the whole user_id column twice for every single like.
user_row_by_id = {}
for user_pos, known_id in enumerate(users['user_id'].tolist()):
    user_row_by_id.setdefault(known_id, user_pos)

# enumerate() gives positional column indices, so the triples stay aligned with
# the rows of `posts` even when its index is not a default 0..n-1 range.
for post_pos, raw_likes in enumerate(posts['like_user_ids'].tolist()):
    if pd.isna(raw_likes):
        continue
    try:
        liked_users = [
            int(token)
            for token in str(raw_likes).split(',')
            if token.strip().isdigit()
        ]
    except ValueError:
        # str.isdigit() accepts a few characters int() rejects (e.g. superscript
        # digits); such a row is skipped entirely, as it was before.
        continue
    for u in liked_users:
        user_pos = user_row_by_id.get(u)
        if user_pos is None:
            # Like from a user that is not present in `users`; ignore it.
            continue
        rows.append(user_pos)
        cols.append(post_pos)
        data.append(1)  # implicit feedback


In [11]:
# Dense user x post interaction matrix assembled from the COO triples.
# Note: SciPy sums duplicate (row, col) entries when converting, so a user id
# listed twice on the same post yields 2 rather than 1.
interaction_shape = (len(users), len(posts))
n_users, n_posts = interaction_shape
sparse_interactions = sp.coo_matrix((data, (rows, cols)), shape=interaction_shape)
user_item_matrix = sparse_interactions.toarray()


In [13]:
from sklearn.decomposition import TruncatedSVD

# Low-rank factorisation of the interaction matrix: 20 latent components,
# fixed seed so repeated runs give the same factors.
svd_settings = {'n_components': 20, 'n_iter': 10, 'random_state': 42}
svd = TruncatedSVD(**svd_settings)

# fit_transform() yields the per-user factors; the fitted components_ hold the
# per-post loadings, transposed here to one row per post.
user_factors = svd.fit_transform(user_item_matrix)
post_factors = np.transpose(svd.components_)

In [14]:
# cf score
# Reconstructed user x post affinity: inner product of each user's latent
# factors with each post's latent factors.
cf_scores = user_factors @ post_factors.T

In [25]:
# Min-max scale the raw engagement counts into [0, 1]; missing counts are
# treated as 0. The scaler is refit for each column, shares first, then likes.
scaler = MinMaxScaler()
normalised_name = {'shares': 'shares_norm', 'likes': 'likes_norm'}
for raw_col, norm_col in normalised_name.items():
    posts[norm_col] = scaler.fit_transform(posts[[raw_col]].fillna(0))


In [26]:
# popular scores
# Weighted blend of the normalised like and share counts.
# NOTE(review): the weights sum to 0.7, so the score tops out at 0.7 rather
# than 1.0 - confirm this is intended.
w_likes = 0.4
w_shares = 0.3
weighted_likes = w_likes * posts['likes_norm']
weighted_shares = w_shares * posts['shares_norm']
posts['popularity_score'] = weighted_likes + weighted_shares

In [27]:
# hybrid scoring
# Final user x post score = 50% content similarity + 30% collaborative score
# + 20% popularity. Popularity is a 1-D per-post vector, so NumPy broadcasts it
# across every user row.
content_term = 0.5 * content_sim_matrix
collaborative_term = 0.3 * cf_scores
popularity_term = 0.2 * posts['popularity_score'].values
hybrid_scores = content_term + collaborative_term + popularity_term

In [32]:
# Number of posts recommended to each user.
top_k = 10

# For every user (row of hybrid_scores) rank the posts by descending score,
# keep the best `top_k`, and store their post ids as one comma-joined string.
recommendations = []
for user_pos, uid in enumerate(users['user_id']):
    ranked_positions = np.argsort(hybrid_scores[user_pos])[::-1]
    best_positions = ranked_positions[:top_k]
    best_post_ids = posts['post_id'].iloc[best_positions].tolist()
    recommendations.append({
        'user_id': uid,
        'recommended_post_ids': ','.join(str(post_id) for post_id in best_post_ids),
    })

recommendation_df = pd.DataFrame(recommendations)

# quoting=1 is csv.QUOTE_ALL: every field is quoted, which keeps the
# comma-joined id list inside a single CSV cell.
recommendation_df.to_csv("boom_recommendations2.csv", index=False, quoting=1)
print(recommendation_df.head())

   user_id                               recommended_post_ids
0        2  4257,3030,4858,4217,2027,2501,1879,2727,1920,3733
1        3  4257,3922,2617,2501,2056,2027,4310,1879,3967,2727
2        4  4257,2501,3922,2027,1879,4217,2617,4858,2056,4310
3        5  4257,3030,2727,2027,2501,1879,3733,2056,2617,4217
4        6  3030,3922,2027,1879,2056,4217,2727,1583,4310,3733
