In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD, KNNBasic
from surprise.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRanker
from sklearn.metrics import ndcg_score
import pickle
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Load datasets
user_df = pd.read_csv("user_interaction.csv")
meta_df = pd.read_csv("metadata.csv")

In [3]:
user_df['updated_at'] =pd.to_datetime(user_df['updated_at'])
meta_df['updated_at'] = pd.to_datetime(meta_df['updated_at'])
meta_df['published_at'] = pd.to_datetime(meta_df['published_at'])

In [4]:
# Keep only rows where read_percent is in the valid range (0-100)
user_df = user_df[user_df['read_percent'] <= 100]

## Train-Test split

In [5]:
# Time-based Train-Test Split
user_df = user_df.sort_values(by="updated_at")
split = int(len(user_df) * 0.75)
train_interactions = user_df.iloc[ : split]
test_interactions = user_df.iloc[split : ]

## Feature Engineering

In [6]:
meta_df["num_users_read"] = user_df.groupby("pratilipi_id")["user_id"].transform("count").fillna(0)
meta_df["recency"] = (meta_df["updated_at"] - meta_df["published_at"]).dt.days
meta_df["category_encoded"] = meta_df["category_name"].astype("category").cat.codes
meta_df["author_encoded"] = meta_df["author_id"].astype("category").cat.codes
meta_df["author_popularity"] = meta_df.groupby("author_id")["pratilipi_id"].transform("count").fillna(0)
meta_df.head()

Unnamed: 0,author_id,pratilipi_id,category_name,reading_time,updated_at,published_at,num_users_read,recency,category_encoded,author_encoded,author_popularity
0,-3418949279741297,1025741862639304,translation,0,2020-08-19 15:26:13,2016-09-30 10:37:04,1943.0,1419.0,42,2288,2
1,-2270332351871840,1377786215601277,translation,171,2021-01-21 16:27:07,2018-06-11 13:17:48,267.0,955.0,42,7727,3
2,-2270332352037261,1377786215601962,translation,92,2020-09-29 12:33:57,2018-06-12 04:19:12,9.0,840.0,42,7317,5
3,-2270332352521845,1377786215640994,translation,0,2019-10-17 09:03:37,2019-09-26 14:58:53,5.0,20.0,42,5575,14
4,-2270332349665658,1377786215931338,translation,47,2020-05-05 11:33:41,2018-11-25 12:28:23,109.0,526.0,42,13272,22


In [7]:
# Normalize Features
scaler = MinMaxScaler()
meta_df[["num_users_read", "recency", "author_popularity"]] = scaler.fit_transform(meta_df[["num_users_read", "recency", "author_popularity"]])
meta_df.head()

Unnamed: 0,author_id,pratilipi_id,category_name,reading_time,updated_at,published_at,num_users_read,recency,category_encoded,author_encoded,author_popularity
0,-3418949279741297,1025741862639304,translation,0,2020-08-19 15:26:13,2016-09-30 10:37:04,0.564863,0.551242,42,2288,0.000115
1,-2270332351871840,1377786215601277,translation,171,2021-01-21 16:27:07,2018-06-11 13:17:48,0.077371,0.371118,42,7727,0.000229
2,-2270332352037261,1377786215601962,translation,92,2020-09-29 12:33:57,2018-06-12 04:19:12,0.002327,0.326475,42,7317,0.000459
3,-2270332352521845,1377786215640994,translation,0,2019-10-17 09:03:37,2019-09-26 14:58:53,0.001163,0.008152,42,5575,0.00149
4,-2270332349665658,1377786215931338,translation,47,2020-05-05 11:33:41,2018-11-25 12:28:23,0.031414,0.204581,42,13272,0.002407


# Evaluation Metrices

In [8]:
def precision_at_k(recommendations, ground_truth, k=5):
    precision = 0
    if recommendations:
        precision = len(set(recommendations[:k]) & set(ground_truth)) / k
    return precision

In [9]:
def recall_at_k(recommendations, ground_truth, k=5):
    recall = 0
    if ground_truth:
        recall = len(set(recommendations[:k]) & set(ground_truth)) / len(ground_truth)
    return recall

In [10]:
def ndcg_at_k(recommendations, ground_truth, k=5):
    if not ground_truth:
        return 0

    # relevance score list for the ground truth
    relevance = [1 if item in ground_truth else 0 for item in recommendations[:k]]
    
    # Ensure ground_truth is a list of relevance scores
    true_relevance = [1 if item in ground_truth else 0 for item in recommendations]

    if not relevance or not true_relevance:
        return 0

    return ndcg_score([true_relevance], [relevance], k=k)

In [11]:
# Evaluates how early first relevant item appears
def mrr(recommendations, ground_truth):
    for rank, item in enumerate(recommendations, start=1):
        if item in ground_truth:
            return 1 / rank
    return 0

# Popularity based Model (mostly for new users)

In [12]:
def recommend_popular(user_id, n=5):
    # find all pratilipis
    user_categories = train_interactions[train_interactions['user_id'] == user_id]['pratilipi_id']
    
    if user_categories.empty:
        return meta_df.sort_values(by=['num_users_read', 'author_popularity'], ascending=False)['pratilipi_id'].head(n).tolist()
        
    # find top 3 categories that user read
    top_categories = meta_df[meta_df['pratilipi_id'].isin(user_categories)]["category_name"].value_counts().index[:3]
    return meta_df[meta_df["category_name"].isin(top_categories)].sort_values(by=['num_users_read', 'author_popularity'], ascending=False)['pratilipi_id'].head(n).tolist()

## Collaborative filtering (SVD)

In [13]:
# Define reader for Surprise library
reader = Reader(rating_scale=(0, 100))
data = Dataset.load_from_df(train_interactions[['user_id', 'pratilipi_id', 'read_percent']], reader)
trainset = data.build_full_trainset()

In [14]:
# Train Collaborative Filtering Model
svd_model = SVD()
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x248150ee5d0>

In [15]:
# Extract User and Item Embeddings
user_factors = pd.DataFrame(svd_model.pu, index=[trainset.to_raw_uid(i) for i in range(len(svd_model.pu))])
item_factors = pd.DataFrame(svd_model.qi, index=[trainset.to_raw_iid(i) for i in range(len(svd_model.qi))])

In [16]:
all_items = set(meta_df['pratilipi_id'])

In [17]:
def recommend_svd(user_id, n=5):
    
    user_items = set(train_interactions[train_interactions['user_id'] == user_id]['pratilipi_id'])
    items_to_predict = list(all_items - user_items)
    
    predictions = []
    for item_id in items_to_predict:
        try:
            pred = svd_model.predict(str(user_id), str(item_id)).est
            predictions.append((item_id, pred))
        except:
            continue
    
    # top n predictions
    predictions.sort(key=lambda x: x[1], reverse=True)
    return [x[0] for x in predictions[:n]]

In [18]:
# SVD-Based Collaborative Filtering
def recommend_svd(user_id, n=5):
    predictions = [svd_model.predict(user_id, pid) for pid in meta_df['pratilipi_id']]
    top_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)[:n]
    return [pred.iid for pred in top_predictions]

## Content based Filtering (Metadata)

In [19]:
#feature_matrix = meta_df[["category_encoded", "author_encoded", "num_users_read", "recency", "author_popularity"]]
feature_matrix = meta_df[["category_encoded", "author_encoded", "num_users_read", "author_popularity"]]
feature_matrix = feature_matrix.fillna(0)

# Nearest Neighbors with cosine distance
knn_model = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=10)
knn_model.fit(feature_matrix)

In [20]:
def recommend_content(user_id, n=5):
        # pratilipis the user has interacted with
    user_interactions = train_interactions[train_interactions['user_id'] == user_id]['pratilipi_id'].tolist()
    
    # cold-start users (no prior interactions)
    if not user_interactions:
        return meta_df["pratilipi_id"].sample(n).tolist()
    
    recommended_pratilipis = set()
    
    for pratilipi_id in user_interactions:
        
        if pratilipi_id not in meta_df["pratilipi_id"].values:
            continue
        idx = meta_df.index[meta_df["pratilipi_id"] == pratilipi_id].tolist()[0]
        
        # top similar pratilipis using KNN
        distances, indices = knn_model.kneighbors([feature_matrix.iloc[idx]], n_neighbors=n+1)
        
        for i in indices[0][1:]:
            recommended_pratilipis.add(meta_df.iloc[i]["pratilipi_id"])
    
    # top-N unique recommendations
    return list(recommended_pratilipis)[:n]

## Hybrid Model (SVD + XGBoost)

In [21]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 5, 10]
}

In [None]:
# merge meta_data with item_factors(svd)
pratilipi_features = meta_df.merge(item_factors, left_on="pratilipi_id", right_index=True, how="left").fillna(0)

train_interactions['pratilipi_id_encoded'] = LabelEncoder().fit_transform(train_interactions['pratilipi_id'])
X_train = train_interactions.merge(pratilipi_features, on='pratilipi_id', how='inner')

columns_to_drop = ['pratilipi_id', 'updated_at_x', 'published_at','updated_at_y', 'category_name', 'author_id']
X_train = X_train.drop(columns=[col for col in columns_to_drop if col in X_train.columns])

# categorical columns to 'category' type
X_train['category_encoded'] = X_train['category_encoded'].astype('category')
X_train['author_encoded'] = X_train['author_encoded'].astype('category')

y_train = X_train['read_percent']
train_group = X_train.groupby("user_id").size().values

# Initialize and fit XGBRanker
xgb_ranker = XGBRanker(
    objective="rank:pairwise",tree_method='gpu_hist', enable_categorical=True,
    n_estimators = 100, max_depth = 5, learning_rate = 0.1, gamma = 0.1
)

# grid_search = GridSearchCV(estimator=xgb_ranker, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, verbose=1)
# grid_search.fit(X_train.drop(columns=['read_percent']), y_train, group=train_group)

xgb_ranker.fit(X_train.drop(columns=['read_percent']), y_train, group=train_group)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_interactions['pratilipi_id_encoded'] = LabelEncoder().fit_transform(train_interactions['pratilipi_id'])


In [None]:
def recommend_hybrid(user_id, n=5):
    # recommendations from SVD
    svd_recs = recommend_svd(user_id, n * 2)  # get more recommendations to rank later
    
    # Filter out recommendations that are not in meta_df
    valid_recs = [rec for rec in svd_recs if rec in meta_df['pratilipi_id'].values]
    
    if not valid_recs:
        return recommend_popular(user_id, n)
    
    # Select features for valid recommendations
    X_input = pratilipi_features.loc[pratilipi_features['pratilipi_id'].isin(valid_recs)].copy()
    
    X_input['user_id'] = user_id 
    
    # Encode pratilipi_id for the current recommendations
    X_input['pratilipi_id_encoded'] = LabelEncoder().fit_transform(X_input['pratilipi_id'])
    
    required_features = ['user_id', 'pratilipi_id_encoded', 'reading_time', 
                         'num_users_read', 'recency', 'category_encoded', 
                         'author_encoded', 'author_popularity'] + list(item_factors.columns)
    
    X_input = X_input[required_features]
    ranking_scores = xgb_ranker.predict(X_input)
    
    #ranking_scores = best_xgb_ranker.predict(X_input)
    
    ranked_recs = [x for _, x in sorted(zip(ranking_scores, valid_recs), reverse=True)]
    
    return ranked_recs[:n]

## Model Evaluation

In [None]:
def evaluate_model(model_func, test_users, k=5):
    precision_scores = []
    recall_scores = []
    ndcg_scores = []
    mrr_scores = []
    
    for user in test_users:
        ground_truth = test_interactions[test_interactions['user_id'] == user]['pratilipi_id'].tolist()
        recommendations = model_func(user, n=k)

        prec = precision_at_k(recommendations, ground_truth, k)
        precision_scores.append(prec)
        rec = recall_at_k(recommendations, ground_truth, k)
        recall_scores.append(rec)
        ndcg = ndcg_at_k(recommendations, ground_truth, k)
        ndcg_scores.append(ndcg)
        mrr_sc = mrr(recommendations, ground_truth)
        mrr_scores.append(mrr_sc)
        
        #print(prec, rec, ndcg, mrr_sc)
    
    return {
        "Precision@K": np.mean(precision_scores),
        "Recall@K": np.mean(recall_scores),
        "NDCG@K": np.mean(ndcg_scores),
        "MRR": np.mean(mrr_scores)
    }

In [None]:
# Evaluate All Models
test_users = test_interactions['user_id'].unique()[:100]

In [32]:
print(f"Popularity Model: {evaluate_model(recommend_popular, test_users, 5)}")

Popularity Model: {'Precision@K': 0.0, 'Recall@K': 0.0, 'NDCG@K': 0.0, 'MRR': 0.0}


In [33]:
print(f"Content-Based Model: {evaluate_model(recommend_content, test_users, 5)}")

Content-Based Model: {'Precision@K': 0.0, 'Recall@K': 0.0, 'NDCG@K': 0.0, 'MRR': 0.0}


In [None]:
print(f"Collaborative Model (SVD): {evaluate_model(recommend_svd, test_users, 5)}")

In [None]:
print(f"Hybrid Model (SVD + XGBoost): {evaluate_model(recommend_hybrid, test_users, 5)}")

## Example Usage

In [None]:
def recommend_for_user(user_id, model_func, n=5):
    array = model_func(user_id, n)
    recommended_df = meta_df[meta_df['pratilipi_id'].isin(array)].copy()
    return recommended_df  

In [None]:
user_id = 5506791979266119
model_arr = [recommend_popular, recommend_content, recommend_svd, recommend_hybrid]
recommendations = {}

for model in model_arr:
    recommendations[model.__name__] = recommend_for_user(user_id, model, 5) 

# ground_truth = test_interactions[test_interactions['user_id'] == user_id]['pratilipi_id'].tolist()
# print("Ground Truth:", ground_truth)

In [None]:
recommendations["recommend_popular"].head(5)

In [None]:
recommendations["recommend_content"].head(5)

In [None]:
recommendations["recommend_svd"].head(5)

In [None]:
recommendations["recommend_hybrid"].head(5)