In [42]:
!pip install lightfm



In [43]:
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from datetime import datetime
import pprint

In [44]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [45]:
user_interactions = pd.read_csv('/content/drive/My Drive/user_interaction.csv')
meta_data = pd.read_csv('/content/drive/My Drive/metadata.csv')

In [46]:
user_interactions['updated_at'] = pd.to_datetime(user_interactions['updated_at'])
user_interactions = user_interactions.sort_values(by='updated_at')

In [47]:

null_rows_in_user_interactions = meta_data[meta_data.isnull().any(axis=1)]
print("Null rows in user interactions")
null_rows_in_user_interactions


Null rows in user interactions


Unnamed: 0,author_id,pratilipi_id,category_name,reading_time,updated_at,published_at
239986,-2270332352057132,1377786215619021,mythology,332,2020-01-24 05:35:03,
278038,-4110497246016881,-524224996141368,romance,73,2019-09-05 10:26:20,
297055,-2270332347709578,1377786216959710,romance,89,2019-08-06 10:01:31,
578192,-2270332351858801,1377786215762463,shortstories,55,2019-09-09 10:10:57,
760065,-2270332352057132,1377786215619021,experiences-and-memories,332,2020-01-24 05:35:03,
788161,-2270332351858801,1377786215762463,social,55,2019-09-09 10:10:57,
869075,-2270332352057132,1377786215619021,women,332,2020-01-24 05:35:03,


In [48]:
user_interactions['pratilipi_id'] = user_interactions['pratilipi_id'].astype(str)
meta_data['pratilipi_id'] = meta_data['pratilipi_id'].astype(str)
meta_data['author_id'] = meta_data['author_id'].astype(str)

In [49]:
meta_data['categories'] = meta_data['category_name'].apply(lambda x: [x.strip()])

In [50]:
split_idx = int(len(user_interactions)*0.75)
train_df = user_interactions.iloc[:split_idx]
test_df = user_interactions.iloc[split_idx:]

In [51]:
print(f"Train Size:{len(train_df)}")
print(f"Test Size:{len(test_df)}")

Train Size:1875000
Test Size:625000


In [52]:
valid_pratilipis = meta_data['pratilipi_id'].unique()
train_df = train_df[train_df['pratilipi_id'].isin(valid_pratilipis)]
print("Train Size After Filtering:", len(train_df))

Train Size After Filtering: 1129556


In [53]:

all_categories = []
for categories in meta_data['categories']:
    all_categories.extend(categories)
all_categories = list(set(all_categories))
print(f"Number of unique categories: {len(all_categories)}")

Number of unique categories: 45


In [54]:
# Create Dataset and fit with users, items, and item features
dataset = Dataset() #initializes a new Dataset object. This object will hold the mappings for users, items, and features.

# Use only training data to fit the dataset
train_users = train_df['user_id'].unique()
train_pratilipis = train_df['pratilipi_id'].unique()


In [55]:
# This is the key fix - include item_features in the fit method
dataset.fit(
    users=train_users,
    items=train_pratilipis,
    item_features=all_categories
)

In [56]:
train_interactions = train_df.groupby(['user_id', 'pratilipi_id']).max().reset_index()
train_interactions['weight'] = train_interactions['read_percent']/100.0

In [57]:
# Build interactions matrix
interactions, weights = dataset.build_interactions(
    [(row['user_id'], row['pratilipi_id'], row['weight']) for _, row in train_interactions.iterrows()]
)
print(f"Interactions matrix shape: {interactions.shape}")

Interactions matrix shape: (183494, 124599)


In [58]:
# Filter meta_data to include only pratilipi_id present in train_df
meta_data_filter = meta_data[meta_data['pratilipi_id'].isin(train_df['pratilipi_id'])]
print("Meta Data Size After Filtering:", len(meta_data_filter))

Meta Data Size After Filtering: 308108


In [59]:
# Prepare item features list
item_features_list = [(row['pratilipi_id'], row['categories']) for _, row in meta_data_filter.iterrows()]

# Build item features - this should now work
item_features = dataset.build_item_features(item_features_list)
print(f"Item features shape: {item_features.shape}")

Item features shape: (124599, 124644)


In [60]:
# Train LightFM model
model = LightFM(loss='warp', random_state=42)
model.fit(interactions, item_features=item_features, epochs=30, num_threads=4)

print("Model trained successfully!")

Model trained successfully!


In [61]:
def get_recommendations(model, user_id, item_features=None, train_interactions_df=None, n=5):
    n_items = item_features.shape[0]

    # Get the mappings from dataset
    mappings = dataset.mapping()
    user_id_map = mappings[0]  # The first element is the user mapping

    # Handle cold start users
    if user_id not in user_id_map:
        # Cold start: Recommend based on item features alone (content-based recommendation)
        scores = model.predict(0, np.arange(n_items), item_features=item_features)  # Predict using item features
        top_items = np.argsort(-scores)[:n]  # Get top items based on features
    else:
        # Non-cold start: User has interactions in the training data
        internal_user_id = user_id_map[user_id]

        # Get scores for all items for this user
        scores = model.predict(internal_user_id, np.arange(n_items), item_features=item_features)

        # Sort by score and get top items
        top_items = np.argsort(-scores)[:n]

    # Get the item mapping from the correct position in the tuple
    item_map = mappings[2]  # The third element is the item mapping
    item_map_reverse = {v: k for k, v in item_map.items()}

    # Get items that user has already interacted with (to avoid recommending the same items)
    if train_interactions_df is not None:
        seen_items = set(train_interactions_df[train_interactions_df['user_id'] == user_id]['pratilipi_id'].values)
    else:
        seen_items = set()

    # Final list of top items, filtering out already seen items
    top_items_filtered = []
    for item_idx in top_items:
        if len(top_items_filtered) >= n:
            break
        if item_idx in item_map_reverse:
            item_id = item_map_reverse[item_idx]
            if item_id not in seen_items:
                top_items_filtered.append(item_id)

    return top_items_filtered


In [62]:

# Take a random sample of test data for faster evaluation
sample_size = min(20000, len(test_df))  # Adjust the sample size as needed
test_df_sample = test_df.sample(n=sample_size, random_state=42)

# Process the sampled test data
test_interactions_sample = test_df_sample.groupby(['user_id', 'pratilipi_id']).max().reset_index()
test_interactions_sample['weight'] = test_interactions_sample['read_percent'] / 100.0

# Only include users and items that are in the training set
test_interactions_sample = test_interactions_sample[
    test_interactions_sample['user_id'].isin(train_interactions['user_id']) &
    test_interactions_sample['pratilipi_id'].isin(train_interactions['pratilipi_id'])
]

# Build interaction matrix from the sample
test_interactions_matrix_sample, _ = dataset.build_interactions(
    [(row['user_id'], row['pratilipi_id'], row['weight'])
     for _, row in test_interactions_sample.iterrows()]
)

print(f"Evaluating on {len(test_interactions_sample)} test interactions")
from lightfm.evaluation import precision_at_k, auc_score, recall_at_k

# Calculate metrics on the sample
precision = precision_at_k(model, test_interactions_matrix_sample, item_features=item_features, k=5).mean()
auc = auc_score(model, test_interactions_matrix_sample, item_features=item_features).mean()
recall = recall_at_k(model, test_interactions_matrix_sample, item_features=item_features, k=5).mean()
print(f"Precision@5: {precision:.4f}")
print(f"AUC: {auc:.4f}")
print(f"Recall@5: {recall:.4f}")


Evaluating on 9945 test interactions
Precision@5: 0.0011
AUC: 0.8250
Recall@5: 0.0051
