In [None]:
#Import#
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# ========================
# Data Loading and Preprocessing
# ========================
def load_and_preprocess_data():
    """
    Loads user interaction and metadata CSV files, merges them, and preprocesses the data.

    Reads 'User_interaction.csv' and 'Metadata.csv' from the current working
    directory and inner-joins them on 'pratilipi_id'. Prints the column names,
    a few diagnostics and the head of the merged frame along the way.

    Returns:
        merged_data (DataFrame): Combined dataset with all columns plus
            'read_binary' (1 when read_percent > 50, otherwise 0)
        meta_data (DataFrame): Metadata with an integer 'read_count' column
            (number of merged rows per story, 0 for stories with none)
        Both values are None when either CSV file is missing.
    """
    try:
        user_interaction = pd.read_csv('User_interaction.csv')
        meta_data = pd.read_csv('Metadata.csv')
    except FileNotFoundError as e:
        print("File not found:", e)
        return None, None

    print("User Interaction Columns:", user_interaction.columns)
    print("Meta Data Columns:", meta_data.columns)

    # Merge datasets
    merged_data = pd.merge(user_interaction, meta_data, on='pratilipi_id', how='inner')

    print("\n🔎 Unique Users in Merged Data:", merged_data['user_id'].nunique())
    print("🔎 Is User 5506791961876448 Present?", 5506791961876448 in merged_data['user_id'].values)

    # Convert 'read_percent' to binary (Read >50% is 1, otherwise 0)
    merged_data['read_binary'] = (merged_data['read_percent'] > 50).astype(int)

    # Compute `read_count` per story
    read_counts = merged_data.groupby('pratilipi_id')['user_id'].count().reset_index()
    read_counts.rename(columns={'user_id': 'read_count'}, inplace=True)
    meta_data = meta_data.merge(read_counts, on='pratilipi_id', how='left')
    # Only the count is defaulted to 0. A frame-wide fillna(0) would also
    # overwrite genuinely missing metadata (e.g. category_name) with 0.
    meta_data['read_count'] = meta_data['read_count'].fillna(0).astype(int)

    print("\n📊 First 5 Entries of Merged Dataset:")
    print(merged_data.head())

    return merged_data, meta_data

# ========================
# Content-Based Filtering
# ========================
def content_based_filtering(meta_data):
    """
    Fits a TF-IDF vectorizer on the story category names and a cosine
    Nearest Neighbors model on the resulting TF-IDF matrix.

    Args:
        meta_data (DataFrame): Must contain a 'category_name' column; its
            values are cast to str before vectorizing.
    Returns:
        nn_model (NearestNeighbors): Trained Nearest Neighbors model
        vectorizer (TfidfVectorizer): Fitted TF-IDF vectorizer
    """
    category_text = meta_data['category_name'].astype(str)

    vectorizer = TfidfVectorizer()
    tfidf_features = vectorizer.fit_transform(category_text)

    # Brute-force cosine search over the TF-IDF rows, using all CPU cores.
    nn_model = NearestNeighbors(
        n_neighbors=10,
        algorithm="brute",
        metric="cosine",
        n_jobs=-1,
    )
    nn_model.fit(tfidf_features)

    print("✅ Story Similarity Computed Using Nearest Neighbors!")
    return nn_model, vectorizer

# ========================
# Collaborative Filtering
# ========================
def collaborative_filtering(merged_data):
    """
    Builds a binary user-item matrix and fits a cosine Nearest Neighbors model on its rows.

    Side effect: adds 'user_index' and 'pratilipi_index' columns to
    merged_data in place.

    Args:
        merged_data (DataFrame): Must contain 'user_id', 'pratilipi_id' and
            'read_binary' columns.
    Returns:
        nn_model (NearestNeighbors): User similarity model
        user_item_matrix (csr_matrix): Sparse matrix representing user interactions
            (rows are users, columns are stories, values are read_binary)
        user_id_map (dict): Mapping of user IDs to matrix indices
        pratilipi_id_map (dict): Mapping of pratilipi IDs to matrix indices
    """
    # Mapping user and story IDs to contiguous row/column indices
    unique_users = merged_data['user_id'].unique()
    unique_pratilipi = merged_data['pratilipi_id'].unique()

    user_id_map = {uid: idx for idx, uid in enumerate(unique_users)}
    pratilipi_id_map = {pid: idx for idx, pid in enumerate(unique_pratilipi)}

    merged_data['user_index'] = merged_data['user_id'].map(user_id_map)
    merged_data['pratilipi_index'] = merged_data['pratilipi_id'].map(pratilipi_id_map)

    # csr_matrix sums duplicate (row, col) entries, so repeated rows for the
    # same user/story pair would yield values above 1. Collapse each pair to
    # its maximum read_binary first so the matrix stays binary.
    cells = (
        merged_data
        .groupby(['user_index', 'pratilipi_index'], sort=False)['read_binary']
        .max()
        .reset_index()
    )

    # Create user-item matrix
    user_item_matrix = csr_matrix(
        (
            cells['read_binary'].values,
            (cells['user_index'].values, cells['pratilipi_index'].values),
        ),
        shape=(len(user_id_map), len(pratilipi_id_map)),
    )

    nn_model = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=10, n_jobs=-1)
    nn_model.fit(user_item_matrix)

    print(" User Similarity Computed Using Nearest Neighbors!")
    return nn_model, user_item_matrix, user_id_map, pratilipi_id_map

# ========================
# Recommendation Function
# ========================
def generate_recommendations(user_id, user_nn_model, user_item_matrix, story_nn_model, vectorizer, meta_data, user_id_map, pratilipi_id_map, top_n=5):
    """
    Generates recommendations from the reading history of the most similar users.

    Stories are scored by summing the user-item rows of the nearest
    neighbours. Stories the user already has a positive entry for, and
    stories none of the neighbours have a positive entry for, are skipped,
    so fewer than top_n results may be returned.

    Args:
        user_id: ID of the user to recommend for.
        user_nn_model: Fitted model exposing kneighbors(X, n_neighbors=...).
        user_item_matrix: Sparse user-item matrix (rows are users).
        story_nn_model: Accepted for interface compatibility; not used here.
        vectorizer: Accepted for interface compatibility; not used here.
        meta_data (DataFrame): Must contain 'pratilipi_id' and 'category_name'.
        user_id_map (dict): Mapping of user IDs to matrix row indices.
        pratilipi_id_map (dict): Mapping of pratilipi IDs to matrix column indices.
        top_n (int): Maximum number of recommendations to return.
    Returns:
        recommendations (list of tuples): List of (Pratilipi ID, Category);
            category is "Unknown" when the story is absent from meta_data.
            Empty when the user is unknown or has no other neighbours.
    """
    if user_id not in user_id_map:
        print(f" User {user_id} not found in dataset. Returning no recommendations.")
        return []

    user_index = user_id_map[user_id]
    user_vector = user_item_matrix[user_index].reshape(1, -1)

    # kneighbors rejects n_neighbors larger than the number of fitted rows.
    n_neighbors = min(top_n + 1, user_item_matrix.shape[0])
    _, neighbor_indices = user_nn_model.kneighbors(user_vector, n_neighbors=n_neighbors)

    # Drop the query user by index, not by position: with tied distances the
    # user is not guaranteed to come back first.
    neighbors = np.asarray(neighbor_indices).ravel()
    neighbors = neighbors[neighbors != user_index][:top_n]
    if neighbors.size == 0:
        return []

    # np.asarray(...).ravel() copes with both np.matrix and ndarray results.
    scores = np.asarray(user_item_matrix[neighbors].sum(axis=0)).ravel()
    already_read = user_vector.toarray().ravel() > 0
    eligible = (scores > 0) & ~already_read

    # Stable sort keeps tied stories in column order, so output is deterministic.
    ranked = np.argsort(-scores, kind="stable")
    cf_recommendations = ranked[eligible[ranked]][:top_n]

    reverse_pratilipi_id_map = {v: k for k, v in pratilipi_id_map.items()}
    recommended_ids = [reverse_pratilipi_id_map.get(i, -1) for i in cf_recommendations]

    # Build the category lookup once; the first metadata row wins for a
    # story that appears more than once.
    known = meta_data[meta_data['pratilipi_id'].isin(recommended_ids)]
    known = known.drop_duplicates(subset='pratilipi_id')
    category_by_id = dict(zip(known['pratilipi_id'], known['category_name']))

    recommendations = [(pid, category_by_id.get(pid, "Unknown")) for pid in recommended_ids]

    return recommendations


def main():
    """
    Runs the pipeline end to end: loads the data, fits the collaborative and
    content-based models, then prints recommendations for a fixed set of
    test users. Exits early with a message when data loading fails.
    """
    interactions, stories = load_and_preprocess_data()
    if interactions is None:
        print("Error: Data loading failed. Exiting.")
        return

    cf_artifacts = collaborative_filtering(interactions)
    user_model, ui_matrix, user_lookup, story_lookup = cf_artifacts
    story_model, tfidf = content_based_filtering(stories)

    for uid in (5506791961876448, 5506791994361621, 5506791978844819):
        picks = generate_recommendations(
            uid,
            user_model,
            ui_matrix,
            story_model,
            tfidf,
            stories,
            user_lookup,
            story_lookup,
            top_n=5,
        )
        print(f"Top 5 recommendations for user {uid}: {picks}")


if __name__ == "__main__":
    main()


User Interaction Columns: Index(['user_id', 'pratilipi_id', 'read_percent', 'updated_at'], dtype='object')
Meta Data Columns: Index(['author_id', 'pratilipi_id', 'category_name', 'reading_time',
       'updated_at', 'published_at'],
      dtype='object')

🔎 Unique Users in Merged Data: 211141
🔎 Is User 5506791961876448 Present? True

📊 First 5 Entries of Merged Dataset:
            user_id      pratilipi_id  read_percent             updated_at_x  \
0  5506791961876448  1377786228262109         100.0  2022-03-22 10:29:57.291   
1  5506791961876448  1377786228262109         100.0  2022-03-22 10:29:57.291   
2  5506791961876448  1377786228262109         100.0  2022-03-22 10:29:57.291   
3  5506791966456696  1377786228262109         100.0  2022-03-21 07:11:52.070   
4  5506791966456696  1377786228262109         100.0  2022-03-21 07:11:52.070   

          author_id category_name  reading_time         updated_at_y  \
0 -2270332349684758        novels           376  2022-03-15 18:39:52   
1 