In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score
import numpy as np

# Load the DataFrame
file_name = 'articles_small.parquet'
df = pd.read_parquet(file_name)
history_df = pd.read_parquet('history_small_train.parquet')
behaviors_df = pd.read_parquet('behaviors_small_train.parquet')
history_validation_df = pd.read_parquet('history_small_validation.parquet')

# Ensure required columns
required_columns = ['article_id', 'published_time', 'total_inviews', 'total_pageviews', 'total_read_time', 'premium', 'sentiment_label']
if not all(col in df.columns for col in required_columns):
    raise KeyError(f"Required columns {required_columns} not found in the dataset.")

# Convert and filter published time
df['published_time'] = pd.to_datetime(df['published_time'])
df = df[df['published_time'] > '2023-02-23']

# Drop rows with any null values
df = df.dropna(subset=['category_str', 'total_inviews', 'total_read_time', 'sentiment_label', 'topics'])

# Encode category_str using OneHotEncoder
category_encoder = OneHotEncoder()
category_encoded = category_encoder.fit_transform(df[['category_str']]).toarray()

# Normalize total_inviews and total_read_time
scaler = MinMaxScaler()
numerical_data = scaler.fit_transform(df[['total_inviews', 'total_read_time']])

# Encode sentiment_label
sentiment_mapping = {'Negative': -1, 'Neutral': 0, 'Positive': 1}
df['sentiment_label'] = df['sentiment_label'].map(sentiment_mapping)

# Vectorize topics using TfidfVectorizer
topics = df['topics'].apply(lambda x: ' '.join(x))
vectorizer = TfidfVectorizer()
topics_encoded = vectorizer.fit_transform(topics).toarray()

# Combine all features into a single feature set
combined_features = np.hstack((category_encoded, numerical_data, df[['sentiment_label']].values.reshape(-1, 1), topics_encoded))

# Compute cosine similarity for articles
article_cosine_sim_matrix = cosine_similarity(combined_features)

# Filter articles for recommendations
filtered_df = df[['article_id', 'total_pageviews', 'sentiment_label']].copy()
filtered_df.loc[:, 'normalized_read_time'] = filtered_df['total_pageviews'] / filtered_df['total_pageviews'].max()
filtered_df.loc[:, 'sentiment_label_value'] = filtered_df['sentiment_label'].map({1: 1, 0: 0.5, -1: 0})
filtered_df.loc[:, 'combined_score'] = 0.6 * filtered_df['normalized_read_time'] + 0.4 * filtered_df['sentiment_label_value']
filtered_df = filtered_df.sort_values('combined_score', ascending=False)
filtered_df.loc[:, 'normalized_rank'] = filtered_df['combined_score']

# Filter behaviors_basic dataframe based on distinct user_id and non-subscribers
filtered_behaviors_df = behaviors_df.drop_duplicates(subset=['user_id'])
filtered_behaviors_df = filtered_behaviors_df[['user_id', 'is_subscriber', 'device_type']]

# Perform inner join with history_df on user_id
merged_df = pd.merge(history_df, filtered_behaviors_df, on='user_id', how='inner')
merged_df = merged_df.drop(columns=['scroll_percentage_fixed', 'impression_time_fixed', 'read_time_fixed'])
merged_df['is_subscriber'] = merged_df['is_subscriber'].astype(int)

# Create a dictionary with user_id as keys and clicked articles as values
user_articles = merged_df.set_index('user_id')['article_id_fixed'].to_dict()


# Create a combined feature string for each user
user_features = {
    user_id: ' '.join(map(str, articles)) + f" subscriber_{row['is_subscriber']} device_{row['device_type']}"
    for user_id, articles, row in zip(user_articles.keys(), user_articles.values(), merged_df.to_dict('records'))
}

# Create a matrix of users and their clicked articles including additional features
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
user_feature_matrix = vectorizer.fit_transform(user_features.values())

# Calculate cosine similarity between users
user_cosine_sim = cosine_similarity(user_feature_matrix)

# Convert cosine similarity matrix to DataFrame
user_ids = list(user_features.keys())
user_cosine_sim_df = pd.DataFrame(user_cosine_sim, index=user_ids, columns=user_ids)

# Function to get top N neighbors for a user
def get_top_neighbors(user_id, top_n=5):
    if user_id not in user_cosine_sim_df.index:
        return []
    neighbors = user_cosine_sim_df[user_id].nlargest(top_n + 1).iloc[1:].index
    return neighbors

# Function to get article recommendations for a user, incorporating article ranking
def recommend_articles(user_id, top_n=10):
    neighbors = get_top_neighbors(user_id, top_n)
    neighbor_articles = [article for neighbor in neighbors for article in user_articles[neighbor]]
    neighbor_articles_counts = pd.Series(neighbor_articles).value_counts()
    
    # Create a DataFrame for articles and their counts
    article_scores_df = neighbor_articles_counts.reset_index()
    article_scores_df.columns = ['article_id', 'interaction_count']
    
    # Merge with the filtered_df to get the rank of the articles
    article_scores_df = article_scores_df.merge(filtered_df[['article_id', 'normalized_rank']], on='article_id', how='left')
    
    # Calculate a combined score
    article_scores_df['combined_score'] = article_scores_df['interaction_count'] * (1 - article_scores_df['normalized_rank'])
    
    # Sort articles based on the combined score
    article_scores_df = article_scores_df.sort_values('combined_score', ascending=False)
    
    # Get the top N recommended articles
    recommended_articles = article_scores_df['article_id'].head(top_n)
    return recommended_articles.tolist()

# Get recommendations for each user in the merged_df
merged_df['recommended_articles'] = merged_df['user_id'].apply(lambda user_id: recommend_articles(user_id, top_n=15))

# Ensure the history_validation_df only contains user IDs that exist in the merged_df
history_validation_df = history_validation_df[history_validation_df['user_id'].isin(merged_df['user_id'])]

# Function to calculate AUC for a user
def calculate_auc(user_id):
    actual_articles = history_validation_df[history_validation_df['user_id'] == user_id]['article_id_fixed'].values
    if actual_articles.size == 0:
        return 0
    actual_articles = set(actual_articles[0])
    predicted_articles = merged_df[merged_df['user_id'] == user_id]['recommended_articles'].values[0]
    y_true = [1 if article in actual_articles else 0 for article in predicted_articles]
    y_scores = list(range(len(predicted_articles), 0, -1))
    if len(set(y_true)) == 1:  # Avoid cases where all true labels are the same
        return 0
    return roc_auc_score(y_true, y_scores)

# Function to calculate MRR for a user
def calculate_mrr(user_id):
    actual_articles = history_validation_df[history_validation_df['user_id'] == user_id]['article_id_fixed'].values
    if actual_articles.size == 0:
        return 0
    actual_articles = set(actual_articles[0])
    predicted_articles = merged_df[merged_df['user_id'] == user_id]['recommended_articles'].values[0]
    for rank, article in enumerate(predicted_articles, start=1):
        if article in actual_articles:
            return 1 / rank
    return 0

# Function to calculate nDCG@K for a user
def calculate_ndcg(user_id, k=15):
    actual_articles = history_validation_df[history_validation_df['user_id'] == user_id]['article_id_fixed'].values
    if actual_articles.size == 0:
        return 0
    actual_articles = set(actual_articles[0])
    predicted_articles = merged_df[merged_df['user_id'] == user_id]['recommended_articles'].values[0][:k]
    dcg = sum((1 / (i + 1) if article in actual_articles else 0) for i, article in enumerate(predicted_articles))
    idcg = sum(1 / (i + 1) for i in range(min(len(actual_articles), k)))
    return dcg / idcg if idcg > 0 else 0

# Function to tune parameters and calculate metrics
def tune_parameters(merged_df, history_validation_df, user_articles):
    top_neighbors_values = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    top_recommendations_values = [5]
    
    best_auc = 0
    best_mrr = 0
    best_ndcg5 = 0
    best_params = {}

    for top_n in top_neighbors_values:
        for top_k in top_recommendations_values:
            merged_df['recommended_articles'] = merged_df['user_id'].apply(lambda user_id: recommend_articles(user_id, top_n=top_n))
            merged_df['auc'] = merged_df['user_id'].apply(calculate_auc)
            merged_df['mrr'] = merged_df['user_id'].apply(calculate_mrr)
            merged_df['ndcg@5'] = merged_df['user_id'].apply(lambda user_id: calculate_ndcg(user_id, k=top_k))
            
            average_auc = merged_df['auc'].mean()
            average_mrr = merged_df['mrr'].mean()
            average_ndcg5 = merged_df['ndcg@5'].mean()

            if average_auc > best_auc and average_mrr > best_mrr and average_ndcg5 > best_ndcg5:
                best_auc = average_auc
                best_mrr = average_mrr
                best_ndcg5 = average_ndcg5
                best_params = {'top_n': top_n, 'top_k': top_k}
            
            print(f"Top N: {top_n}, Top K: {top_k}, Average AUC: {average_auc}, Average MRR: {average_mrr}, Average nDCG@5: {average_ndcg5}")

    print(f"Best Parameters: {best_params}")
    print(f"Best AUC: {best_auc}, Best MRR: {best_mrr}, Best nDCG@5: {best_ndcg5}")

# Run the tuning function
tune_parameters(merged_df, history_validation_df, user_articles)

# Save the filtered articles dataframe to parquet
filtered_df.to_parquet('filtered_articles_by_total_read_time.parquet')


Top N: 5, Top K: 5, Average AUC: 0.46863237139272274, Average MRR: 0.6089469281736335, Average nDCG@5: 0.4232946577290306
Top N: 10, Top K: 5, Average AUC: 0.5026655592347661, Average MRR: 0.6352693213586035, Average nDCG@5: 0.4631348729810971
Top N: 15, Top K: 5, Average AUC: 0.5054794579046709, Average MRR: 0.6437630221078019, Average nDCG@5: 0.48291304034035015
Top N: 20, Top K: 5, Average AUC: 0.5072723110573589, Average MRR: 0.649763134056631, Average nDCG@5: 0.4953617591480573
Top N: 25, Top K: 5, Average AUC: 0.5104260530546013, Average MRR: 0.6528766014347919, Average nDCG@5: 0.5031011159658589
Top N: 30, Top K: 5, Average AUC: 0.5135811246946943, Average MRR: 0.6543131653173135, Average nDCG@5: 0.5081315050883414
Top N: 35, Top K: 5, Average AUC: 0.5162782660793884, Average MRR: 0.6540869228374682, Average nDCG@5: 0.5108023929886552
Top N: 40, Top K: 5, Average AUC: 0.518431074324796, Average MRR: 0.6558471034594212, Average nDCG@5: 0.5139606058604936
Top N: 45, Top K: 5, Aver