In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score
import numpy as np

# Load the article catalogue and the train/validation interaction tables.
file_name = 'articles_small.parquet'
articles_df = pd.read_parquet(file_name)
history_df = pd.read_parquet('history_small_train.parquet')
behaviors_df = pd.read_parquet('behaviors_small_train.parquet')
history_validation_df = pd.read_parquet('history_small_validation.parquet')


# Keep articles published from April 27 through June 8, 2023 (inclusive).
# The upper bound is written as "< June 9": comparing against the bare date
# '2023-06-08' only matches values up to the very start of that day, which
# would drop everything published later on June 8.
published_time = articles_df['published_time']
filtered_articles_df = articles_df[
    (published_time >= '2023-04-27') & (published_time < '2023-06-09')
]

print(filtered_articles_df.head())

# NOTE: this drops a row if *any* column is null, not only the four columns
# inspected below.
filtered_articles_df = filtered_articles_df.dropna()

# Sanity check: after the dropna above these counts are expected to be zero.
null_values = filtered_articles_df[['total_pageviews', 'total_inviews', 'sentiment_label', 'total_read_time']].isnull().sum()
print(null_values)

# One row per clicked article; rows whose exploded value is NaN are discarded.
behaviors_df_exploded = (
    behaviors_df
    .explode('article_ids_clicked')
    .dropna(subset=['article_ids_clicked'])
)

# Inner join: keep only clicks on articles that survived the article filter.
merged_df = filtered_articles_df.merge(
    behaviors_df_exploded,
    how='inner',
    left_on='article_id',
    right_on='article_ids_clicked',
)

# Columns carried forward, with user_id first.
columns_to_keep = [
    'user_id', 'premium', 'category_str', 'read_time', 'scroll_percentage',
    'device_type', 'article_ids_clicked', 'is_subscriber', 'next_read_time'
]

# Restrict the merged frame to those columns, in that order.
filtered_merged_df = merged_df.loc[:, columns_to_keep]

# Per-user aggregation: device_type and is_subscriber take the group's first
# value, every other column is collected into a list.
first_value_columns = ('device_type', 'is_subscriber')
aggregation_spec = {
    column: ('first' if column in first_value_columns else list)
    for column in columns_to_keep
    if column != 'user_id'
}

# Aggregate, restore user_id as a regular column, and drop users with nulls.
grouped_df = (
    filtered_merged_df
    .groupby('user_id')
    .agg(aggregation_spec)
    .reset_index()
    .dropna()
)

print(grouped_df.head())

# Create a dictionary with user_id as keys and clicked articles as values
user_articles = merged_df.groupby('user_id')['article_ids_clicked'].apply(list).to_dict()

# Create a combined feature string for each user.
# Every value is read from the same grouped_df row, so a user's article list
# can never be paired with another user's subscriber/device values. Zipping
# user_articles against grouped_df would shift the pairing as soon as the
# earlier dropna removed any user from grouped_df.
user_features = {
    row['user_id']: ' '.join(map(str, row['article_ids_clicked']))
    + f" subscriber_{row['is_subscriber']} device_{row['device_type']}"
    for row in grouped_df.to_dict('records')
}

# Create a matrix of users and their clicked articles including additional features.
# The token pattern also keeps one-character tokens.
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
user_feature_matrix = vectorizer.fit_transform(user_features.values())

# Calculate cosine similarity between users
user_cosine_sim = cosine_similarity(user_feature_matrix)

# Convert cosine similarity matrix to DataFrame (rows and columns labelled by user_id)
user_ids = list(user_features.keys())
user_cosine_sim_df = pd.DataFrame(user_cosine_sim, index=user_ids, columns=user_ids)

# Function to get top N neighbors for a user
def get_top_neighbors(user_id, top_n=5):
    """Return the ids of the ``top_n`` users most similar to ``user_id``.

    Similarities are read from the ``user_id`` column of the module-level
    ``user_cosine_sim_df``.

    Args:
        user_id: Label of the user to look up.
        top_n: Maximum number of neighbours to return.

    Returns:
        A list of user ids ordered by decreasing similarity. The queried user
        is never part of the result. An empty list is returned when
        ``user_id`` is not in the similarity matrix.
    """
    if user_id not in user_cosine_sim_df.index:
        return []
    # Remove the user's own entry by label instead of assuming it sorts first:
    # another user with an identical feature vector ties with it, and then
    # skipping "the first row" could discard that neighbour and keep the user.
    similarities = user_cosine_sim_df[user_id].drop(user_id)
    return similarities.nlargest(top_n).index.tolist()

# Helper: per-article popularity/sentiment rank derived from articles_df
def _build_article_rank_table():
    """Return ``article_id`` and ``normalized_rank`` for every row of ``articles_df``."""
    ranks = articles_df[['article_id', 'total_pageviews', 'sentiment_label', 'sentiment_score']].copy()

    # Page views scaled by the largest page-view count
    ranks['normalized_total_pageviews'] = ranks['total_pageviews'] / ranks['total_pageviews'].max()

    # Labels missing from this mapping become NaN.
    # NOTE(review): the keys assume integer labels - confirm against the data.
    sentiment_mapping = {1: 1, 0: 0.5, -1: -1}
    ranks['sentiment_label_value'] = ranks['sentiment_label'].map(sentiment_mapping)
    ranks['adjusted_sentiment_score'] = ranks['sentiment_score'] * ranks['sentiment_label_value']

    # Min-max scale the adjusted sentiment score
    adjusted = ranks['adjusted_sentiment_score']
    ranks['normalized_adjusted_sentiment_score'] = (adjusted - adjusted.min()) / (adjusted.max() - adjusted.min())

    # Weighted blend: 80% page views, 20% sentiment
    ranks['combined_score'] = (
        0.8 * ranks['normalized_total_pageviews']
        + 0.2 * ranks['normalized_adjusted_sentiment_score']
    )
    ranks = ranks.sort_values('combined_score', ascending=False)
    ranks['normalized_rank'] = ranks['combined_score'] / ranks['combined_score'].max()
    return ranks[['article_id', 'normalized_rank']]


# Function to get article recommendations for a user, incorporating article ranking
def recommend_articles(user_id, top_n=10, n_neighbors=None):
    """Recommend articles clicked by the users most similar to ``user_id``.

    Candidates are all articles clicked by the user's nearest neighbours.
    Each candidate is scored from how many neighbour clicks it received
    (weight 0.9) and from ``1 - normalized_rank`` of the article (weight 0.1).

    Args:
        user_id: User to recommend for.
        top_n: Maximum number of article ids to return.
        n_neighbors: Number of neighbours to collect candidates from. When
            omitted it defaults to ``top_n``.

    Returns:
        A list of at most ``top_n`` article ids, best first. The list is
        empty when the user has no neighbours or the neighbours have no
        clicks.
    """
    if n_neighbors is None:
        n_neighbors = top_n

    neighbors = get_top_neighbors(user_id, n_neighbors)
    neighbor_articles = [article for neighbor in neighbors for article in user_articles[neighbor]]
    if not neighbor_articles:
        # Nothing to score; also avoids merging an empty, untyped frame.
        return []

    # One row per candidate article with the number of neighbour clicks
    article_scores_df = pd.Series(neighbor_articles).value_counts().reset_index()
    article_scores_df.columns = ['article_id', 'interaction_count']

    # Attach the article-level rank; candidates without a rank get NaN here
    article_scores_df = article_scores_df.merge(_build_article_rank_table(), on='article_id', how='left')

    # Define the weights for interaction count and normalized rank
    interaction_count_weight = 0.9
    normalized_rank_weight = 0.1

    # A missing rank contributes no adjustment instead of turning the whole
    # score into NaN, which would push the article to the end of the list
    # regardless of how often it was clicked.
    # NOTE(review): (1 - normalized_rank) gives the larger bonus to articles
    # with a *lower* popularity/sentiment score - confirm this is intended.
    rank_adjustment = (1 - article_scores_df['normalized_rank']).fillna(0.0)

    # Calculate the combined score with the weights
    article_scores_df['combined_score'] = (
        interaction_count_weight * article_scores_df['interaction_count']
        + normalized_rank_weight * rank_adjustment
    )

    # Normalize the combined score so the best candidate scores 1
    article_scores_df['combined_score'] = article_scores_df['combined_score'] / article_scores_df['combined_score'].max()

    # Sort articles based on the combined score; a stable sort keeps ties in a
    # deterministic order
    article_scores_df = article_scores_df.sort_values('combined_score', ascending=False, kind='mergesort')

    # Get the top N recommended articles
    return article_scores_df['article_id'].head(top_n).tolist()

# Attach a 15-item recommendation list to every user in grouped_df
grouped_df['recommended_articles'] = grouped_df['user_id'].map(
    lambda uid: recommend_articles(uid, top_n=15)
)

# Restrict the validation history to users that are present in grouped_df
known_user_mask = history_validation_df['user_id'].isin(grouped_df['user_id'])
history_validation_df = history_validation_df[known_user_mask]

# Function to calculate AUC for a user
def calculate_auc(user_id):
    """Return the ROC AUC of a user's recommendation list against their validation history.

    Each recommended article is labelled 1 if it appears in the user's first
    ``article_id_fixed`` entry in ``history_validation_df``, otherwise 0.
    Scores decrease with list position, so the first recommendation has the
    highest score.

    Args:
        user_id: User to evaluate.

    Returns:
        The AUC from ``roc_auc_score``, or 0 when it cannot be computed: the
        user has no validation history, has no recommendation row, or the
        labels do not contain both classes (which includes an empty list).
    """
    actual_articles = history_validation_df[history_validation_df['user_id'] == user_id]['article_id_fixed'].values
    if actual_articles.size == 0:
        return 0
    actual_articles = set(actual_articles[0])

    predicted_rows = grouped_df[grouped_df['user_id'] == user_id]['recommended_articles'].values
    if predicted_rows.size == 0:
        return 0
    predicted_articles = predicted_rows[0]

    y_true = [1 if article in actual_articles else 0 for article in predicted_articles]
    # AUC needs both classes. "< 2" also covers an empty recommendation list,
    # which would otherwise be passed on to roc_auc_score and raise.
    if len(set(y_true)) < 2:
        return 0
    y_scores = list(range(len(predicted_articles), 0, -1))
    return roc_auc_score(y_true, y_scores)

# Reciprocal rank of the first relevant recommendation for a user
def calculate_mrr(user_id):
    """Return 1 / rank of the first recommended article found in the user's validation history.

    The relevant set is the user's first ``article_id_fixed`` entry in
    ``history_validation_df``; the ranked list is the user's
    ``recommended_articles`` entry in ``grouped_df``. Returns 0 when the user
    has no validation history or no recommendation is relevant.
    """
    is_user = history_validation_df['user_id'] == user_id
    history_rows = history_validation_df.loc[is_user, 'article_id_fixed'].values
    if history_rows.size == 0:
        return 0

    relevant = set(history_rows[0])
    ranked = grouped_df.loc[grouped_df['user_id'] == user_id, 'recommended_articles'].values[0]

    # Position (1-based) of the first hit, if there is one
    first_hit = next(
        (position for position, candidate in enumerate(ranked, start=1) if candidate in relevant),
        None,
    )
    if first_hit is None:
        return 0
    return 1 / first_hit

# Function to calculate nDCG@K for a user
def calculate_ndcg(user_id, k=15):
    """Return nDCG@k of a user's recommendation list with binary relevance.

    An article is relevant if it appears in the user's first
    ``article_id_fixed`` entry in ``history_validation_df``. Both DCG and the
    ideal DCG use the logarithmic discount ``1 / log2(position + 1)`` with
    1-based positions.

    Args:
        user_id: User to evaluate.
        k: Number of leading recommendations to score.

    Returns:
        A value between 0 and 1, or 0 when the user has no validation
        history, has no recommendation row, or the ideal DCG is zero.
    """
    actual_articles = history_validation_df[history_validation_df['user_id'] == user_id]['article_id_fixed'].values
    if actual_articles.size == 0:
        return 0
    actual_articles = set(actual_articles[0])

    predicted_rows = grouped_df[grouped_df['user_id'] == user_id]['recommended_articles'].values
    if predicted_rows.size == 0:
        return 0
    predicted_articles = predicted_rows[0][:k]

    # i is 0-based, so position i + 1 is discounted by log2(i + 2).
    # A reciprocal-position discount (1 / (i + 1)) is not DCG.
    dcg = sum(
        1 / np.log2(i + 2)
        for i, article in enumerate(predicted_articles)
        if article in actual_articles
    )
    # Ideal ordering: every relevant article (at most k) ranked first
    idcg = sum(1 / np.log2(i + 2) for i in range(min(len(actual_articles), k)))
    return float(dcg / idcg) if idcg > 0 else 0

# Grid search over the recommendation parameter, reporting ranking metrics
def tune_parameters(grouped_df, history_validation_df):
    """Evaluate every grid point and print the per-point and best average metrics.

    For each ``top_n`` / ``top_k`` pair the function overwrites the
    ``recommended_articles``, ``auc``, ``mrr`` and ``ndcg@5`` columns of
    ``grouped_df`` in place, then prints the column means. A grid point
    becomes the new best only when all three means are strictly higher than
    the current best values.

    Args:
        grouped_df: Per-user frame with a ``user_id`` column; modified in place.
        history_validation_df: Not read directly in this function body.

    Returns:
        None. Results are printed.
    """
    top_neighbors_values = list(range(5, 61, 5))
    top_recommendations_values = [5]

    best_auc = best_mrr = best_ndcg5 = 0
    best_params = {}

    for top_n in top_neighbors_values:
        for top_k in top_recommendations_values:
            user_column = grouped_df['user_id']

            # Refresh recommendations first; the metric helpers are evaluated after it
            grouped_df['recommended_articles'] = user_column.apply(
                lambda uid: recommend_articles(uid, top_n=top_n)
            )
            grouped_df['auc'] = user_column.apply(calculate_auc)
            grouped_df['mrr'] = user_column.apply(calculate_mrr)
            grouped_df['ndcg@5'] = user_column.apply(
                lambda uid: calculate_ndcg(uid, k=top_k)
            )

            average_auc = grouped_df['auc'].mean()
            average_mrr = grouped_df['mrr'].mean()
            average_ndcg5 = grouped_df['ndcg@5'].mean()

            beats_every_metric = (
                average_auc > best_auc
                and average_mrr > best_mrr
                and average_ndcg5 > best_ndcg5
            )
            if beats_every_metric:
                best_auc, best_mrr, best_ndcg5 = average_auc, average_mrr, average_ndcg5
                best_params = {'top_n': top_n, 'top_k': top_k}

            print(f"Top N: {top_n}, Top K: {top_k}, Average AUC: {average_auc}, Average MRR: {average_mrr}, Average nDCG@5: {average_ndcg5}")

    print(f"Best Parameters: {best_params}")
    print(f"Best AUC: {best_auc}, Best MRR: {best_mrr}, Best nDCG@5: {best_ndcg5}")

# Run the grid search; it prints its results and rewrites grouped_df's
# recommendation and metric columns in place
tune_parameters(grouped_df=grouped_df, history_validation_df=history_validation_df)

# Persist grouped_df as left by the last grid point evaluated above
grouped_df.to_parquet(path='grouped_df.parquet')


       article_id                                  title  \
11059     9647575      Kvinderne der kan sparke din røv!   
11508     9667501         Advarsel! Bliv inden døre i år   
11943     9685759  Realitystjerner: - Sådan scorer du os   
11948     9685856      Realitystjerner: Det tænder vi på   
12063     9690920                  Dansk og dødbringende   

                                                subtitle  last_modified_time  \
11059  Blot fordi der står supermodel, sanger eller s... 2023-06-29 06:47:06   
11508                                FESTIVALGUIDE 2023: 2023-06-29 06:47:22   
11943  Vi har spurgte en række realitystjerner, hvord... 2023-06-29 06:47:38   
11948  Find ud af, om du har det, der skal til, hvis ... 2023-06-29 06:47:38   
12063  Hos Dansk Industri har man store forhåbninger ... 2023-06-29 06:47:43   

       premium                                               body  \
11059     True  Det er sjældent noget, de skilter med, men der...   
11508    False  Det 