In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import json

In [3]:
# Load the data
users = pd.read_csv('data/users.csv')
contents = pd.read_csv('data/contents.csv')
views = pd.read_csv('data/content_views.csv')
votes = pd.read_csv('data/user_votes.csv')

In [4]:
# Load the data
users.head()

Unnamed: 0,portal_user_id,age_range,gender,tags
0,beta-71,36-45,male,
1,centralina-204,26-35,female,
2,caac-194,18-25,female,
3,caac-50,18-25,male,
4,centralina-198,18-25,female,


In [5]:
contents=contents.drop_duplicates(subset=['global_id', 'entity_id', 'portal_name'])

In [6]:
# Load the data
contents.head()

Unnamed: 0,global_id,entity_id,content_type,content_title,content_topic,portal_name
0,214,214,article,Introduction to Aging and Dementia,Brain Health,caac
1,243,243,article,"Delirium, Depression, and Apathy",Brain Health,caac
2,284,284,article,Communication and Dementia,Brain Health,caac
3,285,285,article,Sexuality and Dementia,Brain Health,caac
4,332,332,article,Latest Research on Exercise and Brain Health,,caac


In [7]:
# Load the data
views.head()

Unnamed: 0,portal_user_id,global_id,entity_id,event_type,num_of_views
0,aaa1b-202,138.0,6,content_view,2
1,aaa1b-202,1161.0,555,content_view,1
2,aaa1b-202,1919.0,914,content_view,2
3,aaa1b-205,138.0,6,content_view,9
4,aaa1b-205,1919.0,914,content_view,4


In [8]:
# Load the data
votes.head()

Unnamed: 0,portal_user_id,vote,global_id
0,demo-554,1,481
1,wisconsin-2064,1,885
2,dayton-1241,1,740
3,kern-3412,1,733
4,ssvtcoa-232,1,405


In [9]:
# Function to extract portal name from portal_user_id
def get_portal_name(user_id):
    return user_id.split('-')[0]

In [10]:
# Ensure that tags are preprocessed
def preprocess_tags(tag_string):
    if pd.isnull(tag_string) or tag_string in ['null', '[]']:
        return []
    try:
        tags = json.loads(tag_string)
        return tags if len(tags) > 0 else []
    except (TypeError, json.JSONDecodeError):
        return []

In [11]:
# Preprocess user tags
users['tags'] = users['tags'].apply(preprocess_tags)

In [12]:
# Create a user attributes DataFrame
user_attributes = users.set_index('portal_user_id')

In [13]:
user_attributes.head()

Unnamed: 0_level_0,age_range,gender,tags
portal_user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
beta-71,36-45,male,[]
centralina-204,26-35,female,[]
caac-194,18-25,female,[]
caac-50,18-25,male,[]
centralina-198,18-25,female,[]


In [14]:
# Function to compute tag similarity
def compute_tag_similarity(user_tags, other_tags):
    user_tags_set = set(user_tags)
    other_tags_set = set(other_tags)
    if not user_tags_set and not other_tags_set:
        return 0
    return len(user_tags_set.intersection(other_tags_set)) / len(user_tags_set.union(other_tags_set))

In [15]:
# Function to compute user similarity based on attributes
def compute_user_similarity(target_user_id, user_attributes):
    target_user = user_attributes.loc[target_user_id]
    similarities = {}

    for user_id, attributes in user_attributes.iterrows():
        if user_id == target_user_id:
            continue

        age_similarity = 1 if target_user['age_range'] == attributes['age_range'] else 0
        gender_similarity = 1 if target_user['gender'] == attributes['gender'] else 0

        # Check if tags are null or contain 'null' or '[]' and handle accordingly
        if pd.isnull(target_user['tags']).any() or any(tag in target_user['tags'] for tag in ['null', '[]']):
            tag_similarity = 0
        else:
            tag_similarity = compute_tag_similarity(target_user['tags'], attributes['tags'])

        # Weighted sum of similarities (adjust weights as necessary)
        overall_similarity = 0.3 * age_similarity + 0.3 * gender_similarity + 0.4 * tag_similarity
        similarities[user_id] = overall_similarity

    return pd.Series(similarities)

In [18]:
# Function to get recommendations
def get_recommendations(user_id, num_recommendations=5, locale='en'):
    portal_name = get_portal_name(user_id)

    if not portal_name:
        print(f"No views found for user {user_id}. Cannot determine portal_name.")
        return None
    
    # Filter contents and views by the identified portal_name
    contents_filtered = contents[contents['portal_name'] == portal_name]
    views_filtered = views[views['global_id'].isin(contents_filtered['global_id'])]

    # Merge views with contents to get the necessary content details
    views_filtered = views_filtered.merge(contents_filtered[['global_id', 'content_title']], on='global_id')

    # Create user-content interaction matrix
    user_content_matrix = views_filtered.pivot_table(index='portal_user_id', columns='content_title', values='num_of_views', fill_value=0)

    # Convert the interaction matrix to a sparse matrix
    sparse_user_content_matrix = csr_matrix(user_content_matrix.values)

    # Compute cosine similarity between users based on content views
    user_similarity = cosine_similarity(sparse_user_content_matrix)

    # Convert similarity matrix to DataFrame for easier manipulation
    user_similarity_df = pd.DataFrame(user_similarity, index=user_content_matrix.index, columns=user_content_matrix.index)

    # Check if the user exists in the similarity matrix
    if user_id not in user_similarity_df.index:
        print(f"User {user_id} not found in similarity matrix.")
        return None

    # Compute user similarity based on attributes
    user_attr_similarity = compute_user_similarity(user_id, user_attributes)

    # Combine similarities
    combined_similarity = 0.5 * user_similarity_df.loc[user_id] + 0.5 * user_attr_similarity
    combined_similarity = combined_similarity.dropna().sort_values(ascending=False)

    # Find similar users
    similar_users = combined_similarity.index[:10]  # Take top 10 similar users for example

    # Get the content viewed by similar users
    similar_users_views = user_content_matrix.loc[similar_users]

    # Sum the views across similar users
    similar_users_views_sum = similar_users_views.sum(axis=0)

    # Get user votes
    user_votes = votes[votes['portal_user_id'] == user_id]
    upvoted_content = user_votes[user_votes['vote'] == 1]['global_id'].tolist()
    downvoted_content = user_votes[user_votes['vote'] == 0]['global_id'].tolist()

    # Filter out downvoted content
    similar_users_views_sum = similar_users_views_sum[~similar_users_views_sum.index.isin(downvoted_content)]

    # Boost the score for upvoted content
    for content_id in upvoted_content:
        if content_id in similar_users_views_sum.index:
            similar_users_views_sum[content_id] *= 1.5  # Adjust the boost factor as necessary

    # Remove contents the target user has already viewed
    if user_id in user_content_matrix.index:
        user_views = user_content_matrix.loc[user_id]
        similar_users_views_sum = similar_users_views_sum[user_views == 0]

    # Recommend the top N contents
    recommendations = similar_users_views_sum.sort_values(ascending=False).head(num_recommendations)

    # Merge with the contents dataframe to get additional details
    recommended_contents = contents_filtered[contents_filtered['content_title'].isin(recommendations.index)]

    # Calculate similarity percentages
    user_similarity_scores = user_similarity_df.loc[user_id, similar_users].mean()
    attr_similarity_scores = user_attr_similarity[similar_users].mean()
    
    total_similarity = user_similarity_scores + attr_similarity_scores
    user_similarity_percentage = (user_similarity_scores / total_similarity) * 100
    attr_similarity_percentage = (attr_similarity_scores / total_similarity) * 100

    # Create a DataFrame to hold the recommendation scores
    recommendation_scores = pd.DataFrame(index=recommended_contents['content_title'])
    recommendation_scores['user_similarity'] = recommendations.loc[recommendation_scores.index]
    recommendation_scores['attribute_similarity'] = 0  # Placeholder for attribute similarity

    for content_title in recommendation_scores.index:
        attribute_score = 0
        for similar_user in similar_users:
            attribute_score += user_attr_similarity.loc[similar_user] * user_content_matrix.loc[similar_user, content_title]
        
        recommendation_scores['attribute_similarity'] = recommendation_scores['attribute_similarity'].astype('float64')
        recommendation_scores.at[content_title, 'attribute_similarity'] = round(attribute_score, 2)

    recommendation_scores['total_score'] = recommendation_scores.sum(axis=1)
    recommendation_scores['user_similarity_percentage'] = round(user_similarity_percentage,2)
    recommendation_scores['attribute_similarity_percentage'] = round(attr_similarity_percentage,2)

     # Calculate vote similarity percentage
    vote_similarity_percentage = (recommendation_scores['total_score'] / recommendation_scores['total_score'].sum()) * 100

    recommendation_scores['vote_similarity_percentage'] = round(vote_similarity_percentage, 2)

    # Merge recommendation scores with recommended contents
    recommended_contents = recommended_contents.merge(recommendation_scores, left_on='content_title', right_index=True)

    # Return the recommendations with additional details and scores
    return recommended_contents[['global_id', 'entity_id', 'content_type', 'content_title', 'content_topic', 'portal_name', 'total_score']]

In [19]:
# Example: Get recommendations for a specific user
user_id = 'aaa1b-207'
get_recommendations(user_id,4)

Unnamed: 0,global_id,entity_id,content_type,content_title,content_topic,portal_name,total_score
24327,375,369,elearn,Emergency Planning Workbook,,aaa1b,43.85
25390,370,364,story,One Fall Away,,aaa1b,13.95
26947,986,423,elearn,Trualta Companion Cards,,aaa1b,21.92
32097,1214,564,article,Missing Doses & Medication Mistakes,Medications,aaa1b,17.9
