# Item-Based CF

## Score Calculation Logic

#### Click score:

The number of all clicks made by a user on a trainer × 0.1

#### Watch score:

The sum of the watch_rates of all the videos of a user on a trainer × 0.3

For example, if 6 videos are watched and their watch_rates sum to 3.2, the score is 3.2 × 0.3 = 0.96.

#### Rating score (introducing penalties):

For each user–trainer pair, subtract 3 from every rating_score the user gave to that trainer's videos (i.e., rating minus 3), so a rating below 3 gives a negative difference and a rating above 3 gives a positive one.

Sum of all differences × 0.6

For example, three ratings of 2.6, 3.8 and 4.2 give differences of -0.4, 0.8 and 1.2 relative to 3; their sum is 1.6, so the score is 1.6 × 0.6 = 0.96.

#### final preference score:

For each user–trainer (physio) combination, sum the three scores (click + watch + rating).

#### Output results:

A ‘total preference score’ is obtained for each user for each physio.

### Choose Top 50 physios for each user based on final scores

In [1]:
import pandas as pd
import numpy as np

# Load the interaction log from the Excel export
df = pd.read_excel('user_video_interaction.xlsx')

# Coerce the numeric columns; values that cannot be parsed (e.g. empty
# strings) become NaN instead of raising
for numeric_col in ('watch_rate', 'rating_score'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Weight of each partial score in the final preference score
CLICK_WEIGHT = 0.1
WATCH_WEIGHT = 0.3
RATING_WEIGHT = 0.6

# Click score: number of 'click' rows per (user, trainer) pair, times the weight
click_rows = df[df['action_type'] == 'click']
click_df = (
    click_rows.groupby(['user_id', 'trainer_id'])
    .size()
    .reset_index(name='click_count')
)
click_df['click_score'] = CLICK_WEIGHT * click_df['click_count']

# Watch score: summed watch_rate of the 'watch' rows per (user, trainer) pair,
# times the weight
watch_rows = df[df['action_type'] == 'watch']
watch_df = (
    watch_rows.groupby(['user_id', 'trainer_id'])['watch_rate']
    .sum()
    .reset_index()
)
watch_df['watch_score'] = WATCH_WEIGHT * watch_df['watch_rate']
# Calculate rating score
def rating_func(series, neutral=3):
    """Return the summed deviation of the ratings in *series* from *neutral*.

    Ratings below *neutral* contribute negatively (a penalty) and ratings
    above it contribute positively. Missing values are skipped by the pandas
    sum, so an empty or all-NaN series yields 0.

    Args:
        series: pandas Series of numeric ratings (one group's ratings when
            used with ``groupby(...).apply``).
        neutral: Rating that counts as neither positive nor negative.
            Defaults to 3, the value the original code hard-coded.

    Returns:
        The sum of ``rating - neutral`` over the non-missing ratings.
    """
    return (series - neutral).sum()
# Rating score: apply rating_func to the 'rate' rows of each (user, trainer)
# pair, then scale by the rating weight
rate_rows = df[df['action_type'] == 'rate']
rating_df = (
    rate_rows.groupby(['user_id', 'trainer_id'])['rating_score']
    .apply(rating_func)
    .reset_index()
)
rating_df['rating_score_adj'] = RATING_WEIGHT * rating_df['rating_score']

# Build every (user, trainer) combination by joining on a constant helper key
scores = pd.DataFrame({'user_id': df['user_id'].unique()})
trainers = pd.DataFrame({'trainer_id': df['trainer_id'].unique()})
all_user_trainer = (
    scores.assign(key=1)
    .merge(trainers.assign(key=1), on='key')
    .drop('key', axis=1)
)

# Left-join the three partial scores onto the full grid of pairs
result = all_user_trainer
for part_df, score_col in (
    (click_df, 'click_score'),
    (watch_df, 'watch_score'),
    (rating_df, 'rating_score_adj'),
):
    result = result.merge(
        part_df[['user_id', 'trainer_id', score_col]],
        how='left',
        on=['user_id', 'trainer_id'],
    )

# Pairs without a given kind of interaction have no partial score: use 0
for col in ['click_score', 'watch_score', 'rating_score_adj']:
    result[col] = result[col].fillna(0)

result['total_score'] = (
    result['click_score']
    .add(result['watch_score'])
    .add(result['rating_score_adj'])
)

# Rank trainers within each user by descending total score; method='first'
# breaks ties by row order so every rank is unique.
# NOTE(review): pairs with no interaction score 0 and are ranked too, so they
# can fill the Top 50 ahead of negatively scored trainers - confirm intended.
per_user_totals = result.groupby('user_id')['total_score']
result['rank'] = per_user_totals.rank(method='first', ascending=False)
top50 = result[result['rank'] <= 50].copy().sort_values(['user_id', 'rank'])

# Keep only the columns needed downstream
output_cols = [
    'user_id',
    'trainer_id',
    'click_score',
    'watch_score',
    'rating_score_adj',
    'total_score',
    'rank',
]
top50_final = top50[output_cols]

# Export
top50_final.to_excel('user_top50_trainer.xlsx', index=False)


# Collaborative Filtering

In [3]:
from sklearn.metrics.pairwise import cosine_similarity

# Reload the Top-50 table from 'user_top50_trainer.xlsx'
df = pd.read_excel('user_top50_trainer.xlsx')

# User x trainer matrix of total scores; pairs absent from the table become 0
pivot = df.pivot_table(
    values='total_score',
    index='user_id',
    columns='trainer_id',
    fill_value=0,
)

# Cosine similarity between trainers: transpose so that each trainer is one
# row vector over users
trainer_ids = pivot.columns
trainer_sim = cosine_similarity(pivot.transpose())
trainer_sim_df = pd.DataFrame(trainer_sim, index=trainer_ids, columns=trainer_ids)

# Recommended Functions
def get_item_cf_recommendations(pivot, trainer_sim_df, top_n=10):
    """Recommend unseen trainers to every user by summed item similarity.

    For each user, the trainers with a positive score are treated as "liked".
    A candidate's score is the sum of its similarities to all liked trainers.
    Trainers the user already has a non-zero score for (liked or disliked) are
    never recommended.

    Args:
        pivot: DataFrame indexed by user with one column per trainer, holding
            the user's score for that trainer (0 meaning no score).
        trainer_sim_df: Square DataFrame of trainer-to-trainer similarities,
            indexed and columned by the same trainer labels as ``pivot``.
        top_n: Maximum number of recommendations per user.

    Returns:
        Dict mapping each user to a list of ``(trainer, score)`` tuples sorted
        by descending score. Users without any liked trainer map to ``[]``.
    """
    user_reco = {}
    for user in pivot.index:
        user_scores = pivot.loc[user]
        liked = user_scores[user_scores > 0].index
        if len(liked) == 0:
            # Nothing to base a similarity on: no recommendations.
            user_reco[user] = []
            continue

        # Exclude every trainer the user already scored, not only the liked
        # ones; otherwise a trainer with a negative score could be
        # recommended back to the user.
        seen = user_scores[(user_scores > 0) | (user_scores < 0)].index

        # One column-wise sum replaces the per-trainer/per-candidate Python
        # loops. It also fixes the summation order, which previously followed
        # set iteration order and could vary between runs.
        totals = trainer_sim_df.loc[liked].sum(axis=0)
        candidates = totals.drop(seen, errors='ignore')

        # sorted() is stable, so ties keep the column order of trainer_sim_df.
        ranked = sorted(candidates.items(), key=lambda pair: -pair[1])
        user_reco[user] = ranked[:top_n]
    return user_reco

# Up to 50 recommendations per user
item_cf_recos = get_item_cf_recommendations(pivot, trainer_sim_df, top_n=50)

# Flatten {user: [(trainer, score), ...]} into three parallel columns,
# one entry per recommendation
user_list = [user for user, recos in item_cf_recos.items() for _ in recos]
trainer_list = [trainer for recos in item_cf_recos.values() for trainer, _ in recos]
sim_score_list = [score for recos in item_cf_recos.values() for _, score in recos]
item_cf_df = pd.DataFrame({
    'user_id': user_list,
    'recommended_trainer_id': trainer_list,
    'sim_score': sim_score_list,
})

# Export, then preview the first rows
item_cf_df.to_excel('itemcf_user_reco_top50.xlsx', index=False)
item_cf_df.head()


Unnamed: 0,user_id,recommended_trainer_id,sim_score
0,U0000,471,2.008653
1,U0000,609,2.002484
2,U0000,133,1.954364
3,U0000,744,1.931092
4,U0000,423,1.907392


## Standardisation

Normalise the sim_score for subsequent hybrid recommendations to avoid inconsistent scales

In [6]:
import pandas as pd

# Load the item-CF recommendations exported earlier
df = pd.read_excel('itemcf_user_reco_top50.xlsx')

# Calculate mean and standard deviation over all recommendations
mean = df['sim_score'].mean()
std = df['sim_score'].std()

# Z-score
if pd.notna(std) and std > 0:
    df['sim_score_z'] = (df['sim_score'] - mean) / std
else:
    # Degenerate case: the sample standard deviation is 0 (all scores equal)
    # or NaN (fewer than two scores). Dividing would make every z-score
    # NaN/inf, so treat each score as sitting exactly at the mean instead.
    # Multiplying by 0.0 keeps missing scores missing.
    df['sim_score_z'] = df['sim_score'] * 0.0

# Truncation to the interval [-2, 2] to reduce the effect of extreme values
df['sim_score_z_clip'] = df['sim_score_z'].clip(-2, 2)

# Map linearly from [-2, 2] to the interval [0, 1]
df['sim_score_norm'] = (df['sim_score_z_clip'] + 2) / 4

# Export of standardised data
df.to_excel('itemcf_user_reco_top50_zscore_norm.xlsx', index=False)

print(df[['sim_score', 'sim_score_z', 'sim_score_norm']].head())


   sim_score  sim_score_z  sim_score_norm
0   2.008653     0.906159        0.726540
1   2.002484     0.861613        0.715403
2   1.954364     0.514163        0.628541
3   1.931092     0.346121        0.586530
4   1.907392     0.175001        0.543750
