# Hybrid Video Recommender System

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Dataset, Reader, accuracy
from surprise.accuracy import rmse
from surprise.model_selection import train_test_split

# 1. Data Reading and Aggregation

In [2]:
file_path = 'Hybrid_Reco_Videos_Data.xlsx'

# Open the workbook once and pull out the three sheets used by this notebook.
with pd.ExcelFile(file_path) as workbook:
    video_df = pd.read_excel(workbook, sheet_name="videos' info")
    rate_data = pd.read_excel(workbook, sheet_name="filtered_rate_data")
    user_fav = pd.read_excel(workbook, sheet_name="user_favorite_videos")

# Per-video rating summary: mean score and how many ratings were recorded.
video_ratings = (
    rate_data.groupby('video_id')['rating_score']
    .agg(avg_rating='mean', num_ratings='count')
    .reset_index()
)

# Attach title and tags; the left join keeps every rated video even when the
# metadata sheet has no row for it.
video_popularity = pd.merge(
    video_ratings,
    video_df[['video_id', 'title', 'specialities']],
    on='video_id',
    how='left',
)

# 2. Popularity Recommendation Module

### Function Objective: 
Using the aggregated rating records, keep only the videos that have received at least a minimum number of ratings (`min_ratings`), sort them by average rating and then by number of ratings (both descending), and return the top N most popular videos.
### Steps:

In [3]:
def get_top_popular_videos(video_popularity, n=10, min_ratings=5):
    """Return the ``n`` best-rated videos that have enough ratings.

    Args:
        video_popularity: DataFrame with at least the columns ``video_id``,
            ``title``, ``avg_rating`` and ``num_ratings``.
        n: Maximum number of rows to return.
        min_ratings: Videos with fewer ratings than this are ignored.

    Returns:
        DataFrame with the columns ``video_id``, ``title``, ``avg_rating`` and
        ``num_ratings``, ordered by average rating and then by rating count,
        both descending.
    """
    has_enough_ratings = video_popularity['num_ratings'].ge(min_ratings)
    candidates = video_popularity[has_enough_ratings]
    best_first = candidates.sort_values(
        by=['avg_rating', 'num_ratings'],
        ascending=[False, False],
    )
    output_columns = ['video_id', 'title', 'avg_rating', 'num_ratings']
    return best_first[output_columns].head(n)


In [4]:
# Get Top N Popularity Recommended Videos
top_popular = get_top_popular_videos(video_popularity, n=10, min_ratings=5)

print("Top N Most Popular Videos by Average Rating and Number of Ratings:")
display(top_popular)

Top N Most Popular Videos by Average Rating and Number of Ratings:


Unnamed: 0,video_id,title,avg_rating,num_ratings
2537,42505,Cardio Training Video #42505,4.72,5
1245,20905,Weight Loss Training Video #20905,4.46,5
4568,76102,Powerlifting Training Video #76102,4.38,5
4958,82210,Pilates Training Video #82210,4.38,5
3948,65804,Rehabilitation Training Video #65804,4.34,5
233,4007,Flexibility Training Video #4007,4.32,5
252,4304,Pilates Training Video #4304,4.3,5
2137,35805,Strength Training Training Video #35805,4.285714,7
1617,27009,Rehabilitation Training Video #27009,4.28,5
175,3006,CrossFit Training Video #3006,4.26,5


# 3. Content-Based Recommendation Module

### Tagged Text Vectorisation (TF-IDF) 
### Steps:

In [5]:
# Missing tag lists become empty strings so the vectoriser only sees str input.
video_df['specialities'] = video_df['specialities'].fillna('')

# Every maximal run of non-'|' characters is one token
# (presumably the tags are pipe-delimited -- confirm against the sheet).
vectorizer = TfidfVectorizer(token_pattern=r"[^|]+")
tfidf_matrix = vectorizer.fit_transform(video_df['specialities'])

# Lookup from title to row label. Series.drop_duplicates() compares the VALUES
# (the row labels), not the titles, so it never removed a repeated title and
# indices[title] could come back as a Series. De-duplicate on the index
# instead, keeping the first video seen for each title.
indices = pd.Series(video_df.index, index=video_df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

### Function of Content-Based Top-N similar video recommendations

Given the title of a video the user is currently interested in, the function compares the TF-IDF vector of that video's tags (`specialities`) with the vectors of all other videos using cosine similarity, and returns the top n most similar videos.
### Steps:

In [6]:
def get_content_based_recommendations(title, n=10):
    """Return the ``n`` videos whose tag vectors are most similar to ``title``.

    Relies on the notebook-level ``indices`` (title -> row), ``tfidf_matrix``
    (one TF-IDF row per video) and ``video_df`` (metadata).

    Args:
        title: Title of the seed video.
        n: Maximum number of recommendations to return.

    Returns:
        DataFrame with the columns ``video_id``, ``title``, ``specialities``
        and ``score`` (cosine similarity to the seed video), best match first.
        The seed video itself is excluded. If ``title`` is unknown the frame
        is empty but still carries those columns, so callers can select them
        without a ``KeyError``.
    """
    output_columns = ['video_id', 'title', 'specialities', 'score']
    if title not in indices:
        return pd.DataFrame(columns=output_columns)

    idx = indices[title]
    if isinstance(idx, pd.Series):
        # Several videos share this title: use the first one as the seed.
        idx = idx.iloc[0]

    # NOTE(review): idx is a row label of video_df but is used as a position in
    # tfidf_matrix / .iloc -- this assumes video_df has a default 0..N-1 index.
    similarities = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()

    # sorted() is stable, so equally similar videos keep their original order.
    ranked = sorted(
        ((pos, sim) for pos, sim in enumerate(similarities) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )[:n]

    video_indices = [pos for pos, _ in ranked]
    result = video_df.iloc[video_indices][['video_id', 'title', 'specialities']].copy()
    result['score'] = [sim for _, sim in ranked]
    return result

### Sample Output

In [7]:
# Seed video for the first content-based demo
content_title = "Yoga Training Video #101"
result = get_content_based_recommendations(title=content_title, n=100)

print(f"Top {len(result)} Content-Based Recommendations for '{content_title}':")
shown_columns = ['video_id', 'title', 'score']
display(result[shown_columns])

Top 100 Content-Based Recommendations for 'Yoga Training Video #101':


Unnamed: 0,video_id,title,score
25,306,Yoga Training Video #306,1.00000
112,1203,Yoga Training Video #1203,1.00000
191,2002,Yoga Training Video #2002,1.00000
872,8803,Yoga Training Video #8803,1.00000
878,8809,Yoga Training Video #8809,1.00000
...,...,...,...
110,1201,Yoga Training Video #1201,0.81344
1778,17809,Flexibility Training Video #17809,0.81344
4663,46704,Rehabilitation Training Video #46704,0.81344
6528,65309,Flexibility Training Video #65309,0.81344


In [8]:
# Seed video for the second content-based demo
content_title = "Sports-Specific Training Video #46601"
result = get_content_based_recommendations(title=content_title, n=100)

print(f"Top {len(result)} Content-Based Recommendations for '{content_title}':")
shown_columns = ['video_id', 'title', 'score']
display(result[shown_columns])

Top 100 Content-Based Recommendations for 'Sports-Specific Training Video #46601':


Unnamed: 0,video_id,title,score
45,506,Bodybuilding Training Video #506,1.00000
406,4107,Bodybuilding Training Video #4107,1.00000
630,6401,Bodybuilding Training Video #6401,1.00000
633,6404,Bodybuilding Training Video #6404,1.00000
1139,11410,Sports-Specific Training Video #11410,1.00000
...,...,...,...
1896,19007,Sports-Specific Training Video #19007,0.81647
2160,21701,CrossFit Training Video #21701,0.81647
2873,28804,CrossFit Training Video #28804,0.81647
4860,48701,Bodybuilding Training Video #48701,0.81647


# 4. Collaborative Filtering Module

### SVD method based on matrix decomposition 

In [9]:
# Surprise expects exactly three columns in (user, item, rating) order.
data_df = rate_data.loc[:, ['user_id', 'video_id', 'rating_score']].copy()

# Surprise format: the rating scale is the observed score range
observed_scores = data_df['rating_score']
reader = Reader(rating_scale=(observed_scores.min(), observed_scores.max()))
data = Dataset.load_from_df(data_df, reader)

# Model training: hold out 20% of the ratings, fit SVD on the rest.
# Both random_state values are fixed so the split and the fit are repeatable.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
svd = SVD(random_state=42)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2015aed8df0>

### Top-N recommendation function implementation

Generates a personalised list of video recommendations for a single user based on the trained SVD model.

### Function Objective: 
For a given user user_id, predict his/her rating for unwatched videos and return the top n recommended videos sorted by predicted score.

### Steps:

In [10]:
def get_collaborative_video_recommendations(user_id, n=10):
    """Recommend the ``n`` videos with the highest SVD-predicted rating.

    Relies on the notebook-level ``rate_data`` (rating log), ``svd`` (fitted
    model) and ``video_df`` (titles).

    Args:
        user_id: Identifier of the user to recommend for.
        n: Maximum number of recommendations to return.

    Returns:
        DataFrame with the columns ``video_id``, ``title`` and ``score`` (the
        predicted rating), best first. Only videos the user has not rated are
        considered. If there are none, the frame is empty but still carries
        those columns, so callers can select them without a ``KeyError``.
    """
    output_columns = ['video_id', 'title', 'score']

    all_videos = set(rate_data['video_id'].unique())
    seen_videos = set(rate_data.loc[rate_data['user_id'] == user_id, 'video_id'].values)
    unseen_videos = list(all_videos - seen_videos)
    if not unseen_videos:
        return pd.DataFrame(columns=output_columns)

    predictions = [(vid, svd.predict(user_id, vid).est) for vid in unseen_videos]
    top_n = sorted(predictions, key=lambda pair: pair[1], reverse=True)[:n]

    result = pd.DataFrame(top_n, columns=['video_id', 'score'])
    # Left join: a video without metadata is kept, with a missing title.
    result = result.merge(video_df[['video_id', 'title']], on='video_id', how='left')
    return result[output_columns]

### Sample Output

In [11]:
# Example user for the collaborative-filtering demo
user_id = 'U0435'
collab_results = get_collaborative_video_recommendations(user_id=user_id, n=10)

print(f"Top {len(collab_results)} Collaborative Filtering Recommendations for User '{user_id}':")
shown_columns = ['video_id', 'title', 'score']
display(collab_results[shown_columns])


Top 10 Collaborative Filtering Recommendations for User 'U0435':


Unnamed: 0,video_id,title,score
0,42505,Cardio Training Video #42505,4.183276
1,3006,CrossFit Training Video #3006,4.13266
2,28303,HIIT Training Video #28303,4.131339
3,16202,Strength Training Training Video #16202,4.123423
4,76102,Powerlifting Training Video #76102,4.118227
5,66105,Pilates Training Video #66105,4.107672
6,20905,Weight Loss Training Video #20905,4.105511
7,75206,Powerlifting Training Video #75206,4.098392
8,61309,Prenatal Fitness Training Video #61309,4.086171
9,7110,Sports-Specific Training Video #7110,4.082703


### Evaluation

In [12]:
# `accuracy` is imported in the notebook's import cell at the top.
# Predict every held-out rating with the fitted model.
predictions = svd.test(testset)

# verbose=True makes each call print its metric in addition to returning it.
rmse_score = accuracy.rmse(predictions, verbose=True)
mae_score = accuracy.mae(predictions, verbose=True)

RMSE: 0.9639
MAE:  0.7817


# 5. Hierarchical Weighted Hybrid Recommendations: Automatic Cold-Start Triage + Weight Fusion

Combines Popularity, Collaborative Filtering, and Content-Based recommendations with cold-start fallback logic.

### Normalisation tool function 
Ensure that the three types of recommendation scores are comparable when fused.

In [13]:
def minmax_normalize(series):
    """Linearly rescale ``series`` so its minimum becomes 0 and its maximum ~1.

    A small epsilon (1e-8) is added to the range so a constant series does not
    divide by zero; such a series is mapped to all zeros.
    """
    lowest = series.min()
    value_range = series.max() - lowest + 1e-8
    return (series - lowest) / value_range

### Weighted hybrid recommendation function

In [28]:
import pandas as pd
import numpy as np

# `minmax_normalize` is already defined in the "Normalisation tool function"
# cell above. The byte-identical redefinition that used to live here was
# removed so the helper has a single definition.

def safe_fill(df, col):
    """Make sure ``df`` has a column named ``col``.

    If the column is missing it is added in place, filled with 0. An existing
    column is left untouched. The same ``df`` object is returned either way.
    """
    if col in df.columns:
        return df
    df[col] = 0
    return df

def hybrid_with_fallback(
        user_id,
        video_popularity, video_df, favorite_df,
        popularity_weight=0.33, content_weight=0.33, collaborative_weight=0.33, n=10):
    """Blend popularity, collaborative and content-based recommendations.

    Each module contributes up to ``n * 10`` candidates whose scores are
    min-max normalised. Candidates are merged per video (taking the maximum
    score per module, 0 where a module did not propose the video) and ranked
    by the weighted sum of the three scores.

    Cold start: a user with no row in the notebook-level ``rate_data`` gets
    popularity-based recommendations only.

    Besides its arguments the function reads the notebook-level ``rate_data``
    and ``indices`` and calls ``get_top_popular_videos``,
    ``get_collaborative_video_recommendations``,
    ``get_content_based_recommendations`` and ``minmax_normalize``.

    Args:
        user_id: Identifier of the user to recommend for.
        video_popularity: Per-video rating summary accepted by
            ``get_top_popular_videos``.
        video_df: Video metadata. Currently unused; kept so existing callers
            keep working.
        favorite_df: DataFrame with ``user_id`` and ``favorite_title`` columns;
            the first favourite found for the user seeds the content module.
        popularity_weight: Weight of the normalised popularity score.
        content_weight: Weight of the normalised content score.
        collaborative_weight: Weight of the normalised collaborative score.
        n: Number of recommendations to return.

    Returns:
        DataFrame with the columns ``video_id``, ``title``, ``hybrid_score``,
        ``popularity_score``, ``collab_score`` and ``content_score``, sorted by
        ``hybrid_score`` descending and limited to ``n`` rows.
    """
    output_columns = ['video_id', 'title', 'hybrid_score',
                      'popularity_score', 'collab_score', 'content_score']

    # Find user's favorite video title
    fav_row = favorite_df.loc[favorite_df['user_id'] == user_id, 'favorite_title']
    user_favorite_title = fav_row.values[0] if not fav_row.empty and pd.notna(fav_row.values[0]) else None

    # Determine whether the user has rating history
    user_has_history = user_id in rate_data['user_id'].unique()

    # --- Popularity-Based Module ---
    # .copy() makes this an independent frame, so adding columns below does not
    # trigger SettingWithCopyWarning.
    popular_videos = get_top_popular_videos(video_popularity, n*10)[['video_id', 'title', 'avg_rating']].copy()
    popular_videos['popularity_score'] = minmax_normalize(popular_videos['avg_rating'])

    # Cold-start fallback: users with no history only get popularity-based recommendations
    if not user_has_history:
        popular_videos['hybrid_score'] = popular_videos['popularity_score'] * popularity_weight
        popular_videos = popular_videos.sort_values('hybrid_score', ascending=False).head(n).reset_index(drop=True)
        # The other two modules did not run; report their scores as 0.
        popular_videos['collab_score'] = 0
        popular_videos['content_score'] = 0
        return popular_videos[output_columns]

    # --- Collaborative Filtering Module ---
    # Check .empty BEFORE selecting columns: an empty result may carry no
    # columns at all, and selecting from it would raise KeyError.
    collab_raw = get_collaborative_video_recommendations(user_id, n*10)
    if collab_raw.empty:
        collaborative_recs = pd.DataFrame(columns=['video_id', 'title', 'collab_score'])
    else:
        collaborative_recs = collab_raw[['video_id', 'title', 'score']].copy()
        collaborative_recs['collab_score'] = minmax_normalize(collaborative_recs['score'])

    # --- Content-Based Module ---
    # Default: a correctly structured empty frame, used when the user has no
    # (known) favourite title or the recommender returns nothing.
    content_recs = pd.DataFrame(columns=['video_id', 'title', 'content_score'])
    if user_favorite_title and user_favorite_title in indices:
        content_raw = get_content_based_recommendations(user_favorite_title, n*10)
        if not content_raw.empty:
            content_recs = content_raw[['video_id', 'title', 'score']].copy()
            content_recs['content_score'] = minmax_normalize(content_recs['score'])

    # --- Merge three types of recommendation results ---
    # After concat each row carries only its own module's score; the other two
    # score columns are NaN and become 0 here.
    merged = pd.concat([
        popular_videos[['video_id', 'title', 'popularity_score']],
        collaborative_recs[['video_id', 'title', 'collab_score']],
        content_recs[['video_id', 'title', 'content_score']]
    ], ignore_index=True).fillna(0).infer_objects()

    # Remove duplicates and aggregate to avoid multiple scores for the same video_id
    final = merged.groupby(['video_id', 'title'], as_index=False).agg({
        'popularity_score': 'max',
        'collab_score': 'max',
        'content_score': 'max'
    }).fillna(0)

    # Calculate the final hybrid score
    final['hybrid_score'] = (
        final['popularity_score'] * popularity_weight +
        final['collab_score'] * collaborative_weight +
        final['content_score'] * content_weight
    )

    # Output Top-N
    final = final.sort_values('hybrid_score', ascending=False).head(n).reset_index(drop=True)
    return final[output_columns]


### Sample Output

In [29]:
# Example user for the hybrid demo; the three modules are weighted equally.
user_id = 'U0019'
hybrid_results = hybrid_with_fallback(
    user_id,
    video_popularity,
    video_df,
    user_fav,
    popularity_weight=0.33,
    content_weight=0.33,
    collaborative_weight=0.33,
    n=10,
)
display(hybrid_results)


Unnamed: 0,video_id,title,hybrid_score,popularity_score,collab_score,content_score
0,42505,Cardio Training Video #42505,0.491059,1.0,0.488058,0.0
1,76102,Powerlifting Training Video #76102,0.474335,0.66,0.777378,0.0
2,90301,CrossFit Training Video #90301,0.4092,0.24,1.0,0.0
3,27009,Rehabilitation Training Video #27009,0.373894,0.56,0.573013,0.0
4,65804,Rehabilitation Training Video #65804,0.350774,0.62,0.442952,0.0
5,3006,CrossFit Training Video #3006,0.347438,0.54,0.512842,0.0
6,15708,Bodybuilding Training Video #15708,0.33,0.0,0.0,1.0
7,409,Pilates Training Video #409,0.33,0.0,0.0,1.0
8,93804,Cardio Training Video #93804,0.33,0.0,0.0,1.0
9,31008,Bodybuilding Training Video #31008,0.33,0.0,0.0,1.0
