In [2]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Locations of the synthetic activity/profile data and the exercise datasets
exercise_recent_activity_path = 'Synthetic_Recent_Activity_for_Exercise.csv'
user_profiles_path = 'Synthetic_User_Profiles_for_Exercise.csv'
exercise_main_dataset_path = 'dataset.csv'
exercise_name_dataset_path = 'exercise_name.csv'

# Read every CSV into its own DataFrame (same order as the paths above)
(
    exercise_recent_activity_df,
    user_profiles_df,
    exercise_main_df,
    exercise_name_df,
) = map(
    pd.read_csv,
    (
        exercise_recent_activity_path,
        user_profiles_path,
        exercise_main_dataset_path,
        exercise_name_dataset_path,
    ),
)

In [4]:
# Preview the first rows of the main exercise dataset
exercise_main_df.head()

Unnamed: 0,ID,Exercise,Calories Burn,Dream Weight,Actual Weight,Age,Gender,Duration,Heart Rate,BMI,Weather Conditions,Exercise Intensity
0,1,Exercise 2,286.959851,91.892531,96.301115,45,Male,37,170,29.426275,Rainy,5
1,2,Exercise 7,343.453036,64.165097,61.104668,25,Male,43,142,21.286346,Rainy,5
2,3,Exercise 4,261.223465,70.846224,71.766724,20,Male,20,148,27.899592,Cloudy,4
3,4,Exercise 5,127.183858,79.477008,82.984456,33,Male,39,170,33.729552,Sunny,10
4,5,Exercise 10,416.318374,89.960226,85.643174,29,Female,34,118,23.286113,Cloudy,3


In [6]:
# Display the synthetic user profiles (one row per user: demographics and preferences)
user_profiles_df

Unnamed: 0,User_Id,Age,Gender,Preferred Intensity,Fitness Goal,Preferred Duration
0,User_1,47,Female,4,Endurance,69
1,User_2,67,Male,1,Weight Loss,43
2,User_3,31,Female,6,Muscle Gain,19
3,User_4,58,Male,2,Endurance,76
4,User_5,48,Female,3,Weight Loss,50
...,...,...,...,...,...,...
95,User_96,23,Male,1,Weight Loss,48
96,User_97,40,Male,5,Endurance,39
97,User_98,30,Female,2,Endurance,47
98,User_99,43,Female,4,Endurance,46


In [7]:
# Display the synthetic activity log (user/exercise interactions with Rated/Liked/Performed flags)
exercise_recent_activity_df

Unnamed: 0,User_Id,Exercise_Id,Rated,Liked,Performed,Duration,Timestamp
0,User_1,85,1,0,1,61,2025-01-01 23:45:06
1,User_1,55,0,0,1,79,2025-01-20 02:41:20
2,User_1,315,1,0,1,76,2025-01-24 21:52:35
3,User_1,138,1,0,1,40,2025-01-09 16:36:33
4,User_1,197,0,1,1,42,2025-01-01 08:19:21
...,...,...,...,...,...,...,...
958,User_100,326,0,0,1,28,2025-01-23 13:37:50
959,User_100,364,0,1,0,48,2025-01-07 10:51:17
960,User_100,228,0,1,0,57,2025-01-17 01:04:47
961,User_100,127,1,0,0,85,2025-01-21 11:32:30


In [12]:
# Enrich the activity log in two left joins so that every activity row is kept:
#   1) attach the exercise attributes (activity.Exercise_Id -> exercise.ID)
#   2) attach the profile of the user who produced the activity row
# Columns present on both sides (e.g. Duration, Age, Gender) receive pandas'
# default `_x` / `_y` suffixes.
merged_activity_exercise = pd.merge(
    exercise_recent_activity_df,
    exercise_main_df,
    how='left',
    left_on='Exercise_Id',
    right_on='ID',
)

full_merged_df = pd.merge(
    merged_activity_exercise,
    user_profiles_df,
    how='left',
    on='User_Id',
)

In [13]:
# Inspect the merged column names (note the _x/_y suffixes on overlapping columns)
full_merged_df.columns

Index(['User_Id', 'Exercise_Id', 'Rated', 'Liked', 'Performed', 'Duration_x',
       'Timestamp', 'ID', 'Exercise', 'Calories Burn', 'Dream Weight',
       'Actual Weight', 'Age_x', 'Gender_x', 'Duration_y', 'Heart Rate', 'BMI',
       'Weather Conditions', 'Exercise Intensity', 'Age_y', 'Gender_y',
       'Preferred Intensity', 'Fitness Goal', 'Preferred Duration'],
      dtype='object')

In [14]:
# Display the fully merged activity + exercise + profile table
full_merged_df

Unnamed: 0,User_Id,Exercise_Id,Rated,Liked,Performed,Duration_x,Timestamp,ID,Exercise,Calories Burn,...,Duration_y,Heart Rate,BMI,Weather Conditions,Exercise Intensity,Age_y,Gender_y,Preferred Intensity,Fitness Goal,Preferred Duration
0,User_1,85,1,0,1,61,2025-01-01 23:45:06,85,Exercise 1,364.730556,...,24,176,18.850743,Cloudy,1,47,Female,4,Endurance,69
1,User_1,55,0,0,1,79,2025-01-20 02:41:20,55,Exercise 6,371.121626,...,52,146,34.294775,Sunny,3,47,Female,4,Endurance,69
2,User_1,315,1,0,1,76,2025-01-24 21:52:35,315,Exercise 7,481.371463,...,21,167,26.660304,Cloudy,6,47,Female,4,Endurance,69
3,User_1,138,1,0,1,40,2025-01-09 16:36:33,138,Exercise 4,237.618971,...,23,137,24.935610,Sunny,1,47,Female,4,Endurance,69
4,User_1,197,0,1,1,42,2025-01-01 08:19:21,197,Exercise 7,497.851502,...,47,110,34.381530,Sunny,2,47,Female,4,Endurance,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
958,User_100,326,0,0,1,28,2025-01-23 13:37:50,326,Exercise 6,399.988001,...,30,115,19.123278,Cloudy,5,20,Male,8,Weight Loss,39
959,User_100,364,0,1,0,48,2025-01-07 10:51:17,364,Exercise 10,168.340270,...,26,144,32.578675,Rainy,4,20,Male,8,Weight Loss,39
960,User_100,228,0,1,0,57,2025-01-17 01:04:47,228,Exercise 5,191.117193,...,41,117,19.251129,Sunny,9,20,Male,8,Weight Loss,39
961,User_100,127,1,0,0,85,2025-01-21 11:32:30,127,Exercise 6,475.418906,...,36,161,22.219450,Sunny,2,20,Male,8,Weight Loss,39


## Recommender system

In [15]:
# Keep the exercise ID plus the numeric columns used to describe each exercise.
# `.copy()` detaches the result so later scaling does not touch exercise_main_df.
exercise_features = exercise_main_df.loc[
    :,
    ['ID', 'Calories Burn', 'Duration', 'Heart Rate', 'BMI', 'Exercise Intensity'],
].copy()

In [17]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.1-cp312-cp312-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl (11.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hUsing cached joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading scipy-1.15.1-cp312-cp312-macosx_14_0_arm64.whl (24.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.9/24.9 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
In

In [18]:
# Normalize or scale numeric columns so that no single feature dominates the similarity calculation:

from sklearn.preprocessing import MinMaxScaler

# Columns rescaled to the [0, 1] range by MinMaxScaler
numeric_cols = ['Calories Burn','Duration','Heart Rate','BMI','Exercise Intensity']

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(exercise_features[numeric_cols])

# Overwrite the raw columns with their scaled counterparts
exercise_features[numeric_cols] = scaled_values

In [19]:
# Row i of X holds the scaled features of the exercise whose ID is exercise_ids[i]
exercise_ids = exercise_features['ID'].to_numpy()
X = exercise_features[numeric_cols].to_numpy()

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of every exercise against every other exercise.
# With a single argument the matrix is compared with itself, so
# exercise_sim[i][j] is the similarity between row i and row j of X.
exercise_sim = cosine_similarity(X)

In [23]:
import numpy as np

def get_similar_exercises(exercise_id, top_n=5):
    """
    Return the IDs of the `top_n` exercises most similar to `exercise_id`.

    Reads the module-level `exercise_ids` (ID of each matrix row) and
    `exercise_sim` (pairwise similarity matrix).

    Parameters
    ----------
    exercise_id : ID to look up in `exercise_ids`.
    top_n : maximum number of neighbours to return.

    Returns
    -------
    list of exercise IDs ordered by descending similarity, never containing
    the query exercise's own row.

    Raises
    ------
    IndexError if `exercise_id` is not present in `exercise_ids`.
    """
    matches = np.flatnonzero(np.asarray(exercise_ids) == exercise_id)
    if matches.size == 0:
        raise IndexError(f"exercise_id {exercise_id!r} not found in exercise_ids")
    idx = matches[0]

    # Stable descending sort: ties keep their original row order.
    scores = np.asarray(exercise_sim[idx], dtype=float)
    order = np.argsort(-scores, kind='stable')

    # Exclude the query row by position rather than assuming it sorts first;
    # another exercise with an identical similarity score may precede it.
    order = order[order != idx][:max(top_n, 0)]

    return [exercise_ids[i] for i in order]

In [24]:
def content_based_recommender(user_id, top_n=5):
    """
    Recommend exercises similar to the user's most recently liked exercise,
    filtered by the user's preferred intensity and duration.

    Reads the module-level `exercise_recent_activity_df`, `user_profiles_df`,
    `exercise_main_df` and `exercise_ids`, and calls `get_similar_exercises`.

    Parameters
    ----------
    user_id : value matched against the 'User_Id' column.
    top_n : maximum number of exercise IDs to return.

    Returns
    -------
    list of exercise IDs in descending similarity order. May be shorter than
    `top_n` when few candidates pass the preference filter. When the user has
    no liked exercise, the first `top_n` entries of `exercise_ids` are returned.
    """
    intensity_tolerance = 2   # keep intensity within +/-2 of the preference
    duration_tolerance = 10   # keep duration within +/-10 of the preference

    # 1) Exercises this user has liked
    activity = exercise_recent_activity_df
    user_liked_df = activity[
        (activity['User_Id'] == user_id) & (activity['Liked'] == 1)
    ]

    if user_liked_df.empty:
        # No liked history: fall back to the first exercises in the catalogue
        return list(exercise_ids[:top_n])

    # 2) Most recent like. The activity log is not stored in chronological
    #    order, so order by Timestamp instead of trusting row position.
    #    Unparseable timestamps sort first; ties keep row order.
    if 'Timestamp' in user_liked_df.columns:
        liked_at = pd.to_datetime(user_liked_df['Timestamp'], errors='coerce')
        user_liked_df = user_liked_df.assign(_liked_at=liked_at).sort_values(
            '_liked_at', kind='stable', na_position='first'
        )
    last_liked_exercise_id = user_liked_df['Exercise_Id'].iloc[-1]

    # 3) Over-fetch candidates so the preference filter has room to work
    similar = get_similar_exercises(last_liked_exercise_id, top_n=top_n*2)

    # 4) Filter by the user's preferred intensity / duration when a profile exists
    profile_rows = user_profiles_df[user_profiles_df['User_Id'] == user_id]
    if profile_rows.empty:
        return list(similar[:top_n])
    user_profile = profile_rows.iloc[0]
    pref_intensity = user_profile['Preferred Intensity']
    pref_duration = user_profile['Preferred Duration']

    matching = exercise_main_df[
        (exercise_main_df['ID'].isin(similar)) &
        (exercise_main_df['Exercise Intensity'].between(
            pref_intensity - intensity_tolerance, pref_intensity + intensity_tolerance)) &
        (exercise_main_df['Duration'].between(
            pref_duration - duration_tolerance, pref_duration + duration_tolerance))
    ]
    allowed = set(matching['ID'])

    # Walk `similar` (already ranked) so the similarity order is preserved
    filtered = [ex_id for ex_id in similar if ex_id in allowed]
    return filtered[:top_n]

## Recommender systems: Content-based

In [25]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Numeric columns that describe each exercise
cols = ['Calories Burn', 'Duration', 'Heart Rate', 'BMI', 'Exercise Intensity']

# Working copy holding the ID plus the descriptive columns
exercise_features = exercise_main_df[['ID', *cols]].copy()

# Rescale every descriptive column to [0, 1] so no single feature dominates
scaler = MinMaxScaler()
exercise_features[cols] = scaler.fit_transform(exercise_features[cols])

# Row i of X belongs to the exercise whose ID is exercise_ids[i]
exercise_ids = exercise_features['ID'].to_numpy()
X = exercise_features[cols].to_numpy()

# Item-item cosine similarity (X compared against itself)
exercise_sim = cosine_similarity(X)

In [26]:
def get_similar_exercises(exercise_id, top_n=5):
    """
    Given an exercise_id, return the top_n most similar exercise IDs
    (excluding the exercise_id itself).

    Reads the module-level `exercise_ids` (ID of each matrix row) and
    `exercise_sim` (pairwise similarity matrix).

    Returns a list of exercise IDs ordered by descending similarity.
    Raises IndexError if `exercise_id` is not present in `exercise_ids`.
    """
    # Locate the row of the query exercise in the similarity matrix
    matches = np.flatnonzero(np.asarray(exercise_ids) == exercise_id)
    if matches.size == 0:
        raise IndexError(f"exercise_id {exercise_id!r} not found in exercise_ids")
    idx = matches[0]

    # Stable descending sort: ties keep their original row order
    scores = np.asarray(exercise_sim[idx], dtype=float)
    order = np.argsort(-scores, kind='stable')

    # Remove the query row explicitly. Slicing off the first sorted entry is
    # wrong when another exercise ties with it at the maximum similarity.
    order = order[order != idx][:max(top_n, 0)]

    # Map row positions back to exercise IDs
    return [exercise_ids[i] for i in order]

In [27]:
def content_based_recommender(user_id, top_n=5):
    """
    Recommend the top_n exercises most similar to the exercise the user
    liked most recently.

    Reads the module-level `exercise_recent_activity_df` and `exercise_ids`,
    and calls `get_similar_exercises`.

    Returns a list of exercise IDs. When the user has no liked exercise, the
    first `top_n` entries of `exercise_ids` are returned as a fallback.
    """
    # 1) The user's liked exercises
    activity = exercise_recent_activity_df
    user_liked_df = activity[
        (activity['User_Id'] == user_id) & (activity['Liked'] == 1)
    ]

    # 2) No liked history: fall back to the first few exercises
    if user_liked_df.empty:
        return list(exercise_ids[:top_n])

    # 3) Pick the most recent like by Timestamp. The activity log is not
    #    stored chronologically, so the last row is not necessarily the
    #    latest. Unparseable timestamps sort first; ties keep row order.
    if 'Timestamp' in user_liked_df.columns:
        liked_at = pd.to_datetime(user_liked_df['Timestamp'], errors='coerce')
        user_liked_df = user_liked_df.assign(_liked_at=liked_at).sort_values(
            '_liked_at', kind='stable', na_position='first'
        )
    last_liked_exercise_id = user_liked_df['Exercise_Id'].iloc[-1]

    # 4) Neighbours of that exercise in feature space
    return get_similar_exercises(last_liked_exercise_id, top_n=top_n)

In [28]:
# Smoke-test the content-based recommender on a single user ('User_1')
recs = content_based_recommender('User_1', top_n=5)
print("Content-Based Recommendations for User_1:", recs, sep="\n")

Content-Based Recommendations for User_1:
[np.int64(2478), np.int64(1923), np.int64(2653), np.int64(2311), np.int64(807)]


## Recommender systems: User-based collaborative filtering

In [29]:
import pandas as pd

# Derive a working frame (the original activity log is left untouched) in
# which the binary 'Liked' flag doubles as the implicit 'Rating'.
cf_data = exercise_recent_activity_df.assign(
    Rating=exercise_recent_activity_df['Liked']
)

# User x exercise rating matrix; pairs with no interaction are filled with 0.
user_item_matrix = pd.pivot_table(
    cf_data,
    values='Rating',
    index='User_Id',
    columns='Exercise_Id',
).fillna(0)

user_item_matrix.head()  # Inspect

Exercise_Id,1,2,4,5,6,7,8,9,10,11,...,489,490,491,492,493,494,495,497,499,500
User_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
User_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
User_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
User_100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
User_11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
User_12,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
from sklearn.metrics.pairwise import cosine_similarity

# Raw rating matrix as a numpy array (rows follow user_item_matrix.index)
user_item_matrix_values = user_item_matrix.to_numpy()

# user_sim_matrix[u][v] = cosine similarity between the rating rows of u and v
user_sim_matrix = cosine_similarity(user_item_matrix_values)

# Label both axes with the user IDs so lookups can use .loc[user_id]
user_labels = user_item_matrix.index
user_sim_df = pd.DataFrame(user_sim_matrix, index=user_labels, columns=user_labels)

In [31]:
def recommend_exercises_cf(user_id, top_n=5, k=5):
    """
    Recommend top_n exercises for the given user_id using
    user-based collaborative filtering with k neighbors.

    Reads the module-level `user_sim_df` (user x user similarity) and
    `user_item_matrix` (user x exercise ratings).

    Parameters
    ----------
    user_id : label present in both `user_sim_df` and `user_item_matrix`.
    top_n : maximum number of exercise IDs to return.
    k : number of most-similar users to consider as neighbours.

    Returns
    -------
    list of exercise IDs ordered by descending predicted score, excluding
    exercises the user already liked.

    Raises
    ------
    KeyError if `user_id` is not a known label.
    """
    # 1. Similarity of this user to every other user (self removed)
    sim_scores = user_sim_df.loc[user_id].drop(user_id)

    # 2. Top-k neighbours. Zero-similarity neighbours carry no weight, so
    #    dropping them leaves the weighted average unchanged while keeping
    #    the normaliser from becoming 0.
    neighbor_sims = sim_scores.sort_values(ascending=False).head(k)
    neighbor_sims = neighbor_sims[neighbor_sims > 0]

    if neighbor_sims.empty:
        # 3a. No informative neighbour (e.g. the user shares no liked item
        #     with anyone): dividing by a zero weight sum would make every
        #     score NaN, so fall back to overall popularity instead.
        weighted_scores = user_item_matrix.mean(axis=0)
    else:
        # 3b. Similarity-weighted average of the neighbours' ratings
        neighbor_ratings = user_item_matrix.loc[neighbor_sims.index]
        weighted_scores = (
            neighbor_ratings.mul(neighbor_sims, axis=0).sum(axis=0)
            / neighbor_sims.sum()
        )

    # 4. Exclude items the user has already liked
    user_history = user_item_matrix.loc[user_id]
    already_liked = user_history[user_history > 0].index
    weighted_scores = weighted_scores.drop(already_liked, errors='ignore')

    # 5. Recommend the top_n items
    recommended_exercises = weighted_scores.sort_values(ascending=False).head(top_n).index

    return recommended_exercises.tolist()

In [32]:
# Demo: five user-based CF recommendations for 'User_1' using five neighbours
user_id_test = 'User_1'
cf_recommendations = recommend_exercises_cf(user_id=user_id_test, top_n=5, k=5)
print(f"CF Recommendations for {user_id_test}: {cf_recommendations}")

CF Recommendations for User_1: [371, 125, 388, 316, 433]


## Recommender systems: Item-based collaborative filtering

In [33]:
import pandas as pd

# The binary 'Liked' flag (0 or 1) is used as the rating signal; the original
# activity log is left unmodified because `assign` returns a new frame.
cf_data = exercise_recent_activity_df.assign(
    Rating=exercise_recent_activity_df['Liked']
)

# Users as rows, exercises as columns; missing pairs become 0
user_item_matrix = pd.pivot_table(
    cf_data,
    values='Rating',
    index='User_Id',
    columns='Exercise_Id',
).fillna(0)

user_item_matrix.head()  # Inspect

Exercise_Id,1,2,4,5,6,7,8,9,10,11,...,489,490,491,492,493,494,495,497,499,500
User_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
User_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
User_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
User_100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
User_11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
User_12,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
from sklearn.metrics.pairwise import cosine_similarity

# 1) Transpose so each row is an exercise, each column a user
item_user_matrix = user_item_matrix.T  # shape: (#exercises, #users)

# 2) Compute item-item similarity
item_sim_matrix = cosine_similarity(item_user_matrix.values, item_user_matrix.values)

# 3) Turn into a DataFrame for convenience.
#    A dedicated name is used for the labels: the global `exercise_ids` is the
#    row-to-ID mapping of the content-based `exercise_sim` matrix, and
#    rebinding it here would break get_similar_exercises() and
#    content_based_recommender() for any call made after this cell.
item_cf_exercise_ids = item_user_matrix.index  # exercise IDs present in the rating matrix
item_sim_df = pd.DataFrame(item_sim_matrix,
                           index=item_cf_exercise_ids,
                           columns=item_cf_exercise_ids)

In [35]:
def recommend_exercises_itemcf(user_id, top_n=5):
    """
    Item-based collaborative filtering for a single user.

    Reads the module-level `user_item_matrix` (user x exercise ratings) and
    `item_sim_df` (exercise x exercise similarity). Every exercise is scored
    by the sum of its similarities to the exercises the user liked, each
    weighted by the user's rating for the liked exercise.

    Returns a list of up to `top_n` exercise IDs, best first, excluding the
    exercises already liked. A user with no likes gets the first `top_n`
    columns of `user_item_matrix`.
    """
    likes = user_item_matrix.loc[user_id]
    liked_ids = likes[likes > 0].index

    # Guard clause: nothing liked yet, use the fallback
    if liked_ids.empty:
        return user_item_matrix.columns[:top_n].tolist()

    # Running total of weighted similarity for every candidate exercise
    totals = pd.Series(0, index=item_sim_df.columns, dtype=float)
    for liked in liked_ids:
        contribution = item_sim_df.loc[liked] * likes[liked]
        totals = totals.add(contribution, fill_value=0)

    # Drop what the user already liked, rank the rest, keep the best top_n
    ranked = totals.drop(liked_ids, errors='ignore').sort_values(ascending=False)
    return ranked.head(top_n).index.tolist()

In [36]:
# Demo: five item-based CF recommendations for 'User_1'
user_test = 'User_1'
itemcf_recs = recommend_exercises_itemcf(user_id=user_test, top_n=5)
print(f"Item-Based CF Recommendations for {user_test}:", itemcf_recs)

Item-Based CF Recommendations for User_1: [125, 392, 388, 316, 179]
