In [8]:
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd

users_df = pd.read_csv('../data_engineering/staging/transformed_users.csv')
courses_df = pd.read_csv('../data_engineering/staging/transformed_courses.csv')
enrollments_df = pd.read_csv('../data_engineering/staging/transformed_enrollment.csv')
performance_df = pd.read_csv('../data_engineering/staging/transformed_performance.csv')
learning_paths_df = pd.read_csv('../data_engineering/staging/transformed_learning_path.csv')

print(users_df.head())
print(courses_df.head())
print(enrollments_df.head())
print(performance_df.head())
print(learning_paths_df.head())


   id            name
0   5        John Doe
1   6      Jane Smith
2   7     Bob Johnson
3   8  Alice Williams
4   9   Charlie Brown
   id                 title  duration    difficulty  rating            domain
0   1       Cloud Computing   8 hours      advanced     3.3      Data Science
1   2       Web Development  42 hours      beginner     3.7   Web Development
2   3  Intro to Programming  31 hours      beginner     1.0  Computer Science
3   4       Cloud Computing  84 hours      advanced     1.6  Computer Science
4   5       Data Structures  72 hours  intermediate     3.8   Cloud Computing
   id  user_id  course_id enrollment_date completion_date
0   1       19         90      2024-06-26      2024-07-15
1   2        1         58      2024-09-21      2024-09-21
2   3       89         47      2024-08-14      2025-01-25
3   4       38         55      2023-11-29             NaN
4   5       69          3      2024-06-07      2024-07-03
   id  user_id  enrollment_id  score grade  progress

In [10]:
print(users_df.info())
print(courses_df.info())
print(enrollments_df.info())
print(performance_df.info())
print(learning_paths_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      50 non-null     int64 
 1   name    50 non-null     object
dtypes: int64(1), object(1)
memory usage: 932.0+ bytes
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          100 non-null    int64  
 1   title       100 non-null    object 
 2   duration    100 non-null    object 
 3   difficulty  100 non-null    object 
 4   rating      100 non-null    float64
 5   domain      100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0 

In [11]:
enrollments_df['completion_date'].fillna(pd.to_datetime('today'), inplace=True)

user_course_df = pd.merge(enrollments_df, performance_df, on='id', how='left')

learning_path_df = pd.merge(learning_paths_df, courses_df, left_on='domain', right_on='domain', how='left')

print(user_course_df.isnull().sum())
print(learning_path_df.isnull().sum())


id                    0
user_id_x             0
course_id             0
enrollment_date       0
completion_date       0
user_id_y             0
enrollment_id         0
score                 0
grade                 0
progress              0
certificate_earned    0
dtype: int64
id_x             0
domain           0
title_x          0
courses          0
duration0        0
difficulty_x     0
id_y            20
title_y         20
duration        20
difficulty_y    20
rating          20
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  enrollments_df['completion_date'].fillna(pd.to_datetime('today'), inplace=True)


In [12]:
user_path_matrix = user_course_df.pivot_table(index='id', columns='course_id', values='score', fill_value=0)

print(user_path_matrix.head())


course_id   1     3    5    6    8    9    10   11   12   15  ...   81   83  \
id                                                            ...             
1          0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
2          0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
3          0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
4          0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
5          0.0  57.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   

course_id   84   85   86   87    90   94   95   99  
id                                                  
1          0.0  0.0  0.0  0.0  73.0  0.0  0.0  0.0  
2          0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0  
3          0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0  
4          0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0  
5          0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0  

[5 rows x 65 columns]


In [13]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [14]:

from sklearn.decomposition import TruncatedSVD
import numpy as np

svd = TruncatedSVD(n_components=10, random_state=42)  
matrix = svd.fit_transform(user_path_matrix)

reduced_matrix = pd.DataFrame(matrix, index=user_path_matrix.index)
print(reduced_matrix.head())


def recommend_courses(user_id, num_recommendations=5):
    user_idx = user_path_matrix.index.get_loc(user_id)

    user_vector = reduced_matrix.iloc[user_idx]

    similarities = np.dot(reduced_matrix, user_vector)

    similar_users = np.argsort(similarities)[::-1][1:num_recommendations + 1]

    recommended_courses = set()
    for sim_user in similar_users:
        recommended_courses.update(user_path_matrix.columns[(user_path_matrix.iloc[sim_user] > 0)])

    return list(recommended_courses)

user_id = 1  
recommendations = recommend_courses(user_id)
print(f"Recommended courses for user {user_id}: {recommendations}")

           0         1         2         3         4          5          6  \
id                                                                           
1   0.003479  0.004599  0.010575  0.064796 -0.036681  -0.042441   0.125935   
2   0.018729  0.009734  0.084770  0.253843 -0.357579  -0.114231   0.190195   
3   0.018154 -0.010328 -0.083142 -0.437394  0.416443  -0.658820  -0.558614   
4  -0.000397 -0.000588 -0.001325  0.002558  0.005361  -0.001338   0.008919   
5   0.060046  0.033703 -0.086976 -1.081342 -1.794742  45.420635 -29.338790   

            7         8         9  
id                                 
1   -0.165537 -0.096418  0.299961  
2   -0.796799 -0.237165 -0.251318  
3    1.043306 -2.964001  2.261585  
4   -0.003099  0.010753  0.009421  
5   12.608849  6.589204  1.857121  
Recommended courses for user 1: [64, 10, 83, 23]


In [22]:
import pandas as pd

def evaluate_recommendations(true_courses, recommended_courses):
    true_set = set(true_courses)
    recommended_set = set(recommended_courses)

    tp = len(true_set.intersection(recommended_set)) 
    fn = len(true_set - recommended_set)  

    precision = tp / len(recommended_set) if recommended_set else 0
    recall = tp / len(true_set) if true_set else 0
    accuracy = tp / len(recommended_set) if recommended_set else 0

    return precision, recall, accuracy, tp, fn

df = pd.read_csv("./staging/transformed_courses.csv")
true_courses = df['id'].tolist() 

user_id = 1 
recommended_courses = recommend_courses(user_id)

precision, recall, accuracy, tp, fn = evaluate_recommendations(true_courses, recommended_courses)
print(f"Precision: {precision}, Recall (Sensitivity): {recall}, Accuracy: {accuracy}, True Positives: {tp}, False Negatives: {fn}")


Precision: 1.0, Recall (Sensitivity): 0.04, Accuracy: 1.0, True Positives: 4, False Negatives: 96


In [19]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error

def evaluate_svd(n_components, interaction_matrix, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    precision_scores = []
    recall_scores = []

    for train_index, test_index in kf.split(interaction_matrix):
        train_data = interaction_matrix.iloc[train_index]
        test_data = interaction_matrix.iloc[test_index]

        svd = TruncatedSVD(n_components=n_components, random_state=42)
        train_matrix = svd.fit_transform(train_data)

        reduced_train_matrix = pd.DataFrame(train_matrix, index=train_data.index)

        for user_id in test_data.index:
            true_courses = test_data.columns[(test_data.loc[user_id] > 0)].tolist()
            recommendations = recommend_courses(user_id)  

            precision, recall = evaluate_recommendations(true_courses, recommendations)
            precision_scores.append(precision)
            recall_scores.append(recall)

    avg_precision = np.mean(precision_scores)
    avg_recall = np.mean(recall_scores)

    return avg_precision, avg_recall

component_range = [5, 10, 15, 20, 25]

results = {}

for n in component_range:
    avg_precision, avg_recall = evaluate_svd(n, user_path_matrix)
    results[n] = (avg_precision, avg_recall)
    print(f"n_components: {n} -> Average Precision: {avg_precision}, Average Recall: {avg_recall}")

best_n = max(results, key=lambda x: results[x][0]) 
print(f"Best n_components: {best_n} with Precision: {results[best_n][0]}, Recall: {results[best_n][1]}")


n_components: 5 -> Average Precision: 0.134, Average Recall: 0.44
n_components: 10 -> Average Precision: 0.134, Average Recall: 0.44
n_components: 15 -> Average Precision: 0.134, Average Recall: 0.44
n_components: 20 -> Average Precision: 0.134, Average Recall: 0.44
n_components: 25 -> Average Precision: 0.134, Average Recall: 0.44
Best n_components: 5 with Precision: 0.134, Recall: 0.44


In [27]:
from tabulate import tabulate

In [28]:

def calculate_kpis(true_courses, recommended_courses):
    true_set = set(true_courses)
    recommended_set = set(recommended_courses)

    tp = len(true_set.intersection(recommended_set))  
    fp = len(recommended_set - true_set)             
    fn = len(true_set - recommended_set)              

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1_score

def calculate_user_engagement(recommended_courses, total_users):
    engaged_users = len(set(recommended_courses)) 
    engagement_rate = (engaged_users / total_users) * 100 if total_users > 0 else 0
    return engagement_rate

def calculate_ctr(recommended_courses, total_recommendations_shown):
    clicks = len(set(recommended_courses))  
    ctr = (clicks / total_recommendations_shown) * 100 if total_recommendations_shown > 0 else 0
    return ctr

def calculate_completion_rate(true_courses, recommended_courses):
    true_set = set(true_courses)
    recommended_set = set(recommended_courses)
    
    completed_courses = len(true_set.intersection(recommended_set))  
    completion_rate = (completed_courses / len(recommended_courses)) * 100 if len(recommended_courses) > 0 else 0
    return completion_rate

def display_kpis(precision, recall, f1_score, engagement_rate, ctr, completion_rate):

    kpi_data = [
        ["Precision", f"{precision:.2f}"],
        ["Recall (Sensitivity)", f"{recall:.2f}"],
        ["F1 Score", f"{f1_score:.2f}"],
        ["Engagement Rate", f"{engagement_rate:.2f}%"],
        ["Click-Through Rate (CTR)", f"{ctr:.2f}%"],
        ["Course Completion Rate", f"{completion_rate:.2f}%"]
    ]
    
    print("\n--- Recommendation System KPIs ---")
    print(tabulate(kpi_data, headers=["Metric", "Value"], tablefmt="grid"))
    print("-----------------------------------\n")
df = pd.read_csv("../data_engineering/staging/transformed_courses.csv")
true_courses = df['id'].tolist() 

user_id = 1 
recommended_courses = recommend_courses(user_id)

precision, recall, f1_score = calculate_kpis(true_courses, recommended_courses)
total_users = len(true_courses)  
total_recommendations_shown = 100

engagement_rate = calculate_user_engagement(recommended_courses, total_users)
ctr = calculate_ctr(recommended_courses, total_recommendations_shown)
completion_rate = calculate_completion_rate(true_courses, recommended_courses)

display_kpis(precision, recall, f1_score, engagement_rate, ctr, completion_rate)



--- Recommendation System KPIs ---
+--------------------------+---------+
| Metric                   | Value   |
| Precision                | 1.00    |
+--------------------------+---------+
| Recall (Sensitivity)     | 0.04    |
+--------------------------+---------+
| F1 Score                 | 0.08    |
+--------------------------+---------+
| Engagement Rate          | 4.00%   |
+--------------------------+---------+
| Click-Through Rate (CTR) | 4.00%   |
+--------------------------+---------+
| Course Completion Rate   | 100.00% |
+--------------------------+---------+
-----------------------------------

