### Recommendations Engine Light FM Version
This is the main file for the recommendations engine that uses LightFM to predict recommendations for the user.


Note: Try and use this on a linux environment to make sure that everything is working properly.

In [336]:
import pandas as pd
import numpy as np
from scipy import sparse
from lightfm import LightFM
from lightfm.data import  Dataset

In [337]:
# Load the course datasets
course_df = pd.read_csv('data/Course.csv')
course_df.drop(columns=['createdAt', 'updatedAt', 'deletedAt'], axis=1, inplace=True)
course_interaction_df = pd.read_csv('data/CourseInteraction.csv')
course_interaction_df.drop(columns=['id','createdAt', 'updatedAt', 'deletedAt'], axis=1, inplace=True)

In [338]:
# Add a column to the course_interaction_df to indicate the number of times a user has interacted with a course
course_interaction_df['interaction_count'] = course_interaction_df.groupby(['userId', 'courseId'])['userId'].transform('count')


In [339]:
# Rename course id to courseId
course_df.rename(columns={'id':'courseId'}, inplace=True)
course_df.head()


Unnamed: 0,courseId,name,description,url
0,53,LearnSQL - Learn & Practice SQL,Filter by: · Learn SQL Basics for Data Science...,https://www.coursera.org/courses?query=sql
1,54,Google Cloud SQL - 90-day free trial,Top courses in SQL and Database Management · C...,https://www.udemy.com/topic/sql/
2,55,Best SQL Courses [2023],"In this SQL course, you'll learn how to manage...",https://www.codecademy.com/learn/learn-sql
3,56,Top SQL Courses Online - Updated [May 2023],"Apr 25, 2023 — Explore online SQL courses from...",https://www.edx.org/learn/sql
4,57,Learn SQL,"Nov 15, 2022 — Our Top 6 Online SQL Courses fo...",https://learnsql.com/blog/best-online-sql-cour...


#### Create a dataset for the recommendations engine

In [340]:
# Create a lightfm dataset
ds = Dataset()


# Fit the dataset
ds.fit(
    users=course_interaction_df['userId'],
    user_features=course_interaction_df['interaction_count'],
    items=course_interaction_df['courseId'],
)

ds.fit_partial(
    items=course_df['courseId'],
    item_features=course_df['name']
)

ds.fit_partial(
    item_features=course_df['description']
)

item_features = ds.build_item_features(((x['courseId'], [x['name'], x['description']]) for _, x in course_df.iterrows()))
user_features = ds.build_user_features(((x['userId'], [x['interaction_count']]) for _, x in course_interaction_df.iterrows()))
(interactions, weights) = ds.build_interactions(((x['userId'], x['courseId']) for _, x in course_interaction_df.iterrows()))

In [341]:
# Check the current shape of the dataset
print('Num users: {}, num_items {}.'.format(ds.interactions_shape()[0], ds.interactions_shape()[1]))

Num users: 12, num_items 3344.


In [342]:
# Split the data into training and test sets
from lightfm.cross_validation import random_train_test_split

train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=np.random.RandomState(42))
print('Num users: {}, num_items {}.'.format(train.shape[0], train.shape[1]))
print('Num users: {}, num_items {}.'.format(test.shape[0], test.shape[1]))

Num users: 12, num_items 3344.
Num users: 12, num_items 3344.


In [343]:
# Create a loop to test the model with different hyperparameters
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from itertools import product
from tqdm import tqdm


params = {
    'no_components': [10, 20, 40, 80, 160],
    'learning_schedule': ['adagrad', 'adadelta'],
    'loss': ['logistic', 'bpr', 'warp', 'warp-kos'],
    'learning_rate': [0.05, 0.1, 0.2, 0.5, 1.0],
    'item_alpha': [0.0, 0.0001, 0.0005, 0.001, 0.005],
    'user_alpha': [0.0, 0.0001, 0.0005, 0.001, 0.005],
    'max_sampled': [5, 10, 15, 20, 25],
    'num_epochs': [5, 10, 15, 20, 25],
    'random_state': [42]
}


param_grid = list(product(*params.values()))
print(len(param_grid))

# Create a function to test the model with different hyperparameters
def test_model(params):
    no_components, learning_schedule, loss, learning_rate, item_alpha, user_alpha, max_sampled, num_epochs, random_state = params
    model = LightFM(no_components=no_components,
                    learning_schedule=learning_schedule,
                    loss=loss,
                    learning_rate=learning_rate,
                    item_alpha=item_alpha,
                    user_alpha=user_alpha,
                    max_sampled=max_sampled,
                    random_state=random_state)
    
    model.fit(train, epochs=num_epochs, num_threads=8)
    train_precision = precision_at_k(model, train, k=10).mean()
    test_precision = precision_at_k(model, test, k=10).mean()
    return train_precision, test_precision

# Test the model with different hyperparameters
# results = []
# with tqdm(total=len(param_grid)) as pbar:
#     for i in range(len(param_grid)):
#         params = param_grid[i]
#         pbar.update(1)
#         pbar.set_postfix_str("Features: {}".format(params))
#         results.append(test_model(params))


# # Print the results
# results_df = pd.DataFrame(results, columns=['train_precision', 'test_precision'])
# results_df['params'] = param_grid
# results_df.sort_values('test_precision', ascending=False, inplace=True)

# # Save the results to a csv
# results_df.to_csv('data/results.csv', index=False)

# # Print the best results
# results_df.head(10)



125000


In [344]:
n_users, n_items = interactions.shape
print('Num users: {}, num_items {}.'.format(n_users, n_items))

Num users: 12, num_items 3344.


In [348]:
from lightfm.evaluation import auc_score, recall_at_k, reciprocal_rank
# 160, adadelta, bpr, 0.1, 0.001, 0.0001, 25, 2
model = LightFM(no_components=160,
                learning_schedule='adadelta',
                loss='warp',
                learning_rate=0.1,
                item_alpha=0.00001,
                user_alpha=0.001,
                max_sampled=35,
                random_state=42,
               )

model.fit(train, 
          user_features=user_features,
          item_features=item_features,
          epochs=20, num_threads=4)

score= auc_score(model, test, user_features=user_features, item_features=item_features).mean()
print('Test AUC: {:.2f}'.format(score * 100))

accuracy = precision_at_k(model, test, user_features=user_features, item_features=item_features, k=10).mean()
print('Test precision: {:.2f}'.format(accuracy * 100))

recall = recall_at_k(model, test, user_features=user_features, item_features=item_features, k=10).mean()
print('Test recall: {:.2f}'.format(recall * 100))

rank = reciprocal_rank(model, test, user_features=user_features, item_features=item_features).mean()
print('Test rank: {:.2f}'.format(rank * 100))

Test AUC: 71.82
Test precision: 5.00
Test recall: 9.00
Test rank: 14.93


In [None]:
for i in range(n_users):
    scores = model.predict(i, np.arange(n_items))
    top_items = course_df['name'][np.argsort(-scores)]
    print('User {}'.format(i))
    print("Top Items: ")
    print(top_items.head(5).to_string(index=False))
    print("")


User 0
Top Items: 
Data Structures and Algorithms Specialization (...
8 Best Computer Science Courses for Beginners t...
Course Unit BSCS1003 Data Structures and Algori...
                           Best SQL Courses [2023]
       Introduction to Computer Science (CMPT 141)

User 1
Top Items: 
Data Structures and Algorithms Specialization (...
8 Best Computer Science Courses for Beginners t...
                           Best SQL Courses [2023]
       Introduction to Computer Science (CMPT 141)
Course Unit BSCS1003 Data Structures and Algori...

User 2
Top Items: 
Intro to Object Oriented Programming - Crash Co...
         The best horticulture courses for 2022/23
7 Best Free Object-Oriented Programming Online ...
Object-oriented programming and algorithms - St...
Best Aviation Courses for Filipino Students - A...

User 3
Top Items: 
Course Unit BSCS1003 Data Structures and Algori...
Data Structures and Algorithms Specialization (...
8 Best Computer Science Courses for Beginners t...
   

#### Let's start saving the model so we can use it later on

In [None]:
import pickle

# Save the dataset
with open('data/dataset.pickle', 'wb') as f:
    pickle.dump(dataset, f, protocol=pickle.HIGHEST_PROTOCOL)
    

with open('data/model.pickle', 'wb') as f:
    pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)

NameError: name 'dataset' is not defined

In [None]:
# Let's try opening the saved model and see if it works
from typing import cast
with open('data/model.pickle', 'rb') as f:
    loaded_model = cast(LightFM, pickle.load(f))
    scores = loaded_model.predict(0, np.arange(n_items))
    top_items = course_df['name'][np.argsort(-scores)]
    print(top_items.head(10))

11         Top Java Courses Online - Updated [May 2023]
4                                             Learn SQL
17    CS50: Introduction to Computer Science | Harva...
25                     Introduction to Computer Science
5     Learn SQL with Online Courses, Classes, & Lessons
19              CS50's Introduction to Computer Science
7       Introduction to SQL Course | Get Started in SQL
9                  Study Computing - Study In Australia
28                          CS 101 - Intro to Computers
31    Introduction to Computer Programming from Cour...
Name: name, dtype: object
