### Recommendations Engine Light FM Version
This is the main file for the recommendations engine that uses LightFM to predict recommendations for the user.


Note: Try and use this on a linux environment to make sure that everything is working properly.

In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from lightfm import LightFM
from lightfm.data import  Dataset



In [2]:
# Load the course datasets
course_df = pd.read_csv('data/courses.csv')
course_df.drop(columns=['createdAt', 'updatedAt', 'deletedAt', 'description'], axis=1, inplace=True)
course_interaction_df = pd.read_csv('data/course_interactions.csv')
course_interaction_df.drop(columns=['id','createdAt', 'updatedAt', 'deletedAt'], axis=1, inplace=True)

In [3]:
# # Group the course interactions by user and course and add a interactions column
# course_interaction_df = course_interaction_df.groupby(['userId', 'courseId']).size().reset_index(name='interactions')
# course_interaction_df

#### Create a dataset for the recommendations engine

In [4]:
# Create a lightfm dataset
ds = Dataset()


# Fit the dataset
ds.fit(
    users=course_interaction_df['userId'],
    items=course_interaction_df['courseId'],
)

ds.fit_partial(
    items=course_df['id'],
    item_features=course_df['name'],
)

In [5]:
# Check the current shape of the dataset
print('Num users: {}, num_items {}.'.format(ds.interactions_shape()[0], ds.interactions_shape()[1]))

Num users: 8, num_items 3255.


In [6]:
item_features = ds.build_item_features(((x['id'],[x['name']]) for _, x in course_df.iterrows()))
(interactions, weights) = ds.build_interactions(((x['userId'], x['courseId']) for _, x in course_interaction_df.iterrows()))


# Split the data into training and test sets
from lightfm.cross_validation import random_train_test_split

train, test = random_train_test_split(interactions, test_percentage=0.5, random_state=np.random.RandomState(42))
print('Num users: {}, num_items {}.'.format(train.shape[0], train.shape[1]))
print('Num users: {}, num_items {}.'.format(test.shape[0], test.shape[1]))

Num users: 8, num_items 3255.
Num users: 8, num_items 3255.


In [7]:
# Create a loop to test the model with different hyperparameters
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

params = {
    'no_components': [10, 20, 40, 80, 160],
    'learning_schedule': ['adagrad', 'adadelta'],
    'loss': ['logistic', 'bpr', 'warp', 'warp-kos'],
    'learning_rate': [0.05, 0.1, 0.2, 0.5, 1.0],
    'item_alpha': [0.0, 0.0001, 0.0005, 0.001, 0.005],
    'user_alpha': [0.0, 0.0001, 0.0005, 0.001, 0.005],
    'max_sampled': [5, 10, 15, 20, 25],
    'num_epochs': [5, 10, 15, 20, 25],
    'random_state': [42]
}

from itertools import product

param_grid = list(product(*params.values()))
print(len(param_grid))

# Create a function to test the model with different hyperparameters
def test_model(params):
    no_components, learning_schedule, loss, learning_rate, item_alpha, user_alpha, max_sampled, num_epochs, random_state = params
    model = LightFM(no_components=no_components,
                    learning_schedule=learning_schedule,
                    loss=loss,
                    learning_rate=learning_rate,
                    item_alpha=item_alpha,
                    user_alpha=user_alpha,
                    max_sampled=max_sampled,
                    random_state=random_state)
    model.fit(train, epochs=num_epochs, num_threads=4)
    train_precision = precision_at_k(model, train, k=10).mean()
    test_precision = precision_at_k(model, test, k=10).mean()
    return train_precision, test_precision

# Test the model with different hyperparameters
# results = []
# for i in range(len(param_grid)):
#     print('Iteration {}/{}'.format(i+1, len(param_grid)))
#     results.append(test_model(param_grid[i]))

# # Print the results
# results_df = pd.DataFrame(results, columns=['train_precision', 'test_precision'])
# results_df['params'] = param_grid
# results_df.sort_values('test_precision', ascending=False, inplace=True)

# # Print the best results
# print(results_df.head(10))

125000


In [8]:
# Save the results to a csv
# results_df.to_csv('data/results.csv', index=False)

In [9]:
n_users, n_items = interactions.shape
print('Num users: {}, num_items {}.'.format(n_users, n_items))

Num users: 8, num_items 3255.


In [27]:
model = LightFM(no_components=80,
                loss='warp',
                learning_rate=0.001,
                item_alpha=0.0001,
                random_state=42)

model.fit(train, epochs=30, num_threads=4)

<lightfm.lightfm.LightFM at 0x12f24cdc0>

In [28]:
for i in range(n_users):
    scores = model.predict(i, np.arange(n_items))
    top_items = course_df['name'][np.argsort(-scores)]
    print('User {}'.format(i))
    print("Top Items: ")
    print(top_items.head(10).to_string(index=False))
    print("")

    

User 0
Top Items: 
                 LeetCode's Interview Crash Course
8 Best Computer Science Courses for Beginners t...
       Introduction to Computer Science (CMPT 141)
              Study Computing - Study In Australia
Free Java Course Online for Beginners | Java Pr...
Introduction to Computer Science ICS3U Online C...
                                        Learn Java
    Top 6 Online SQL Courses for Beginners in 2023
               Introduction to Computing Science I
      Top Java Courses Online - Updated [May 2023]

User 1
Top Items: 
                 LeetCode's Interview Crash Course
       Introduction to Computer Science (CMPT 141)
8 Best Computer Science Courses for Beginners t...
              Study Computing - Study In Australia
               Introduction to Computing Science I
                                        Learn Java
Free Java Course Online for Beginners | Java Pr...
                  Introduction to Computer Science
    Top 6 Online SQL Courses for Beginners 

#### Let's start saving the model so we can use it later on

In [12]:
import pickle

# Save the dataset
with open('data/dataset.pickle', 'wb') as f:
    pickle.dump(dataset, f, protocol=pickle.HIGHEST_PROTOCOL)
    

with open('data/model.pickle', 'wb') as f:
    pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)

NameError: name 'dataset' is not defined

In [None]:
# Let's try opening the saved model and see if it works
from typing import cast
with open('data/model.pickle', 'rb') as f:
    loaded_model = cast(LightFM, pickle.load(f))
    scores = loaded_model.predict(0, np.arange(n_items))
    top_items = course_df['name'][np.argsort(-scores)]
    print(top_items.head(10))

11         Top Java Courses Online - Updated [May 2023]
4                                             Learn SQL
17    CS50: Introduction to Computer Science | Harva...
25                     Introduction to Computer Science
5     Learn SQL with Online Courses, Classes, & Lessons
19              CS50's Introduction to Computer Science
7       Introduction to SQL Course | Get Started in SQL
9                  Study Computing - Study In Australia
28                          CS 101 - Intro to Computers
31    Introduction to Computer Programming from Cour...
Name: name, dtype: object
