# **Content-based Course Recommender System Using User Profile and Course Genres**


In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [2]:
# also set a random state
rs = 123

### Generating course recommendations based on user profile and course genre vectors


First, we will load a user's profile dataframe and a course genre dataframe:


In [23]:
course_genre_url = "data/course_genre.csv"
course_genres_df = pd.read_csv(course_genre_url)

In [24]:
course_genres_df.head()

Unnamed: 0,COURSE_ID,TITLE,Database,Python,CloudComputing,DataAnalysis,Containers,MachineLearning,ComputerVision,DataScience,BigData,Chatbot,R,BackendDev,FrontendDev,Blockchain
0,ML0201EN,robots are coming build iot apps with watson ...,0,0,0,0,0,0,0,0,0,0,0,1,1,0
1,ML0122EN,accelerating deep learning with gpu,0,1,0,0,0,1,0,1,0,0,0,0,0,0
2,GPXX0ZG0EN,consuming restful services using the reactive ...,0,0,0,0,0,0,0,0,0,0,0,1,1,0
3,RP0105EN,analyzing big data in r using apache spark,1,0,0,1,0,0,0,0,1,0,1,0,0,0
4,GPXX0Z2PEN,containerizing packaging and running a sprin...,0,0,0,0,1,0,0,0,0,0,0,1,0,0


In [25]:
profile_genre_url = "data/user_profile.csv"
profile_df = pd.read_csv(profile_genre_url)

In [26]:
profile_df.head()

Unnamed: 0,user,Database,Python,CloudComputing,DataAnalysis,Containers,MachineLearning,ComputerVision,DataScience,BigData,Chatbot,R,BackendDev,FrontendDev,Blockchain
0,2,52.0,14.0,6.0,43.0,3.0,33.0,0.0,29.0,41.0,2.0,18.0,34.0,9.0,6.0
1,4,40.0,2.0,4.0,28.0,0.0,14.0,0.0,20.0,24.0,0.0,6.0,6.0,0.0,2.0
2,5,24.0,8.0,18.0,24.0,0.0,30.0,0.0,22.0,14.0,2.0,14.0,26.0,4.0,6.0
3,7,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
4,8,6.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,6.0,0.0,2.0,0.0,0.0,0.0


The profile dataframe contains the course interests for each user, for example, user 8 is very interested in R, data analysis, database, and big data:


In [27]:
profile_df[profile_df['user'] == 8]

Unnamed: 0,user,Database,Python,CloudComputing,DataAnalysis,Containers,MachineLearning,ComputerVision,DataScience,BigData,Chatbot,R,BackendDev,FrontendDev,Blockchain
4,8,6.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,6.0,0.0,2.0,0.0,0.0,0.0


Next, let's load a test dataset, containing test users to whom we want to make course recommendations:


In [28]:
test_users_url = "data/ratings.csv"
test_users_df = pd.read_csv(test_users_url)

In [29]:
test_users_df.head()

Unnamed: 0,user,item,rating
0,1889878,CC0101EN,5
1,1342067,CL0101EN,3
2,1990814,ML0120ENv3,5
3,380098,BD0211EN,5
4,779563,DS0101EN,3


Let's look at how many test users we have in the dataset.


In [30]:
# Collapse the ratings to one row per user (groupby orders the rows by user id);
# reset_index() turns `user` back into a regular column
test_users = test_users_df.groupby('user').max().reset_index()

# Plain Python list holding each distinct user id once
test_user_ids = test_users['user'].tolist()

# Report how many distinct test users there are
print(f"Total numbers of test users {len(test_user_ids)}")


Total numbers of test users 33901


Then for each test user in the test dataset, you need to first find out which courses are unknown/unselected to them. For example, suppose we have a user `1078030` with profile:


In [31]:
test_user_profile = profile_df[profile_df['user'] == 1078030]
test_user_profile

Unnamed: 0,user,Database,Python,CloudComputing,DataAnalysis,Containers,MachineLearning,ComputerVision,DataScience,BigData,Chatbot,R,BackendDev,FrontendDev,Blockchain
18204,1078030,0.0,12.0,0.0,9.0,0.0,12.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Now let's get the test user vector by excluding the `user` column
test_user_vector = test_user_profile.iloc[0, 1:].values
test_user_vector

array([ 0., 12.,  0.,  9.,  0., 12.,  0.,  6.,  0.,  0.,  0.,  0.,  0.,
        0.])

We can first find their enrolled courses in `test_users_df`:


In [33]:
enrolled_courses = test_users_df[test_users_df['user'] == 1078030]['item'].to_list()
enrolled_courses = set(enrolled_courses)

In [34]:
enrolled_courses

{'DA0101EN',
 'DV0101EN',
 'ML0101ENv3',
 'ML0115EN',
 'ML0120ENv2',
 'ML0122ENv1',
 'PY0101EN',
 'ST0101EN'}

We then collect the IDs of all available courses into a set:


In [37]:
all_courses = set(course_genres_df['COURSE_ID'].values)
#all_courses

Then we can subtract the enrolled courses from the full course set to get the set of all courses unknown to user `1078030`. The courses this user may be interested in are hidden somewhere in this unknown course list.


In [38]:
unknown_courses = all_courses.difference(enrolled_courses)
#unknown_courses

We can get the genre vectors for those unknown courses as well:


In [39]:
unknown_course_genres = course_genres_df[course_genres_df['COURSE_ID'].isin(unknown_courses)]
# Now let's get the course matrix by excluding `COURSE_ID` and `TITLE` columns:
course_matrix = unknown_course_genres.iloc[:, 2:].values
course_matrix

array([[0, 0, 0, ..., 1, 1, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0]], dtype=int64)

In [40]:
score = np.dot(course_matrix[1], test_user_vector)
score

30.0

Next, let's calculate the recommendation scores of all unknown courses for all the test users.


In [41]:
# Re-read the three CSV files (course genres, user profiles, ratings) from their
# configured paths so the batch run below starts from freshly loaded dataframes
course_genres_df = pd.read_csv(course_genre_url)
profile_df = pd.read_csv(profile_genre_url)
test_users_df = pd.read_csv(test_users_url)

# Empty container for the recommendation results
res_dict = dict()


We only want to recommend courses with very high scores so we may set a score threshold to filter out those courses with low scores.


In [51]:
# Only keep the score larger than the recommendation threshold
# The threshold can be fine-tuned to adjust the size of generated recommendations
score_threshold = 20.0

We defined a function called `generate_recommendation_scores()` to compute the recommendation scores of all the unknown courses for all test users.


In [52]:
def generate_recommendation_scores(user_ids=None, profiles=None, ratings=None,
                                   course_genres=None, threshold=None):
    """
    Generate recommendation scores for users and the courses they have not taken.

    For every user the score of a course is the dot product between the user's
    profile vector and the course's genre vector. Courses the user already has
    in `ratings`, and courses scoring below `threshold`, are left out.

    Every argument is optional; when omitted, the notebook-level object of the
    same role is used, so the original zero-argument call keeps working.

    Parameters:
    user_ids (iterable): User IDs to score. Defaults to `test_user_ids`.
    profiles (DataFrame): One row per user with a `user` column and one column
        per genre. Defaults to `profile_df`.
    ratings (DataFrame): Enrollment records with `user` and `item` columns.
        Defaults to `test_users_df`.
    course_genres (DataFrame): Course table with `COURSE_ID`, `TITLE` and one
        column per genre. Defaults to `course_genres_df`.
    threshold (float): Minimum score (inclusive) for a course to be kept.
        Defaults to `score_threshold`.

    Returns:
    users (list): List of user IDs.
    courses (list): List of recommended course IDs.
    scores (list): List of recommendation scores.

    Raises:
    KeyError: If `profiles` lacks one of the genre columns of `course_genres`.
    """
    # Fall back to the notebook-level objects for anything the caller did not supply
    if user_ids is None:
        user_ids = test_user_ids
    if profiles is None:
        profiles = profile_df
    if ratings is None:
        ratings = test_users_df
    if course_genres is None:
        course_genres = course_genres_df
    if threshold is None:
        threshold = score_threshold

    # Select genres by name so the user vector and the course matrix are
    # guaranteed to use the same genre order, whatever the column layout is
    genre_columns = [col for col in course_genres.columns if col not in ('COURSE_ID', 'TITLE')]
    course_id_column = course_genres['COURSE_ID']
    course_ids = course_id_column.to_numpy()
    course_matrix = course_genres[genre_columns].to_numpy()

    # Loop-invariant lookups, built once instead of re-filtering the dataframes per user.
    # When a user has several profile rows, the first one is used.
    unique_profiles = profiles.drop_duplicates(subset='user')
    profile_matrix = unique_profiles[genre_columns].to_numpy()
    profile_row = {uid: pos for pos, uid in enumerate(unique_profiles['user'].tolist())}

    enrolled_by_user = {}
    for uid, item in zip(ratings['user'].tolist(), ratings['item'].tolist()):
        enrolled_by_user.setdefault(uid, set()).add(item)

    users = []      # List to store user IDs
    courses = []    # List to store recommended course IDs
    scores = []     # List to store recommendation scores

    for user_id in user_ids:
        pos = profile_row.get(user_id)
        if pos is None:
            # No profile vector for this user, so no score can be computed
            continue

        # Score every course at once, then mask out enrolled and low-scoring ones
        all_scores = np.dot(course_matrix, profile_matrix[pos])
        enrolled = enrolled_by_user.get(user_id, ())
        is_unknown = ~course_id_column.isin(enrolled).to_numpy()
        keep = (all_scores >= threshold) & is_unknown

        kept_courses = course_ids[keep].tolist()
        users.extend([user_id] * len(kept_courses))
        courses.extend(kept_courses)
        scores.extend(all_scores[keep].tolist())

    return users, courses, scores
 


In [53]:
# Score the unknown courses of every test user; three parallel lists come back
users, courses, scores = generate_recommendation_scores()

# Gather the lists under their output column names
res_dict = {'USER': users, 'COURSE_ID': courses, 'SCORE': scores}

# Tabulate the results with a fixed column order and persist them without the index
res_df = pd.DataFrame(res_dict, columns=['USER', 'COURSE_ID', 'SCORE'])
res_df.to_csv("profile_rs_results.csv", index=False)

# Display the results
res_df


Unnamed: 0,USER,COURSE_ID,SCORE
0,2,ML0201EN,43.0
1,2,GPXX0ZG0EN,43.0
2,2,GPXX0Z2PEN,37.0
3,2,DX0106EN,47.0
4,2,GPXX06RFEN,52.0
...,...,...,...
479126,2102680,GPXX04P5EN,23.0
479127,2102680,ML0101EN,29.0
479128,2102680,excourse21,29.0
479129,2102680,excourse22,29.0


In [54]:
res_df = pd.read_csv("profile_rs_results.csv")

In [55]:
# One row per distinct (user, recommended course) pair. Counting these rows once
# replaces re-filtering the whole results frame for every single user.
user_course_pairs = res_df[['USER', 'COURSE_ID']].drop_duplicates()

# Users that received at least one recommendation
users = user_course_pairs['USER'].unique().tolist()
num_of_users = len(users)

# Total number of distinct new courses recommended, summed over all users
total_new_courses = len(user_course_pairs)

average_new_courses = total_new_courses / num_of_users
average_new_courses, num_of_users, total_new_courses

(28.943518182916517, 16554, 479131)

In [56]:
# Load the course metadata; only COURSE_ID and TITLE are used below
course_url = "data/course_processed.csv"
course_df = pd.read_csv(course_url)

# Ten most frequently recommended courses.
# The column names are set explicitly because the names produced by
# value_counts().reset_index() differ between pandas 1.x and 2.x.
df = (
    res_df['COURSE_ID']
    .value_counts()
    .head(10)
    .rename_axis('COURSE_ID')
    .reset_index(name='Recommendations')
)

# Attach the course titles and list the most recommended courses first
merged_df = (
    pd.merge(df, course_df[['COURSE_ID', 'TITLE']], on='COURSE_ID', how='left')
    .sort_values(by='Recommendations', ascending=False)
)
merged_df[['TITLE', 'Recommendations']]

Unnamed: 0,TITLE,Recommendations
0,foundations for big data analysis with sql,9138
1,analyzing big data with sql,9138
2,getting started with the data apache spark ma...,8954
3,analyzing big data in r using apache spark,8769
4,spark overview for scala analytics,7970
5,cloud computing applications part 2 big data...,7853
6,applied machine learning in python,7671
7,introduction to data science in python,7671
8,accelerating deep learning with gpu,7633
9,spark fundamentals ii,7203


### Computing the full user_profile

In [13]:
# MY USER PROFILE MODEL
#----------------------------------
def user_profile_model(users, ratings=None, courses=None, csv_file='my_user_profile.csv'):
    """
    Build a genre profile vector for every requested user.

    A user's weight for a genre is the sum of the ratings the user gave to the
    courses carrying that genre. Ratings for items that are not listed in
    `courses` are ignored, and a user without any usable rating gets an
    all-zero profile.

    Parameters:
    users (iterable): User IDs to build profiles for; the output has one row
        per entry, in the same order.
    ratings (DataFrame): Ratings with `user`, `item` and `rating` columns.
        Defaults to the notebook-level `users_df`. If a user rated the same
        item more than once, the first rating is used.
    courses (DataFrame): Course table with `COURSE_ID`, `TITLE` and one column
        per genre. Defaults to the notebook-level `courses_df`. If a
        COURSE_ID appears more than once, its first row is used.
    csv_file (str or None): Path the profiles are saved to (without the index
        column). Pass None to skip saving.

    Returns:
    profiles_df (DataFrame): A `user` column followed by one column per genre.
    """
    # Fall back to the notebook-level dataframes when none are passed in
    if ratings is None:
        ratings = users_df
    if courses is None:
        courses = courses_df

    user_ids = list(users)
    genres = [x for x in courses.columns if x not in ['COURSE_ID', 'TITLE']]
    course_table = courses.drop_duplicates(subset='COURSE_ID')[['COURSE_ID'] + genres]

    # Keep only the requested users and a single (first) rating per user/item pair
    rated = ratings[ratings['user'].isin(user_ids)].drop_duplicates(subset=['user', 'item'])

    # Attach each rated course's genre vector to its rating. The inner join drops
    # ratings for items that have no genre information.
    weighted = rated[['user', 'item', 'rating']].merge(
        course_table, left_on='item', right_on='COURSE_ID', how='inner')

    # rating * genre indicator, summed per user, gives the profile weights
    weighted[genres] = weighted[genres].mul(weighted['rating'], axis=0)
    weights = weighted.groupby('user')[genres].sum().reindex(user_ids, fill_value=0)

    profiles_df = pd.DataFrame(weights.to_numpy(), columns=genres)
    profiles_df.insert(0, 'user', user_ids)

    # Saving the profiles to a CSV file. The index is left out so that the file
    # reloads with `user` as its first column.
    if csv_file is not None:
        profiles_df.to_csv(csv_file, index=False)
        print(f"User profiles saved to {csv_file}")

    return profiles_df

users_df = pd.read_csv('data/ratings.csv')

courses_df = pd.read_csv('data/course_genre.csv')
users = users_df['user'].unique()
output = user_profile_model(users)
output.head()
#--------------------------------------

User profiles saved to my_user_profile.csv


Unnamed: 0,user,Database,Python,CloudComputing,DataAnalysis,Containers,MachineLearning,ComputerVision,DataScience,BigData,Chatbot,R,BackendDev,FrontendDev,Blockchain
0,1889878,24,13,22,19,9,23,0,9,11,4,12,17,0,13
1,1342067,17,0,7,5,0,0,0,0,13,0,0,0,0,0
2,1990814,45,16,13,39,13,27,0,27,41,0,7,18,0,0
3,380098,12,21,14,26,16,52,0,32,5,0,16,14,0,3
4,779563,3,11,0,12,0,0,0,7,3,0,0,5,0,5


## Authors


[Yan Luo](https://www.linkedin.com/in/yan-luo-96288783/)


### Other Contributors


## Change Log

|Date (YYYY-MM-DD)|Version|Changed By|Change Description|
|-|-|-|-|
|2021-10-25|1.0|Yan|Created the initial version|
