# **Content-based Course Recommender System Using User Profile and Course Genres**


### Importing Libraries and Loading the dataset

In [4]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

##### Course Genres


In [7]:
# Remote CSV holding one row per course; read it straight from the URL.
course_genre_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/course_genre.csv"
course_genres_df = pd.read_csv(course_genre_url)
# Preview the first rows of the course/genre table.
course_genres_df.head()

Unnamed: 0,COURSE_ID,TITLE,Database,Python,CloudComputing,DataAnalysis,Containers,MachineLearning,ComputerVision,DataScience,BigData,Chatbot,R,BackendDev,FrontendDev,Blockchain
0,ML0201EN,robots are coming build iot apps with watson ...,0,0,0,0,0,0,0,0,0,0,0,1,1,0
1,ML0122EN,accelerating deep learning with gpu,0,1,0,0,0,1,0,1,0,0,0,0,0,0
2,GPXX0ZG0EN,consuming restful services using the reactive ...,0,0,0,0,0,0,0,0,0,0,0,1,1,0
3,RP0105EN,analyzing big data in r using apache spark,1,0,0,1,0,0,0,0,1,0,1,0,0,0
4,GPXX0Z2PEN,containerizing packaging and running a sprin...,0,0,0,0,1,0,0,0,0,0,0,1,0,0


In [8]:
# (rows, columns) of the course/genre table
course_genres_df.shape

(307, 16)

The first two columns are `COURSE_ID` and `TITLE`; the remaining 14 columns are the genre features.

##### Users Profile


In [11]:
# Remote CSV with one row per user: the user id followed by per-genre values.
profile_genre_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/user_profile.csv"
profile_df = pd.read_csv(profile_genre_url)
# Report the table size, then preview the first rows.
print("Users Profile Shape:" , profile_df.shape)
profile_df.head()

Users Profile Shape: (33901, 15)


Unnamed: 0,user,Database,Python,CloudComputing,DataAnalysis,Containers,MachineLearning,ComputerVision,DataScience,BigData,Chatbot,R,BackendDev,FrontendDev,Blockchain
0,2,52.0,14.0,6.0,43.0,3.0,33.0,0.0,29.0,41.0,2.0,18.0,34.0,9.0,6.0
1,4,40.0,2.0,4.0,28.0,0.0,14.0,0.0,20.0,24.0,0.0,6.0,6.0,0.0,2.0
2,5,24.0,8.0,18.0,24.0,0.0,30.0,0.0,22.0,14.0,2.0,14.0,26.0,4.0,6.0
3,7,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
4,8,6.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,6.0,0.0,2.0,0.0,0.0,0.0


In [12]:
# Remote CSV of (user, item, rating) records; `item` holds a course id.
users_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-ML0321EN-Coursera/labs/v2/module_3/ratings.csv"
users_df = pd.read_csv(users_url)
# Preview the first rows of the ratings table.
users_df.head()

Unnamed: 0,user,item,rating
0,1889878,CC0101EN,5
1,1342067,CL0101EN,3
2,1990814,ML0120ENv3,5
3,380098,BD0211EN,5
4,779563,DS0101EN,3


In [13]:
# Distinct user ids that appear in the ratings table
users = set(users_df["user"])
# Number of distinct users
len(users)

33901

In [16]:
# Distinct course ids listed in the course/genre table
all_courses = set(course_genres_df['COURSE_ID'])
# Number of distinct courses
len(all_courses)

307

### Generating course recommendations based on User profile for one user


In [18]:
# Walk through the recommendation steps for a single learner: user 4
user_id = 4
# Show that learner's row of the profile table
profile_df.loc[profile_df["user"] == user_id]

Unnamed: 0,user,Database,Python,CloudComputing,DataAnalysis,Containers,MachineLearning,ComputerVision,DataScience,BigData,Chatbot,R,BackendDev,FrontendDev,Blockchain
1,4,40.0,2.0,4.0,28.0,0.0,14.0,0.0,20.0,24.0,0.0,6.0,6.0,0.0,2.0


In [19]:
# Take the selected user's first matching profile row and drop its leading
# `user` column, leaving only the per-genre values as a NumPy array.
selected_profile = profile_df.loc[profile_df["user"] == user_id]
user_vector = selected_profile.iloc[0, 1:].values
user_vector

array([40.,  2.,  4., 28.,  0., 14.,  0., 20., 24.,  0.,  6.,  6.,  0.,
        2.])

In [20]:
# Course ids (the `item` column) of every ratings record for the selected user
enrolled_courses = users_df.loc[users_df['user'] == user_id, 'item'].to_list()
# How many records that is
len(enrolled_courses)

44

In [21]:
# Courses in the catalogue that do not appear among the user's rated courses
unknown_courses = all_courses - set(enrolled_courses)
# Number of candidate courses left to score
len(unknown_courses)

263

In [22]:
# Keep only the course/genre rows whose id is among the candidate courses
is_unknown_course = course_genres_df['COURSE_ID'].isin(unknown_courses)
unknown_course_df = course_genres_df.loc[is_unknown_course]
unknown_course_df

Unnamed: 0,COURSE_ID,TITLE,Database,Python,CloudComputing,DataAnalysis,Containers,MachineLearning,ComputerVision,DataScience,BigData,Chatbot,R,BackendDev,FrontendDev,Blockchain
0,ML0201EN,robots are coming build iot apps with watson ...,0,0,0,0,0,0,0,0,0,0,0,1,1,0
1,ML0122EN,accelerating deep learning with gpu,0,1,0,0,0,1,0,1,0,0,0,0,0,0
2,GPXX0ZG0EN,consuming restful services using the reactive ...,0,0,0,0,0,0,0,0,0,0,0,1,1,0
4,GPXX0Z2PEN,containerizing packaging and running a sprin...,0,0,0,0,1,0,0,0,0,0,0,1,0,0
5,CNSC02EN,cloud native security conference data security,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302,excourse89,javascript jquery and json,0,0,0,0,0,0,0,0,0,0,0,1,1,0
303,excourse90,programming foundations with javascript html ...,0,0,0,0,0,0,0,0,0,0,0,1,1,0
304,excourse91,front end web development with react,0,0,0,0,0,0,0,0,0,0,0,0,1,0
305,excourse92,introduction to web development,0,0,0,0,0,0,0,0,0,0,0,1,1,0


In [23]:
# Candidate course ids as an array, in the same row order as unknown_course_df
unknown_course_ids = unknown_course_df['COURSE_ID'].values

In [24]:
# Everything after the first two columns (COURSE_ID, TITLE) is a genre column;
# pull those out as a 2-D array with one row per candidate course.
genre_columns = unknown_course_df.iloc[:, 2:]
course_matrix = genre_columns.values
print(course_matrix.shape)
course_matrix

(263, 14)


array([[0, 0, 0, ..., 1, 1, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0]], dtype=int64)

In [25]:
# Score each candidate course as the dot product of its genre row with the
# user's profile vector (one score per row of unknown_course_df).
recommendation_scores = unknown_course_df.iloc[:, 2:].values.dot(user_vector)
recommendation_scores

array([ 6., 36.,  6.,  6.,  4., 26.,  0.,  0., 40.,  6., 44., 22.,  2.,
       14.,  6.,  6., 64., 14., 46., 14.,  6., 24., 14., 10., 10.,  6.,
        6.,  0., 46., 62., 10.,  6., 24., 38., 28., 28., 24., 42., 14.,
       64., 28., 64., 10., 34., 58., 22., 14., 14.,  4., 34., 14., 28.,
       28., 34., 14.,  6., 40.,  4.,  4., 16., 30., 14., 40., 40.,  8.,
        8., 40.,  6., 40., 52., 28., 40.,  6.,  0., 14.,  6., 44., 12.,
        6., 22.,  6.,  6.,  6.,  6., 46., 14.,  6.,  6., 14.,  6., 40.,
        4., 10., 14.,  6., 28.,  6., 34.,  0.,  6., 20., 20., 20.,  6.,
        6.,  6.,  6., 34.,  6.,  6.,  6., 14.,  4.,  6., 34.,  0., 14.,
        6.,  0., 48., 36.,  0., 16.,  6.,  4., 30.,  6.,  6.,  6.,  6.,
       14.,  6.,  6., 64., 64.,  0.,  6.,  6., 14., 28.,  6.,  6.,  6.,
        4., 14., 38.,  4., 72., 44.,  0., 28., 34.,  6.,  6.,  6.,  6.,
       30., 20.,  0., 14., 26., 46., 48.,  6., 10.,  4., 40.,  6.,  6.,
       14., 40., 68., 64., 60., 64., 60., 40.,  2., 28., 64., 40

In [26]:
# Minimum score (inclusive) a course needs in order to be recommended
score_threshold = 10.0

courses = []    # Recommended course IDs
scores = []     # Their recommendation scores, parallel to `courses`

# Walk the course ids and their scores together. Both arrays come from
# unknown_course_df, so they are always the same length; bounding the loop by
# len(unknown_courses) (a set) would silently skip rows if that length differed.
for course_id, score in zip(unknown_course_ids, recommendation_scores):
    # Only keep the courses with a high enough recommendation score
    if score >= score_threshold:
        courses.append(course_id)
        scores.append(score)

In [27]:
# Column name -> values for the selected user's recommendations
user_dict = {'COURSE_ID': courses, 'SCORE': scores}

# Build the result table with an explicit column order
user_df = pd.DataFrame(user_dict, columns=['COURSE_ID', 'SCORE'])

# Say which user the table belongs to, then display it
print(f"User: {user_id}")
user_df


User: 4


Unnamed: 0,COURSE_ID,SCORE
0,ML0122EN,36.0
1,DX0106EN,26.0
2,GPXX06RFEN,40.0
3,CC0271EN,44.0
4,DX0108EN,22.0
...,...,...
156,excourse79,14.0
157,excourse82,34.0
158,excourse83,34.0
159,excourse84,34.0


### Generating course recommendations based on User profile for all users


In [29]:
def generate_recommendation_scores(user_ids=None, threshold=None, profiles=None,
                                   ratings=None, course_genres=None):
    """Score the courses each user has not rated and keep the high scorers.

    A course's score for a user is the dot product of the course's genre
    columns with that user's profile vector. Only courses whose id does not
    appear among the user's rated items, and whose score is at least
    `threshold`, are returned.

    Every argument is optional; when omitted, the notebook-level variable of
    the same role is used, so a bare call keeps working as before.

    Parameters
    ----------
    user_ids : iterable, optional
        Users to score (default: the notebook-level ``users``). Repeated ids
        are scored once, in order of first appearance.
    threshold : float, optional
        Minimum score, inclusive (default: notebook-level ``score_threshold``).
    profiles : pandas.DataFrame, optional
        Needs a ``user`` column; every column after the first is taken as the
        profile vector (default: ``profile_df``). If a user has several rows,
        the first one is used.
    ratings : pandas.DataFrame, optional
        Needs ``user`` and ``item`` columns (default: ``users_df``).
    course_genres : pandas.DataFrame, optional
        Needs a ``COURSE_ID`` column; every column from the third onward is
        taken as a genre feature (default: ``course_genres_df``).

    Returns
    -------
    tuple of (list, list, list)
        Parallel lists of user ids, recommended course ids and scores. Rows
        are grouped by user, and within a user follow the row order of
        `course_genres`.

    Notes
    -----
    Users with no row in `profiles` are skipped, since there is no vector to
    score them with. Candidate courses are taken from `course_genres` itself,
    which matches ``all_courses`` as the notebook builds it from that frame.
    """
    # Fall back to the notebook globals only for arguments that were not given
    user_ids = users if user_ids is None else user_ids
    threshold = score_threshold if threshold is None else threshold
    profiles = profile_df if profiles is None else profiles
    ratings = users_df if ratings is None else ratings
    course_genres = course_genres_df if course_genres is None else course_genres

    # Build the per-user lookups once, instead of boolean-filtering the whole
    # profile and ratings frames again for every single user.
    profile_vectors = {}
    for uid, vector in zip(profiles["user"], profiles.iloc[:, 1:].to_numpy()):
        profile_vectors.setdefault(uid, vector)   # first row wins

    enrolled_by_user = {}
    for uid, item in zip(ratings["user"], ratings["item"]):
        enrolled_by_user.setdefault(uid, set()).add(item)

    course_ids = course_genres["COURSE_ID"].to_numpy()
    genre_matrix = course_genres.iloc[:, 2:].to_numpy()

    users_list = []      # User IDs, one entry per recommended course
    courses_list = []    # Recommended course IDs
    scores_list = []     # Recommendation scores
    no_enrollments = frozenset()

    # dict.fromkeys drops repeated ids while keeping the incoming order
    for user_id in dict.fromkeys(user_ids):
        user_vector = profile_vectors.get(user_id)
        if user_vector is None:
            continue

        enrolled = enrolled_by_user.get(user_id, no_enrollments)

        # Score every course at once, then mask out rated / low-scoring ones
        course_scores = genre_matrix.dot(user_vector)
        is_unknown = np.array([cid not in enrolled for cid in course_ids], dtype=bool)
        keep = is_unknown & (course_scores >= threshold)

        kept_ids = course_ids[keep].tolist()
        users_list.extend([user_id] * len(kept_ids))
        courses_list.extend(kept_ids)
        scores_list.extend(list(course_scores[keep]))

    return users_list, courses_list, scores_list

In [30]:
# Keep the results under their own names. Unpacking into `users` would replace
# the set of user ids that generate_recommendation_scores() iterates over, so
# re-running this cell would score a different (duplicated) list of ids.
# Reusing `courses` / `scores` would likewise overwrite the single-user lists
# built earlier in the notebook.
rec_users, rec_courses, rec_scores = generate_recommendation_scores()

# Column name -> values for the all-users result table
res_dict = {'USER': rec_users, 'COURSE_ID': rec_courses, 'SCORE': rec_scores}

# One row per (user, recommended course), with an explicit column order
res_df = pd.DataFrame(res_dict, columns=['USER', 'COURSE_ID', 'SCORE'])

# Display the result table
res_df


Unnamed: 0,USER,COURSE_ID,SCORE
0,2,ML0201EN,43.0
1,2,GPXX0ZG0EN,43.0
2,2,GPXX0Z2PEN,37.0
3,2,DX0106EN,47.0
4,2,GPXX06RFEN,52.0
...,...,...,...
1500419,1835005,TMP107,12.0
1500420,1835005,ML0101EN,12.0
1500421,1835005,excourse21,12.0
1500422,1835005,excourse22,12.0


In [31]:
# Write the all-users recommendations to a CSV in the working directory,
# leaving out the DataFrame index.
res_df.to_csv("profile_rs_results.csv", index=False)
