## ***import library***

In [44]:
import os
import pandas as pd
import numpy as np
import re

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import fcluster

from collections import Counter
from gensim.models.phrases import Phrases, Phraser

import pickle

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download('punkt', force=True)
nltk.download('wordnet', force=True)
nltk.download('stopwords', force=True)

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')


print('Dependencies Imported')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jehad\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jehad\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jehad\AppData\Roaming\nltk_data...


Dependencies Imported


[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jehad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [45]:
# import nltk
# nltk.download('omw-1.4')
# nltk.download('wordnet')
# nltk.download('wordnet2022')

# !cp -rf /usr/share/nltk_data/corpora/wordnet2022 /usr/share/nltk_data/corpora/wordnet

In [46]:
data = pd.read_csv('data\\coursera\\coursera_processed_data.csv')

In [None]:
training_data = data.copy()
data['tags'] = training_data['Course Name'] + ' ' + training_data['Course Description'] + ' ' + training_data['Skills']
training_data = data[['Course Name', 'tags']]

In [48]:
training_data.head()

Unnamed: 0,Course Name,tags
0,Write A Feature Length Screenplay For Film Or ...,Write A Feature Length Screenplay For Film Or ...
1,Business Strategy: Business Model Canvas Analy...,Business Strategy: Business Model Canvas Analy...
2,Silicon Thin Film Solar Cells,Silicon Thin Film Solar Cells This course cons...
3,Finance Managers,"Finance Managers When comes numbers, meets eye..."
4,Retrieve Data using Single-Table SQL Queries,Retrieve Data using Single-Table SQL Queries I...


In [49]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = vectorizer.fit_transform(training_data['tags'])
print("TF-IDF matrix shape:", tfidf_matrix.shape)

TF-IDF matrix shape: (3424, 5000)


In [50]:
n_components = 100
svd = TruncatedSVD(n_components=n_components, random_state=42)
tfidf_matrix = svd.fit_transform(tfidf_matrix)

print("Reduced TF-IDF matrix shape:", tfidf_matrix.shape)

Reduced TF-IDF matrix shape: (3424, 100)


In [51]:
similarity_matrix = cosine_similarity(tfidf_matrix)
print(similarity_matrix[0][1])

0.06939888492365429


In [52]:
def normalize_rating(rating_str):
    """
    Normalize the course rating to a 0-1 scale.
    """
    try:
        return (float(rating_str) - 0) / (5 - 0)  # Normalize to 0-1
    except ValueError:
        return 0  


In [81]:
def get_recommendations(course_id, data, similarity_matrix, top_n=3, rating_weight=0.05):
    """
    @param course_id 
    Get top N course recommendations based on similarity to the given course name.
    """
    course_id = data[data['ID'] == course_id]
    course_idx = course_id.index[0]
    similarity_scores = list(enumerate(similarity_matrix[course_idx]))
    
    recommendations = []
    for idx, similarity_score in sorted(similarity_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]:
        course_data = data.iloc[idx]
        normalized_rating = normalize_rating(course_data.get('Course Rating', '0'))

        recommendations.append({
            "course_id": course_data["ID"],
            "course_name": course_data['Course Name'],
            "course_url": course_data.get('Course URL', ''),
            "rating": course_data['Course Rating'],
            "institution": course_data.get('University', 'Unknown'),
            "difficulty_level": course_data.get('Difficulty Level', 'Unknown'),
            "similarity": similarity_score,
            "final_score": similarity_score * (1 - rating_weight) + normalized_rating * rating_weight 
        })

    return sorted(recommendations, key=lambda x: x['final_score'], reverse=True)

In [65]:
temp = get_recommendations(1, data, similarity_matrix)


In [67]:
temp[0]['course_id']

1

In [94]:
def get_recommendations_from_list_of_courses(courses_id, data, similarity_matrix, top_n=5):
    recommended = {}
    for course_id in courses_id:
        courses = get_recommendations(course_id=course_id,similarity_matrix= similarity_matrix, data= data)
        for course in courses:
            if(course['course_id'] in recommended):
                recommended[course['course_id']] += course['similarity']
            else:
                recommended[course['course_id']] = course['similarity']
    recommended = sorted(recommended.items(), key=lambda item: item[1], reverse=True)
    return recommended[0: top_n]
    


In [83]:
get_recommendations_from_list_of_courses([24, 35, 28], data, similarity_matrix)

[(407, 0.9960503751081485),
 (1448, 0.9603102360697856),
 (527, 0.9503480196074857),
 (2331, 0.9486944271440074),
 (193, 0.942407558233371)]

In [93]:
data[data['ID'] == 193]

Unnamed: 0,Course Name,University,Difficulty Level,Course Rating,Course URL,Course Description,Skills,ID,tags
192,AWS Elastic Beanstalk:Deploy Python(Flask) Web...,Coursera Project Network,Beginner,5.0,https://www.coursera.org/learn/python-aws-elas...,"In 1-hour long project-based course, learn cre...","['Python', 'Programming', 'server', 'log', 'c'...",193,AWS Elastic Beanstalk:Deploy Python(Flask) Web...


In [None]:
pickle.dump(similarity_matrix, open('similarity_matrix.pkl', 'wb'))