In [1]:
import pandas as pd
# Read the Coursera course catalogue from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/Coursera.csv")
df.head(n=5)

Unnamed: 0,Subject,Title,Institution,Learning Product,Level,Duration,Gained Skills,Rate,Reviews
0,Business,Business Analysis & Process Management,Coursera Project Network,Guided Project,Beginner,Less Than 2 Hours,"Process Analysis, Business Process, Business A...",4.4,6100
1,Business,Getting Started with Microsoft Excel,Coursera Project Network,Guided Project,Intermediate,Less Than 2 Hours,"Microsoft Excel, Excel Formulas, Spreadsheet S...",4.6,11000
2,Business,Financial Markets,Yale University,Course,Beginner,1 - 3 Months,"Investment Banking, Risk Management, Financial...",4.8,30000
3,Business,Investment Risk Management,Coursera Project Network,Guided Project,Intermediate,Less Than 2 Hours,"Investment Management, Risk Management, Financ...",4.4,1800
4,Business,Food & Beverage Management,Università Bocconi,Course,Mixed,1 - 3 Months,"Food and Beverage, Hospitality, Restaurant Man...",4.8,4800


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load dataset
df = pd.read_csv("data/Coursera.csv")

# Fill missing values so the string concatenation below never yields NaN
df['Gained Skills'] = df['Gained Skills'].fillna("")
df['Subject'] = df['Subject'].fillna("")

# Combine text features for content-based filtering.
# Title is filled only inside this expression (the column itself is left untouched):
# a single missing title would otherwise turn the whole combined string into NaN,
# which cannot be vectorized as text.
df['text_features'] = df['Title'].fillna("") + " " + df['Subject'] + " " + df['Gained Skills']

# TF-IDF vectorization (English stop words removed, vocabulary capped at 5000 terms)
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix = tfidf.fit_transform(df['text_features'])

print("TF-IDF Matrix Shape:", tfidf_matrix.shape)

TF-IDF Matrix Shape: (3404, 2786)


In [3]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity of every course vector against every other course vector
cosine_sim = cosine_similarity(X=tfidf_matrix, Y=tfidf_matrix)

In [4]:
def recommend_courses(title, df, cosine_sim, top_n=5):
    """Return the courses most similar to the course whose title is ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['Title']``. If several rows share the
        title, the first one is used as the query course.
    df : pandas.DataFrame
        Course table containing at least the 'Title', 'Institution', 'Level'
        and 'Rate' columns.
    cosine_sim : 2-D array-like
        Similarity matrix addressed by row position: ``cosine_sim[i][j]`` is
        read as the similarity between rows ``i`` and ``j`` of ``df``.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``df`` ('Title', 'Institution', 'Level', 'Rate'),
        most similar first. The query row itself is never included.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested title.
    """
    # Row *position* of the first match. Using the position (not the index label)
    # keeps the lookup into cosine_sim and the final .iloc correct even when df
    # does not carry a default 0..n-1 index.
    matches = (df['Title'] == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"No course titled {title!r} found in df['Title']")
    idx = int(matches[0])

    # Pair every other row position with its similarity to the query course.
    # The query row is dropped explicitly instead of "skipping the first result":
    # a duplicate course with an equal score can sort ahead of the query row, in
    # which case skipping position 0 would discard the duplicate and recommend
    # the query course to itself.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Highest similarity first; sorted() is stable, so ties keep row order
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)

    # Row positions of the best matches (a negative top_n is treated as 0)
    course_indices = [position for position, _ in sim_scores[:max(top_n, 0)]]

    return df[['Title', 'Institution', 'Level', 'Rate']].iloc[course_indices]

In [5]:
print(recommend_courses("Financial Markets", df, cosine_sim, top_n=5))

                                                 Title  \
693                Marketing: Customer Needs and Wants   
294                               Mercados financieros   
286  Foundational Finance for Strategic Decision Ma...   
413                                 Marchés financiers   
563                                  Financial Markets   

                          Institution         Level  Rate  
693              IESE Business School         Mixed   4.5  
294                   Yale University      Beginner   4.7  
286            University of Michigan      Beginner   4.7  
413                   Yale University      Beginner   4.7  
563  Università di Napoli Federico II  Intermediate   4.8  


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Build a single TF-IDF model over the skills text (missing skills become empty documents).
# Note: this rebinds `tfidf_matrix`, which previously held the matrix built from the combined
# title/subject/skills text; `cosine_sim` is not recomputed here.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(raw_documents=df['Gained Skills'].fillna(value=''))

# Step 2: Recommendation function
def recommend_by_skills(user_skills, df, vectorizer, tfidf_matrix, top_n=5):
    """Recommend the courses whose skills text best matches ``user_skills``.

    Parameters
    ----------
    user_skills : str
        Free-text query; passed to ``vectorizer.transform`` as a single document.
    df : pandas.DataFrame
        Course table with at least 'Title' and 'Gained Skills' columns. Rows are
        addressed by position, so they are assumed to line up with the rows of
        ``tfidf_matrix``.
    vectorizer : object
        Already-fitted vectorizer exposing ``transform``.
    tfidf_matrix : matrix
        Document matrix the query is compared against.
    top_n : int, default 5
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        'Title' and 'Gained Skills' of the ``top_n`` highest-scoring rows,
        best match first, keeping the original index labels.
    """
    # Transform user input using the same vectorizer
    user_tfidf = vectorizer.transform([user_skills])

    # Similarity of the query against every course, as a flat 1-D array
    sim_scores = cosine_similarity(user_tfidf, tfidf_matrix).flatten()

    # Sort descending first and slice afterwards. Slicing with `[-top_n:]` before
    # reversing returns *every* row when top_n is 0, because `[-0:]` is `[0:]`.
    top_indices = sim_scores.argsort()[::-1][:top_n]
    return df.iloc[top_indices][['Title', 'Gained Skills']]

# Example: query the recommender with a comma-separated skills string
print(
    recommend_by_skills(
        user_skills="Python, Data Science, Machine Learning",
        df=df,
        vectorizer=vectorizer,
        tfidf_matrix=tfidf_matrix,
        top_n=5,
    )
)

                                                  Title  \
2663                             Data Science Challenge   
2476        Unsupervised Algorithms in Machine Learning   
3341        Unsupervised Algorithms in Machine Learning   
1291  Create Machine Learning Models in Microsoft Azure   
3226          Recommender Systems with Machine Learning   

                                          Gained Skills  
2663  Applied Machine Learning, Jupyter, Machine Lea...  
2476  Unsupervised Learning, Dimensionality Reductio...  
3341  Unsupervised Learning, Dimensionality Reductio...  
1291  Unsupervised Learning, Scikit Learn (Machine L...  
3226  Data Manipulation, Applied Machine Learning, D...  


In [10]:
def recommend_by_skills(user_skills, df, vectorizer, tfidf_matrix, top_n=5):
    """Recommend the courses whose skills text best matches ``user_skills``.

    Parameters
    ----------
    user_skills : str
        Free-text query; passed to ``vectorizer.transform`` as a single document.
    df : pandas.DataFrame
        Course table with at least 'Title' and 'Gained Skills' columns. Rows are
        addressed by position, so they are assumed to line up with the rows of
        ``tfidf_matrix``.
    vectorizer : object
        Already-fitted vectorizer exposing ``transform``.
    tfidf_matrix : matrix
        Document matrix the query is compared against.
    top_n : int, default 5
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        'Title' and 'Gained Skills' of the ``top_n`` highest-scoring rows,
        best match first, re-indexed from 0.
    """
    # Transform user input into vector
    user_vec = vectorizer.transform([user_skills])

    # Similarity of the query against every course, as a flat 1-D array
    sim_scores = cosine_similarity(user_vec, tfidf_matrix).flatten()

    # Sort descending first and slice afterwards. Slicing with `[-top_n:]` before
    # reversing returns *every* row when top_n is 0, because `[-0:]` is `[0:]`.
    top_indices = sim_scores.argsort()[::-1][:top_n]

    # Return a DataFrame numbered 0..k-1 instead of the original row labels
    return df.iloc[top_indices][['Title', 'Gained Skills']].reset_index(drop=True)


# Example: same query as before, now keeping the result for printing
recommendations = recommend_by_skills(
    user_skills="Python, Data Science, Machine Learning",
    df=df,
    vectorizer=vectorizer,
    tfidf_matrix=tfidf_matrix,
    top_n=5,
)

print(recommendations)

                                               Title  \
0                             Data Science Challenge   
1        Unsupervised Algorithms in Machine Learning   
2        Unsupervised Algorithms in Machine Learning   
3  Create Machine Learning Models in Microsoft Azure   
4          Recommender Systems with Machine Learning   

                                       Gained Skills  
0  Applied Machine Learning, Jupyter, Machine Lea...  
1  Unsupervised Learning, Dimensionality Reductio...  
2  Unsupervised Learning, Dimensionality Reductio...  
3  Unsupervised Learning, Scikit Learn (Machine L...  
4  Data Manipulation, Applied Machine Learning, D...  


In [11]:
def recommend_by_skills(user_skills, df, vectorizer, tfidf_matrix, top_n=5):
    """Recommend up to ``top_n`` distinct-title courses matching ``user_skills``.

    The query is vectorized with ``vectorizer``, compared to ``tfidf_matrix`` by
    cosine similarity, and the rows of ``df`` are ranked from best to worst.
    Repeated titles are collapsed to their best-ranked row before the first
    ``top_n`` rows are taken.

    Returns a DataFrame with 'Title' and 'Gained Skills', re-indexed from 0.
    """
    query_vector = vectorizer.transform([user_skills])
    scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Row positions, most similar first
    ranking = scores.argsort()[::-1]

    ranked_courses = df.iloc[ranking][['Title', 'Gained Skills']]
    unique_courses = ranked_courses.drop_duplicates(subset=['Title'])
    shortlist = unique_courses.head(top_n)

    return shortlist.reset_index(drop=True)


# Example: duplicate titles are now collapsed in the printed result
recommendations = recommend_by_skills(
    user_skills="Python, Data Science, Machine Learning",
    df=df,
    vectorizer=vectorizer,
    tfidf_matrix=tfidf_matrix,
    top_n=5,
)

print(recommendations)

                                               Title  \
0                             Data Science Challenge   
1        Unsupervised Algorithms in Machine Learning   
2  Create Machine Learning Models in Microsoft Azure   
3          Recommender Systems with Machine Learning   
4                Developing AI Applications on Azure   

                                       Gained Skills  
0  Applied Machine Learning, Jupyter, Machine Lea...  
1  Unsupervised Learning, Dimensionality Reductio...  
2  Unsupervised Learning, Scikit Learn (Machine L...  
3  Data Manipulation, Applied Machine Learning, D...  
4  Artificial Intelligence and Machine Learning (...  


In [12]:
def recommend_by_skills(user_skills, df, vectorizer, tfidf_matrix, top_n=5):
    """Recommend up to ``top_n`` distinct-title courses matching ``user_skills``.

    The query is vectorized with ``vectorizer`` and scored against
    ``tfidf_matrix`` by cosine similarity. Rows of ``df`` are ordered from the
    highest to the lowest score, repeated titles keep only their best-ranked
    row, and the first ``top_n`` rows are returned.

    Returns a DataFrame with 'Title', 'Institution', 'Level', 'Rate' and
    'Gained Skills', re-indexed from 0.
    """
    display_columns = ['Title', 'Institution', 'Level', 'Rate', 'Gained Skills']

    query_vector = vectorizer.transform([user_skills])
    similarity = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Row positions, most similar first
    best_first = similarity.argsort()[::-1]

    ranked = df.iloc[best_first][display_columns]
    deduplicated = ranked.drop_duplicates(subset=['Title'])
    shortlist = deduplicated.head(top_n)

    return shortlist.reset_index(drop=True)


# Example: the printed result now carries institution, level and rating as well
recommendations = recommend_by_skills(
    user_skills="Python, Data Science, Machine Learning",
    df=df,
    vectorizer=vectorizer,
    tfidf_matrix=tfidf_matrix,
    top_n=5,
)

print(recommendations)

                                               Title  \
0                             Data Science Challenge   
1        Unsupervised Algorithms in Machine Learning   
2  Create Machine Learning Models in Microsoft Azure   
3          Recommender Systems with Machine Learning   
4                Developing AI Applications on Azure   

                      Institution         Level  Rate  \
0        Coursera Project Network  Intermediate   4.9   
1  University of Colorado Boulder  Intermediate   3.8   
2                       Microsoft  Intermediate   3.1   
3                           Packt  Intermediate   3.9   
4                      LearnQuest      Advanced   4.7   

                                       Gained Skills  
0  Applied Machine Learning, Jupyter, Machine Lea...  
1  Unsupervised Learning, Dimensionality Reductio...  
2  Unsupervised Learning, Scikit Learn (Machine L...  
3  Data Manipulation, Applied Machine Learning, D...  
4  Artificial Intelligence and Machine Learni

In [14]:
def recommend_by_skills(user_skills, df, vectorizer, tfidf_matrix, top_n=5):
    """Recommend up to ``top_n`` distinct-title courses with their match score.

    Parameters
    ----------
    user_skills : str
        Free-text query; passed to ``vectorizer.transform`` as a single document.
    df : pandas.DataFrame
        Course table with 'Title', 'Institution', 'Level', 'Rate' and
        'Gained Skills' columns. Rows are addressed by position, so they are
        assumed to line up with the rows of ``tfidf_matrix``.
    vectorizer : object
        Already-fitted vectorizer exposing ``transform``.
    tfidf_matrix : matrix
        Document matrix the query is compared against.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The selected columns plus 'Match Score' (cosine similarity rounded to
        three decimals), best match first, re-indexed from 0.
    """
    user_vec = vectorizer.transform([user_skills])
    sim_scores = cosine_similarity(user_vec, tfidf_matrix).flatten()

    # Row positions ordered from most to least similar
    top_indices = sim_scores.argsort()[::-1]

    # Keep the best-ranked row of every title and remember WHICH row positions
    # survived. Scores must be looked up through these positions: once a
    # duplicate title is dropped, the surviving rows are no longer the first
    # len(results) entries of top_indices.
    is_first_of_title = ~df['Title'].iloc[top_indices].duplicated().to_numpy()
    kept_indices = top_indices[is_first_of_title][:top_n]

    results = df.iloc[kept_indices][
        ['Title', 'Institution', 'Level', 'Rate', 'Gained Skills']
    ].copy()

    # Add similarity score (rounded for readability), aligned row by row
    results['Match Score'] = [round(sim_scores[i], 3) for i in kept_indices]

    return results.reset_index(drop=True)

# Example: lower-case, space-separated query; the result includes the match score
recommendations = recommend_by_skills(
    user_skills="machine learning python",
    df=df,
    vectorizer=vectorizer,
    tfidf_matrix=tfidf_matrix,
    top_n=5,
)
print(recommendations)

                                               Title  \
0          Recommender Systems with Machine Learning   
1  Scikit-Learn to Solve Regression Machine Learn...   
2                 Нейронные сети и глубокое обучение   
3  Create Machine Learning Models in Microsoft Azure   
4        Unsupervised Algorithms in Machine Learning   

                      Institution         Level  Rate  \
0                           Packt  Intermediate   3.9   
1        Coursera Project Network      Beginner   4.7   
2                 DeepLearning.AI  Intermediate   4.8   
3                       Microsoft  Intermediate   3.1   
4  University of Colorado Boulder  Intermediate   3.8   

                                       Gained Skills  Match Score  
0  Data Manipulation, Applied Machine Learning, D...        0.641  
1  Scikit Learn (Machine Learning Library), Predi...        0.641  
2  Deep Learning, Artificial Neural Networks, Sup...        0.620  
3  Unsupervised Learning, Scikit Learn (Machine 

In [17]:
from tabulate import tabulate

# Fetch recommendations (top_n left at its default) and render them as a boxed text table
recommendations = recommend_by_skills(
    user_skills="machine learning python",
    df=df,
    vectorizer=vectorizer,
    tfidf_matrix=tfidf_matrix,
)

print(tabulate(tabular_data=recommendations, headers="keys", tablefmt="fancy_grid"))

╒════╤════════════════════════════════════════════════════════════╤════════════════════════════════╤══════════════╤════════╤════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╤═══════════════╕
│    │ Title                                                      │ Institution                    │ Level        │   Rate │ Gained Skills                                                                                                                                                                                                                                                        

In [16]:
# Install the table-formatting dependency into the kernel's environment.
# Pinned to the release this notebook was run with; run this cell before any
# cell that imports `tabulate`.
%pip install tabulate==0.9.0

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [18]:
def truncate_text(text, max_len=80):
    """Shorten ``text`` for display.

    Text of at most ``max_len`` characters is returned unchanged; longer text is
    cut to its first ``max_len`` characters and "..." is appended (so the result
    is ``max_len + 3`` characters long).
    """
    if len(text) > max_len:
        return text[:max_len] + "..."
    return text

recommendations = recommend_by_skills(
    user_skills="machine learning python",
    df=df,
    vectorizer=vectorizer,
    tfidf_matrix=tfidf_matrix,
)

# Shorten every skills entry (converted to str first) so the table stays readable
recommendations["Gained Skills"] = recommendations["Gained Skills"].map(
    lambda skills: truncate_text(str(skills))
)

print(
    tabulate(
        tabular_data=recommendations,
        headers="keys",
        tablefmt="fancy_grid",
        showindex=True,
    )
)

╒════╤════════════════════════════════════════════════════════════╤════════════════════════════════╤══════════════╤════════╤═════════════════════════════════════════════════════════════════════════════════════╤═══════════════╕
│    │ Title                                                      │ Institution                    │ Level        │   Rate │ Gained Skills                                                                       │   Match Score │
╞════╪════════════════════════════════════════════════════════════╪════════════════════════════════╪══════════════╪════════╪═════════════════════════════════════════════════════════════════════════════════════╪═══════════════╡
│  0 │ Recommender Systems with Machine Learning                  │ Packt                          │ Intermediate │    3.9 │ Data Manipulation, Applied Machine Learning, Data Processing, Supervised Learnin... │         0.641 │
├────┼────────────────────────────────────────────────────────────┼─────────────────────────

In [19]:
# Turn the numeric match score into a percentage string such as "64.1%".
# Guarded on the column still being numeric so that re-running this cell does not
# try to multiply already-formatted strings.
if pd.api.types.is_numeric_dtype(recommendations["Match Score"]):
    recommendations["Match Score"] = (recommendations["Match Score"] * 100).round(1).astype(str) + "%"

In [20]:
# Print only the headline columns, with the row index shown
print(
    tabulate(
        tabular_data=recommendations.loc[:, ['Title', 'Institution', 'Rate', 'Match Score']],
        headers="keys",
        tablefmt="fancy_grid",
        showindex=True,
    )
)

╒════╤════════════════════════════════════════════════════════════╤════════════════════════════════╤════════╤═══════════════╕
│    │ Title                                                      │ Institution                    │   Rate │ Match Score   │
╞════╪════════════════════════════════════════════════════════════╪════════════════════════════════╪════════╪═══════════════╡
│  0 │ Recommender Systems with Machine Learning                  │ Packt                          │    3.9 │ 64.1%         │
├────┼────────────────────────────────────────────────────────────┼────────────────────────────────┼────────┼───────────────┤
│  1 │ Scikit-Learn to Solve Regression Machine Learning Problems │ Coursera Project Network       │    4.7 │ 64.1%         │
├────┼────────────────────────────────────────────────────────────┼────────────────────────────────┼────────┼───────────────┤
│  2 │ Нейронные сети и глубокое обучение                         │ DeepLearning.AI                │    4.8 │ 62.0%   

In [21]:
# Add a 1-based "Rank" column at the front of the table.
# Skipped when the column already exists: DataFrame.insert raises ValueError on a
# duplicate column name, which would make this cell fail on a second run.
if "Rank" not in recommendations.columns:
    recommendations.insert(0, "Rank", range(1, len(recommendations) + 1))

In [22]:
# Print the ranked table; the row index is hidden because "Rank" replaces it
print(
    tabulate(
        tabular_data=recommendations.loc[:, ['Rank', 'Title', 'Institution', 'Rate', 'Match Score']],
        headers="keys",
        tablefmt="fancy_grid",
        showindex=False,
    )
)

╒════════╤════════════════════════════════════════════════════════════╤════════════════════════════════╤════════╤═══════════════╕
│   Rank │ Title                                                      │ Institution                    │   Rate │ Match Score   │
╞════════╪════════════════════════════════════════════════════════════╪════════════════════════════════╪════════╪═══════════════╡
│      1 │ Recommender Systems with Machine Learning                  │ Packt                          │    3.9 │ 64.1%         │
├────────┼────────────────────────────────────────────────────────────┼────────────────────────────────┼────────┼───────────────┤
│      2 │ Scikit-Learn to Solve Regression Machine Learning Problems │ Coursera Project Network       │    4.7 │ 64.1%         │
├────────┼────────────────────────────────────────────────────────────┼────────────────────────────────┼────────┼───────────────┤
│      3 │ Нейронные сети и глубокое обучение                         │ DeepLearning.AI   

In [24]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def recommend_by_skills(user_skills, df, vectorizer, tfidf_matrix, top_n=5):
    """Recommend up to ``top_n`` distinct-title courses, ranked and scored.

    Parameters
    ----------
    user_skills : str
        Free-text query; passed to ``vectorizer.transform`` as a single document.
    df : pandas.DataFrame
        Course table with 'Title', 'Institution' and 'Rate' columns. Rows are
        addressed by position, so they are assumed to line up with the rows of
        ``tfidf_matrix``.
    vectorizer : object
        Already-fitted vectorizer exposing ``transform``.
    tfidf_matrix : matrix
        Document matrix the query is compared against.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'Rank' (1-based), 'Title', 'Institution', 'Rate' and
        'Match Score' (percentage string such as "66.2%"), best match first,
        re-indexed from 0.
    """
    # Transform user input into vector
    user_vec = vectorizer.transform([user_skills])
    sim_scores = cosine_similarity(user_vec, tfidf_matrix).flatten()

    # Row positions ordered from most to least similar
    top_indices = sim_scores.argsort()[::-1]

    # Keep the best-ranked row of every title and remember WHICH row positions
    # survived. Scores must be looked up through these positions: once a
    # duplicate title is dropped, the surviving rows are no longer the first
    # len(results) entries of top_indices.
    is_first_of_title = ~df['Title'].iloc[top_indices].duplicated().to_numpy()
    kept_indices = top_indices[is_first_of_title][:top_n]

    results = df.iloc[kept_indices][['Title', 'Institution', 'Rate']].copy()

    # Add similarity score as percentage, aligned row by row
    results['Match Score'] = [
        f"{round(sim_scores[i] * 100, 1)}%" for i in kept_indices
    ]

    # Add rank
    results.insert(0, "Rank", range(1, len(results) + 1))

    return results.reset_index(drop=True)
# Example: print the ranked recommendations as plain text without the row index
recs = recommend_by_skills(
    user_skills="machine learning, regression",
    df=df,
    vectorizer=vectorizer,
    tfidf_matrix=tfidf_matrix,
    top_n=5,
)
print(recs.to_string(index=False))

 Rank                                                     Title              Institution  Rate Match Score
    1                                Practical Machine Learning Johns Hopkins University   4.3       66.2%
    2                          Microsoft Azure Machine Learning                Microsoft   4.9       66.2%
    3 AI Workflow: Machine Learning, Visual Recognition and NLP                      IBM   4.3       66.1%
    4   Build Regression, Classification, and Clustering Models                CertNexus   4.5       66.1%
    5               Supervised Machine Learning: Classification                      IBM   4.8       65.1%


In [27]:
def recommend_by_skills(user_skills, df, vectorizer, tfidf_matrix, top_n=5):
    """Recommend up to ``top_n`` distinct-title courses, ranked and scored.

    Parameters
    ----------
    user_skills : str
        Free-text query; passed to ``vectorizer.transform`` as a single document.
    df : pandas.DataFrame
        Course table with 'Title', 'Institution' and 'Rate' columns. Rows are
        addressed by position, so they are assumed to line up with the rows of
        ``tfidf_matrix``.
    vectorizer : object
        Already-fitted vectorizer exposing ``transform``.
    tfidf_matrix : matrix
        Document matrix the query is compared against.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'Rank' (1-based), 'Title', 'Institution', 'Rate' and
        'Match Score' (percentage string such as "66.2%"), best match first.
        The original index labels of ``df`` are kept.
    """
    # Transform user input into vector
    user_vec = vectorizer.transform([user_skills])
    sim_scores = cosine_similarity(user_vec, tfidf_matrix).flatten()

    # Row positions ordered from most to least similar
    top_indices = sim_scores.argsort()[::-1]

    # Keep the best-ranked row of every title and remember WHICH row positions
    # survived. Scores must be looked up through these positions: once a
    # duplicate title is dropped, the surviving rows are no longer the first
    # len(results) entries of top_indices.
    is_first_of_title = ~df['Title'].iloc[top_indices].duplicated().to_numpy()
    kept_indices = top_indices[is_first_of_title][:top_n]

    # Only keep columns that exist in the course table
    results = df.iloc[kept_indices][['Title', 'Institution', 'Rate']].copy()

    # Add similarity score as percentage, aligned row by row
    results['Match Score'] = [
        f"{round(sim_scores[i] * 100, 1)}%" for i in kept_indices
    ]

    # Add ranking
    results.insert(0, "Rank", range(1, len(results) + 1))

    return results

In [28]:
from tabulate import tabulate

# Example: render the ranked recommendations as a GitHub-style markdown table
recs = recommend_by_skills(
    user_skills="machine learning, regression",
    df=df,
    vectorizer=vectorizer,
    tfidf_matrix=tfidf_matrix,
    top_n=5,
)
print(
    tabulate(
        tabular_data=recs,
        headers="keys",
        tablefmt="github",
        showindex=False,
    )
)

|   Rank | Title                                                     | Institution              |   Rate | Match Score   |
|--------|-----------------------------------------------------------|--------------------------|--------|---------------|
|      1 | Practical Machine Learning                                | Johns Hopkins University |    4.3 | 66.2%         |
|      2 | Microsoft Azure Machine Learning                          | Microsoft                |    4.9 | 66.2%         |
|      3 | AI Workflow: Machine Learning, Visual Recognition and NLP | IBM                      |    4.3 | 66.1%         |
|      4 | Build Regression, Classification, and Clustering Models   | CertNexus                |    4.5 | 66.1%         |
|      5 | Supervised Machine Learning: Classification               | IBM                      |    4.8 | 65.1%         |
