# Building a Recommendation Engine

In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
# Load the IT jobs table (job titles with their cleaned skill lists) from the Colab filesystem.
jobs = pd.read_csv('/content/it_jobs.csv')

In [None]:
# Load the Udemy course catalogue from the Colab filesystem.
courses1 = pd.read_csv('/content/udemy_courses.csv')

In [None]:
# Load the Coursera course catalogue from the Colab filesystem.
courses2 = pd.read_csv('/content/coursera_courses.csv')

In [None]:
# Preview the first rows of the jobs table.
jobs.head()

Unnamed: 0,Job Title,Cleaned_Job_Skills
0,ACCESSIBILITY SPECIALIST,"['web', 'accessibility', 'guideline', 'html', ..."
1,ADMIN BIG DATA,"['big', 'data', 'management', 'hadoop', 'spark..."
2,AGILE PROJECT MANAGER,"['agile', 'methodology', 'scrum', 'kanban', 'p..."
3,ANDROID DEVELOPER,"['java', 'kotlin', 'android', 'sdk', 'mobile',..."
4,ANSIBLE AUTOMATION ENGINEER,"['ansible', 'automation', 'script', 'linux', '..."


In [None]:
# Inspect column names, dtypes and non-null counts of the Udemy table.
courses1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20995 entries, 0 to 20994
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Title          20995 non-null  object 
 1   Summary        20993 non-null  object 
 2   Enrollment     20995 non-null  int64  
 3   Stars          20995 non-null  float64
 4   Rating         20995 non-null  int64  
 5   Link           20995 non-null  object 
 6   course_skills  20995 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 1.1+ MB


In [None]:
# Inspect column names, dtypes and non-null counts of the Coursera table.
courses2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   course_title              1000 non-null   object 
 1   course_organization       1000 non-null   object 
 2   course_certificate_type   1000 non-null   object 
 3   course_time               1000 non-null   object 
 4   course_rating             994 non-null    float64
 5   course_reviews_num        994 non-null    object 
 6   course_difficulty         1000 non-null   object 
 7   course_url                1000 non-null   object 
 8   course_students_enrolled  959 non-null    object 
 9   course_skills             1000 non-null   object 
 10  course_summary            1000 non-null   object 
 11  course_description        999 non-null    object 
dtypes: float64(1), object(11)
memory usage: 93.9+ KB


In [None]:
# Inspect column names, dtypes and non-null counts of the jobs table.
jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Job Title           294 non-null    object
 1   Cleaned_Job_Skills  294 non-null    object
dtypes: object(2)
memory usage: 4.7+ KB


In [None]:
!pip install pandas scikit-learn sentence-transformers

import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch




Loading Datasets

In [None]:
# Bind the working names used by the rest of the notebook.
# These are plain references, not copies: any in-place change made through
# jobs_df / coursera_df / udemy_df is also visible through jobs / courses2 / courses1.
jobs_df = jobs
coursera_df = courses2
udemy_df = courses1

## Normalize Coursera columns to align with Udemy

In [None]:
# Give the Coursera frame the same "Title" / "Link" column names the Udemy
# frame uses, so one recommendation routine can read either dataset.
# "course_skills" already carries the shared name; its entry maps it to itself.
# The rename is applied in place on the existing frame object.
coursera_df.rename(
    columns={
        "course_title": "Title",
        "course_url": "Link",
        "course_skills": "course_skills",
    },
    inplace=True,
)

In [None]:
def parse_enrollment(enroll):
    """Convert a raw enrollment value into an integer head count.

    Accepts plain integers, numeric values, and strings that may contain
    thousands separators (","), a trailing "+", surrounding whitespace, a
    decimal part ("1234.0") or a magnitude suffix ("5.3k", "1.2M", "2b").

    Args:
        enroll: Raw cell value (string, number, or missing).

    Returns:
        int: The parsed count, rounded to the nearest integer when the input
        is fractional. Missing or unparseable values yield 0.
    """
    if pd.isna(enroll):
        return 0

    cleaned = str(enroll).replace(",", "").replace("+", "").strip().lower()

    # Abbreviated counts carry their magnitude in the last character.
    suffix_multipliers = {"k": 1_000, "m": 1_000_000, "b": 1_000_000_000}
    multiplier = suffix_multipliers.get(cleaned[-1:], 1)
    if multiplier != 1:
        cleaned = cleaned[:-1].strip()

    try:
        # Exact path for plain integer strings (no float rounding involved).
        return int(cleaned) * multiplier
    except ValueError:
        pass

    try:
        # Handles float-like inputs such as "1234.0" or "5.3" (from "5.3k").
        return int(round(float(cleaned) * multiplier))
    except (ValueError, OverflowError):
        # Not a number at all (or inf/nan): treat as unknown enrollment.
        return 0

# Derive the numeric columns the recommender reads ("Enrollment", "Stars",
# "Rating") from Coursera's own columns.
# Enrollment: parsed element-wise from the raw enrolled-students text.
coursera_df["Enrollment"] = coursera_df["course_students_enrolled"].map(parse_enrollment)
# Stars: the course rating, with missing ratings replaced by 0.
coursera_df["Stars"] = coursera_df["course_rating"].fillna(value=0)
# Rating: Stars rounded to a whole number.
coursera_df["Rating"] = coursera_df["Stars"].round(decimals=0).astype(int)

## Step 1: Role Normalization using jobs_df and embeddings

In [None]:
# Sentence-embedding model shared by every step of the notebook.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Embed every known job title once up front; match_role compares user input
# against this tensor instead of re-encoding the titles on each call.
job_embeddings = model.encode(
    jobs_df["Job Title"].to_list(),
    convert_to_tensor=True,
)

def match_role(user_role):
    """Return the job title in ``jobs_df`` most similar to ``user_role``.

    The input text is embedded with the shared ``model`` and compared by
    cosine similarity against the precomputed ``job_embeddings``.

    Args:
        user_role: Free-text role description supplied by the user.

    Returns:
        The "Job Title" value of the row with the highest similarity score.
    """
    query_vector = model.encode(user_role, convert_to_tensor=True)
    # Take the first row of the similarity result: one score per job title.
    similarities = util.cos_sim(query_vector, job_embeddings)[0]
    top_position = similarities.argmax().item()
    return jobs_df["Job Title"].iloc[top_position]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Step 2: Skill Gap Analysis

In [None]:
def _parse_skill_list(raw_skills):
    """Normalize a raw skills cell into a de-duplicated list of lowercase skills.

    Accepts an actual list/tuple, a string holding a Python list literal
    (e.g. "['web', 'html']"), or a plain comma-separated string. Any other
    type yields an empty list. First-seen order is preserved.
    """
    import ast  # local import keeps this helper self-contained within the cell

    if isinstance(raw_skills, (list, tuple)):
        items = list(raw_skills)
    elif isinstance(raw_skills, str):
        try:
            parsed = ast.literal_eval(raw_skills.strip())
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            items = list(parsed)
        else:
            # Not a list literal: fall back to stripping brackets/quotes and
            # splitting on commas.
            stripped = raw_skills.strip("[]").replace("'", "").replace('"', "")
            items = stripped.split(",")
    else:
        items = []

    normalized = (str(item).strip().lower() for item in items)
    # dict.fromkeys drops duplicates while keeping the original order.
    return list(dict.fromkeys(skill for skill in normalized if skill))


def skill_gap(user_skills_dict, job_title):
    """Compare a user's self-rated skills with the skills a job requires.

    Args:
        user_skills_dict: Mapping of skill name -> numeric self-rating.
            Keys are compared case-insensitively after trimming whitespace.
        job_title: A value from the "Job Title" column of ``jobs_df``.

    Returns:
        tuple[list, list, list]: ``(strong, weak, missing)`` where
        ``strong`` are required skills the user rated >= 3, ``weak`` are
        required skills rated below 3, and ``missing`` are required skills
        the user did not list (in job-skill order, without duplicates).

    Raises:
        IndexError: If ``job_title`` is not present in ``jobs_df``.
    """
    matches = jobs_df.loc[jobs_df["Job Title"] == job_title, "Cleaned_Job_Skills"]
    if matches.empty:
        # Same exception type the bare positional lookup would raise, but
        # with a message that names the offending title.
        raise IndexError(f"Job title {job_title!r} not found in jobs_df")

    job_skills = _parse_skill_list(matches.iloc[0])
    required = set(job_skills)  # constant-time membership checks

    # Normalize user skill keys the same way job skills are normalized.
    user_skills = {str(k).lower().strip(): v for k, v in user_skills_dict.items()}

    strong, weak = [], []
    for skill, rating in user_skills.items():
        if skill not in required:
            continue
        (strong if rating >= 3 else weak).append(skill)

    missing = [skill for skill in job_skills if skill not in user_skills]

    return strong, weak, missing

## Step 3: Preprocessing Udemy and Coursera datasets

In [None]:
# Tokenize each Udemy course: comma-split skills plus lowercased title words.
# Non-string skill cells produce an empty skill list.
udemy_df["course_skill_list"] = [
    skills.split(",") if isinstance(skills, str) else []
    for skills in udemy_df["course_skills"]
]
udemy_df["title_tokens"] = [
    str(title).lower().split() for title in udemy_df["Title"]
]
# Skills first, then title words, concatenated per course.
udemy_df["combined_tokens"] = [
    skill_list + title_words
    for skill_list, title_words in zip(
        udemy_df["course_skill_list"], udemy_df["title_tokens"]
    )
]


In [None]:
# Tokenize each Coursera course: comma-split skills plus lowercased title words.
# Non-string skill cells produce an empty skill list.
coursera_df["course_skill_list"] = [
    skills.split(",") if isinstance(skills, str) else []
    for skills in coursera_df["course_skills"]
]
coursera_df["title_tokens"] = [
    str(title).lower().split() for title in coursera_df["Title"]
]
# Skills first, then title words, concatenated per course.
coursera_df["combined_tokens"] = [
    skill_list + title_words
    for skill_list, title_words in zip(
        coursera_df["course_skill_list"], coursera_df["title_tokens"]
    )
]


In [None]:
import itertools

# Build one averaged embedding per course from its token list
def compute_embeddings_for_courses(df):
    """Attach a ``course_emb`` column holding the mean token embedding per row.

    Every distinct token found in ``df["combined_tokens"]`` is encoded once
    with the shared ``model``; each course's embedding is the mean of its
    tokens' vectors. Rows with no tokens get ``None``.

    Args:
        df: Frame with a "combined_tokens" column of token lists.

    Returns:
        The same frame object, with the "course_emb" column added in place.
    """
    vocabulary = list({token for tokens in df["combined_tokens"] for token in tokens})
    vectors = model.encode(vocabulary, convert_to_tensor=True)
    lookup = dict(zip(vocabulary, vectors))

    def mean_vector(tokens):
        found = [lookup[token] for token in tokens if token in lookup]
        if not found:
            return None
        return torch.stack(found).mean(dim=0)

    df["course_emb"] = df["combined_tokens"].apply(mean_vector)
    return df

# Add the "course_emb" column to both catalogues. The helper writes the column
# onto the frame it is given and returns that same frame.
udemy_df = compute_embeddings_for_courses(udemy_df)
coursera_df = compute_embeddings_for_courses(coursera_df)

## Step 4: Course Recommendation function for any dataset

In [None]:
def recommend_courses_fast(dataframe, missing_skills, weak_skills, top_n=20, min_similarity=0.5):
    """Rank courses by similarity to the skills a user still needs.

    The missing and weak skills are embedded with the shared ``model`` and
    averaged into one target vector. All course embeddings are compared
    against it in a single batched cosine-similarity call.

    Args:
        dataframe: Course frame with a "course_emb" column of torch tensors
            (rows whose value is not a tensor are skipped) and, optionally,
            "Title", "Link", "Enrollment" and "Stars" columns.
        missing_skills: Skills the user does not have.
        weak_skills: Skills the user has but rated low.
        top_n: Maximum number of courses to return.
        min_similarity: Minimum cosine similarity a course must reach to be
            included. Defaults to 0.5.

    Returns:
        list[tuple]: ``(title, link, enrollment, stars, score)`` tuples,
        best match first, where ``score`` is the similarity rounded to two
        decimals. Empty when there are no target skills or no usable
        course embeddings.
    """
    # Sorted for a deterministic encoding order across runs.
    target_skills = sorted(set(missing_skills) | set(weak_skills))
    if not target_skills or "course_emb" not in dataframe.columns:
        return []

    target_embeddings = model.encode(target_skills, convert_to_tensor=True)
    target_emb = torch.mean(target_embeddings, dim=0, keepdim=True)

    # Keep only rows that really hold a tensor (skips None and NaN placeholders).
    embeddings = dataframe["course_emb"].tolist()
    valid_positions = [
        pos for pos, emb in enumerate(embeddings) if isinstance(emb, torch.Tensor)
    ]
    if not valid_positions:
        return []

    # One batched similarity call instead of one call per course; embeddings
    # are moved to the target's device so mixed CPU/GPU inputs do not fail.
    course_matrix = torch.stack([embeddings[pos] for pos in valid_positions])
    course_matrix = course_matrix.to(target_emb.device)
    similarities = util.cos_sim(target_emb, course_matrix)[0].tolist()

    # Rank on the unrounded score so near-ties are ordered correctly; the
    # stable sort keeps frame order for exact ties.
    ranked = sorted(
        (
            (sim, pos)
            for sim, pos in zip(similarities, valid_positions)
            if sim >= min_similarity
        ),
        key=lambda pair: pair[0],
        reverse=True,
    )[:top_n]

    course_scores = []
    for sim, pos in ranked:
        row = dataframe.iloc[pos]
        raw_enrollment = row.get("Enrollment", 0)
        # int(NaN) raises, so treat a missing enrollment as 0.
        enrollment = 0 if pd.isna(raw_enrollment) else int(raw_enrollment)
        course_scores.append(
            (
                row.get("Title", ""),
                row.get("Link", ""),
                enrollment,
                float(row.get("Stars", 0)),
                round(sim, 2),
            )
        )
    return course_scores

## Step 5: Example usage with user input

In [None]:
# Example input: a free-text target role and the user's self-rated skills
# (skill name -> rating; skill_gap treats ratings >= 3 as strong).
user_role = "Machine Learning"
user_skills = {"python": 4, "css": 2, "data": 3}

# Resolve the free-text role to a known job title, then split the user's
# skills against that job's requirements.
matched_role = match_role(user_role)
strong, weak, missing = skill_gap(user_skills, matched_role)

# Recommend courses from each catalogue for the missing and weak skills.
udemy_recs = recommend_courses_fast(udemy_df, missing, weak, top_n=20)
coursera_recs = recommend_courses_fast(coursera_df, missing, weak, top_n=20)

print("Matched Role:", matched_role)
print("Strong Skills:", strong)
print("Weak Skills:", weak)
print("Missing Skills:", missing)

# Each recommendation is (title, link, enrollment, stars, score); the score is
# unpacked but not printed.
print("\nTop 20 Udemy Courses:")
for title, url, enrollment, rating, score in udemy_recs:
    print(f"- {title} ({url}) | Enrollment: {enrollment}, Rating: {rating}")

print("\nTop 20 Coursera Courses:")
for title, url, enrollment, rating, score in coursera_recs:
    print(f"- {title} ({url}) | Enrollment: {enrollment}, Rating: {rating}")

Matched Role: MACHINE LEARNING ENGINEER
Strong Skills: ['python', 'data']
Weak Skills: []
Missing Skills: ['machine', 'learning', 'algorithm', 'analysis', 'model', 'deployment']

Top 20 Udemy Courses:
- Testing and Monitoring Machine Learning Model Deployments (https://www.udemy.com/course/testing-and-monitoring-machine-learning-model-deployments/) | Enrollment: 1654, Rating: 4.6
- Machine Learning Optimization Using Genetic Algorithm (https://www.udemy.com/course/machine-learning-optimization-using-genetic-algorithm/) | Enrollment: 1551, Rating: 4.5
- Machine Learning Classification Algorithms using MATLAB (https://www.udemy.com/course/supervised-machine-learning-classification-using-matlab/) | Enrollment: 2496, Rating: 3.8
- Machine Learning Guide: Learn Machine Learning Algorithms (https://www.udemy.com/course/machine-learning-algorithms/) | Enrollment: 10035, Rating: 3.5
- Machine Learning with Python|Business Applications|AI Robot (https://www.udemy.com/course/machine-learning-wit

## Saving Files Locally

In [None]:
# Persist the processed frames (the course frames include the "course_emb"
# column added earlier) to the current working directory.
# NOTE(review): "course_emb" holds torch tensors; if they were created on a GPU
# the pickles may not load on a CPU-only machine — confirm before reuse.
udemy_df.to_pickle("udemy_with_emb.pkl")
coursera_df.to_pickle("coursera_with_emb.pkl")
jobs_df.to_pickle("jobs.pkl")

In [None]:
import torch
# Save the precomputed job-title embedding tensor alongside the pickled frames.
torch.save(job_embeddings, "job_embeddings.pt")

In [None]:
from google.colab import files

# Download each saved artifact from the Colab runtime (Colab-only helper).
files.download("udemy_with_emb.pkl")
files.download("coursera_with_emb.pkl")
files.download("jobs.pkl")
files.download("job_embeddings.pt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>