In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load required resources
nltk.download("stopwords")
nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Load datasets
df_ibm = pd.read_csv("https://raw.githubusercontent.com/WinNatch/IBM_SKILL/main/IBM%20Course%20Data%20_%20Rating%20-%20Data.csv", encoding="ISO-8859-1")
df_jobs = pd.read_csv("https://raw.githubusercontent.com/WinNatch/IBM_SKILL/main/Restructured_Job_Skills_Data.csv")
df_university = pd.read_json("https://raw.githubusercontent.com/WinNatch/IBM_SKILL/main/UoB_Business_School_Data.json", lines=True)

# IBM Data Cleaning
df_ibm = df_ibm[df_ibm["Level"].str.lower() != "application"]
df_ibm = df_ibm.rename(columns={"Desciption": "Description"})

# Select relevant columns
df_ibm = df_ibm[["Course_Name", "Description", "ILO", "Tags", "Duration", "Rating", "Level", "URL"]]
df_jobs = df_jobs[["Job Role", "Skill", "Percentage"]]
df_university = df_university[["Programme", "Overview", "Programme_Structure", "Career_Prospects", "Programme Catalogue", "Programme Details Mixed"]]

# Convert rating to numeric and drop NaN values
df_ibm["Rating"] = pd.to_numeric(df_ibm["Rating"], errors='coerce')
df_ibm = df_ibm.dropna(subset=["Rating"])

# Custom stopwords
custom_stopwords = set(stopwords.words("english")).union({
    "in", "on", "at", "by", "for", "with", "about", "as", "into", "through", "between",
    "and", "or", "but", "because", "so", "although", "unit",
    "like", "just", "very", "also", "more", "way", "course", "know", "youll", "team", "field", "learn", "skill",
    "learning", "step", "help", "cause", "human", "follow", "car", "type", "difference", "world", "discover",
    "design", "tool", "describe", "differentiate", "define", "explain", "learns", "result", "rule", "need",
    "value", "use", "identify", "explore", "create", "role", "function", "compare", "recognize"
})

# Define software & technique keywords
software_keywords = {kw.lower() for kw in {
    "Python", "Tableau", "Excel", "Power BI", "SQL", "Structured Query Language",
    "TensorFlow", "PyTorch", "AWS", "Google Cloud", "Azure", "Hadoop", "Spark",
    "Kubernetes", "Docker", "Jupyter", "PostgreSQL", "MongoDB", "MySQL", "BigQuery",
    "Snowflake", "SAS", "STATA", "MATLAB", "Scikit-learn", "Pandas", "NumPy",
    "Seaborn", "ggplot2", "Django", "Flask", "FastAPI", "IBM Watsonx", "IBM",
    "CSS", "HTML", "JavaScript", "Watson", "Studio", "Operations",
    "Agile", "UX", "Review", "R"
}}

technique_keywords = {kw.lower() for kw in {
    "NLP", "Natural Language Processing", "CNN", "Convolutional Neural Network",
    "RNN", "Recurrent Neural Network", "LSTM", "Long Short-Term Memory",
    "Generative Adversarial Network", "XGBoost", "Reinforcement Learning",
    "Gradient Boosting", "SVM", "Support Vector Machine", "Decision Tree",
    "Random Forest", "Clustering", "K-Means", "PCA", "Principal Component Analysis",
    "Feature Engineering", "Data Augmentation", "Machine Learning", "Deep Learning",
    "Neural Networks", "Cybersecurity", "Threats", "Sustainability", "Ethics", "Regression", "chatbot"
}}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# User input
print("\n\U0001F4E2 Please provide details about yourself for course recommendations:")
user_university_course = input("\n1️⃣ What is the university course you are currently studying? (Required)\nYour answer: ").strip()
user_career_aspiration = input("\n2️⃣ What is your career aspiration? (Optional - Type 'Don't know' to skip)\nYour answer: ").strip()
user_skill_interest = input("\n3️⃣ What are your interesting skills? (Optional - Type 'Don't know' to skip)\nYour answer: ").strip()

# Combine user inputs
user_input_combined = " ".join(filter(None, [user_university_course, user_career_aspiration, user_skill_interest]))

# Preprocessing function
def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = re.sub(r'[^\w\s]', '', text).lower()
    tokens = text.split()
    tokens = [word for word in tokens if word not in custom_stopwords]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return " ".join(tokens)

# Preprocess data
df_ibm["Processed_Text"] = df_ibm[["Description", "ILO", "Tags"]].astype(str).apply(lambda x: " ".join(x), axis=1).apply(preprocess_text)
df_university["Processed_Text"] = df_university[["Overview", "Programme_Structure", "Career_Prospects", "Programme Catalogue", "Programme Details Mixed"]].astype(str).apply(lambda x: " ".join(x), axis=1).apply(preprocess_text)

df_jobs["Processed_Text"] = df_jobs[["Job Role", "Skill"]].astype(str).apply(lambda x: " ".join(x), axis=1).apply(preprocess_text)

# Compute job role similarity using TF-IDF first
vectorizer_jobs = TfidfVectorizer()
tfidf_jobs = vectorizer_jobs.fit_transform(df_jobs["Job Role"])
tfidf_user_job = vectorizer_jobs.transform([preprocess_text(user_input_combined)])
similarity_jobs_tfidf = cosine_similarity(tfidf_user_job, tfidf_jobs).flatten()

# Compute job role similarity using BERT
user_embedding = bert_model.encode(user_input_combined, convert_to_tensor=True)
job_embeddings = bert_model.encode(df_jobs["Job Role"].tolist(), convert_to_tensor=True)
job_similarities = util.pytorch_cos_sim(user_embedding, job_embeddings).flatten().cpu().numpy()
job_similarities_bert = util.pytorch_cos_sim(user_embedding, job_embeddings).flatten().cpu().numpy()

# Combine both similarity measures
final_job_scores = (similarity_jobs_tfidf * 0.4) + (job_similarities_bert * 0.6)
best_job_index = np.argmax(final_job_scores)
best_job_score = final_job_scores[best_job_index]
best_matching_job = df_jobs.iloc[best_job_index]["Job Role"]
best_matching_job_skills = df_jobs[df_jobs["Job Role"] == best_matching_job]["Skill"].tolist()

# Compute TF-IDF similarity for university programs
vectorizer = TfidfVectorizer()
combined_corpus = list(df_university["Processed_Text"]) + [preprocess_text(user_input_combined)]
tfidf_university = vectorizer.fit_transform(combined_corpus[:-1])
tfidf_user = vectorizer.transform([combined_corpus[-1]])
similarity_university = cosine_similarity(tfidf_user, tfidf_university).flatten()

best_university_index = np.argmax(similarity_university)
best_university_score = similarity_university[best_university_index]
best_matching_university = df_university.iloc[best_university_index]["Programme"]

# Compute TF-IDF similarity for IBM courses
tfidf_ibm = vectorizer.fit_transform(df_ibm["Processed_Text"])
tfidf_user_ibm = vectorizer.transform([preprocess_text(user_input_combined)])
similarity_ibm = cosine_similarity(tfidf_user_ibm, tfidf_ibm).flatten()

df_ibm["Relevance_Score"] = similarity_ibm
df_ibm = df_ibm[df_ibm["Rating"] >= 4.0]

# Increase Weight for Technical Keywords**
boost_factor = 1.5  # Increased to give higher weight to tech skills
def boost_matching_keywords(text, base_score):
    tokens = text.lower().split()
    keyword_count = sum(1 for word in tokens if word in software_keywords or word in technique_keywords)
    return base_score * (1 + (boost_factor * keyword_count))  # Multiply instead of add

df_ibm["Relevance_Score"] = df_ibm.apply(lambda row: boost_matching_keywords(row["Processed_Text"], row["Relevance_Score"]), axis=1)

# Dynamic weight adjustment based on similarity scores
weights = {'ibm_course': 0.4, 'job_role': 0.3, 'university_program': 0.3}

if best_job_score < 0.4:
    weights['ibm_course'] += weights['job_role'] * 0.5
    weights['university_program'] += weights['job_role'] * 0.5
    weights['job_role'] = 0.0

# Normalize weights
total_weight = sum(weights.values())
weights = {k: v / total_weight for k, v in weights.items()}

# Compute final relevance score
df_ibm["Final_Relevance_Score"] = (
    df_ibm["Relevance_Score"] * weights['ibm_course'] +
    best_job_score * weights['job_role'] +
    best_university_score * weights['university_program'])

# Normalize scores to 0-100% scale
df_ibm["Matching_Percentage"] = (df_ibm["Final_Relevance_Score"] / df_ibm["Final_Relevance_Score"].max()) * 100
df_ibm["Matching_Percentage"] = df_ibm["Matching_Percentage"].apply(lambda x: min(100, max(10, x)))

# Ensure percentages look reasonable
df_ibm["Matching_Percentage"] = df_ibm["Matching_Percentage"].apply(lambda x: min(100, max(10, x)))

# Sort and get top recommendations
top_recommendations = df_ibm.sort_values(by="Final_Relevance_Score", ascending=False).head(3)

# Print best-matching job and university
print("\n🎯 **Best-Matching Job Role:**", best_matching_job, f"(Score: {best_job_score:.2f})")
print("   🔹 **Top Skills for This Job:**", ", ".join(best_matching_job_skills))
print("🎓 **Best-Matching University Programme:**", best_matching_university, f"(Score: {best_university_score:.2f})")

# Print top IBM course recommendations
print("\n🔹 **Top 3 IBM Courses Recommended for You:**\n")
for _, row in top_recommendations.iterrows():
    matching_keywords = [kw for kw in software_keywords.union(technique_keywords) if kw in row["Processed_Text"]]
    print(f"📚 **{row['Course_Name']}** ({row['Duration']}, Rating: {row['Rating']})")
    print(f"   🔹 **Course Level:** {row.get('Level', 'Unknown')}")
    print(f"   🔹 **Course URL:** {row.get('URL', 'No URL available')}")
    print(f"   🔹 **Matching Score:** {row['Matching_Percentage']:.2f}%")
    print(f"   🔹 **Matching Keywords:** {', '.join(matching_keywords) if matching_keywords else 'No matching keywords found'}\n")


📢 Please provide details about yourself for course recommendations:

1️⃣ What is the university course you are currently studying? (Required)
Your answer: Business Analytics

2️⃣ What is your career aspiration? (Optional - Type 'Don't know' to skip)
Your answer: Data Analyst

3️⃣ What are your interesting skills? (Optional - Type 'Don't know' to skip)
Your answer: Python

🎯 **Best-Matching Job Role:** data analyst data analyst (Score: 0.77)
   🔹 **Top Skills for This Job:** R, Excel, data, analysis, business, microsoft, teamwork communication, business analysis, complex, warehouse
🎓 **Best-Matching University Programme:** MSc Business Analytics (Score: 0.37)

🔹 **Top 3 IBM Courses Recommended for You:**

📚 **Overview of Data Tools and Languages** (1h 30m, Rating: 4.5)
   🔹 **Course Level:** Advanced
   🔹 **Course URL:** https://skills.yourlearning.ibm.com/activity/MDL-221
   🔹 **Matching Score:** 100.00%
   🔹 **Matching Keywords:** structured query language, python, r, studio, sql, ex