In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# -----------------------------
# 1. Load Data
# -----------------------------
df = pd.read_excel("school_dataset.xlsx")

# Normalize column names to lowercase and underscores
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_").str.replace("/", "_")

print("Available columns after renaming:", df.columns.tolist())
# Expected: ['stream', 'skills', 'jobs_opportunities', 'workshops_competitions']

# -----------------------------
# 2. Preprocessing
# -----------------------------
text_columns = ["skills", "jobs_opportunities", "workshops_competitions"]
for col in text_columns:
    df[col] = df[col].astype(str).fillna("")

# -----------------------------
# 3. Feature Engineering
# -----------------------------
# Normalize skills text
df["skills_normalized"] = df["skills"].str.lower()

# TF-IDF vectorizer on skills
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(df["skills_normalized"])

# -----------------------------
# 4. Recommendation Function
# -----------------------------
def get_missing_skills(user_skills, required_skills):
    user_set = set(user_skills.lower().split(", "))
    required_set = set(required_skills.lower().split(", "))
    return list(required_set - user_set)

def recommend_for_student(user_input_skills, top_n=5):
    user_skills = user_input_skills.lower()
    user_vector = vectorizer.transform([user_skills])

    # Similarity score
    cosine_similarities = cosine_similarity(user_vector, tfidf_matrix).flatten()
    df["similarity_score"] = cosine_similarities

    # Final ranking
    top_matches = df.sort_values(by="similarity_score", ascending=False).head(top_n).copy()
    top_matches["missing_skills"] = top_matches["skills"].apply(lambda x: get_missing_skills(user_skills, x))

    return top_matches[["stream", "jobs_opportunities", "workshops_competitions", "similarity_score", "missing_skills"]]

# -----------------------------
# 5. Example Usage
# -----------------------------
user_input_skills = "Python, Problem Solving"
recommendations = recommend_for_student(user_input_skills, top_n=5)

print("Top Recommendations for Student:")
print(recommendations)


Available columns after renaming: ['stream', 'skills', 'jobs_opportunities', 'workshops_competitions']
Top Recommendations for Student:
     stream           jobs_opportunities   workshops_competitions  \
5      Arts  Junior Data Entry Assistant  Photography Competition   
19     Arts            Creative Designer         School Hackathon   
39     Arts         School Web Developer  Photography Competition   
1   Science   Student Research Assistant        Robotics Workshop   
3   Science   Student Research Assistant             Science Fair   

    similarity_score   missing_skills  
5           0.639493           [html]  
19          0.639493           [html]  
39          0.639493           [html]  
1           0.506984  [math modeling]  
3           0.506984  [math modeling]  


In [2]:
print(df.columns.tolist())

['stream', 'skills', 'jobs_opportunities', 'workshops_competitions', 'skills_normalized', 'similarity_score']


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# -----------------------------
# 1. Load Dataset
# -----------------------------
df = pd.read_excel("school_dataset.xlsx")

# Standardize column names
df.columns = df.columns.str.strip().str.title()
df.rename(columns={
    "Stream": "stream",
    "Skills": "skills",
    "Jobs/Opportunities": "jobs_opportunities",
    "Workshops/Competitions": "workshops_competitions"
}, inplace=True)

print("Available columns after renaming:", df.columns.tolist())

# -----------------------------
# 2. Preprocessing
# -----------------------------
text_columns = ["skills", "jobs_opportunities", "workshops_competitions"]
for col in text_columns:
    df[col] = df[col].astype(str).fillna("")

# Normalize skills
df["skills_normalized"] = df["skills"].str.lower().str.replace(r"[^a-zA-Z0-9, ]", "", regex=True)

# -----------------------------
# 3. Vectorization (TF-IDF)
# -----------------------------
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["skills_normalized"])
dataset_vocab = set(vectorizer.get_feature_names_out())  # vocabulary of dataset skills

# -----------------------------
# 4. Similarity Function
# -----------------------------
def get_missing_skills(user_skills, dataset_skills):
    return list(set(dataset_skills) - set(user_skills))

def recommend_for_student(user_input, top_n=5):
    # Clean and split user input
    user_input_cleaned = user_input.lower().replace(",", " ")
    user_skills = set(user_input_cleaned.split())

    # Identify seen vs unseen skills
    seen_skills = user_skills.intersection(dataset_vocab)
    unseen_skills = user_skills - dataset_vocab

    # Encode seen skills
    user_vector = vectorizer.transform([" ".join(seen_skills)])
    similarity_scores = cosine_similarity(user_vector, tfidf_matrix).flatten()
    df["similarity_score_tfidf"] = similarity_scores
    df["final_similarity"] = df["similarity_score_tfidf"]

    # Case 1: No overlap at all
    if len(seen_skills) == 0:
        print("\n⚠️ None of the input skills were found in the dataset.")
        fallback = pd.DataFrame({
            "stream": ["General", "General", "General"],
            "jobs_opportunities": [
                "Internships in Emerging Tech (AI, IoT, AR/VR)",
                "General Research Internships",
                "Community Volunteering / Leadership Roles"
            ],
            "workshops_competitions": [
                "Soft Skills & Communication Workshop",
                "Entrepreneurship & Innovation Bootcamp",
                "Hackathons / Ideathons (Open to All)"
            ],
            "final_similarity": [0, 0, 0],
            "missing_skills": [["Add skills to dataset"], ["Add skills to dataset"], ["Add skills to dataset"]],
            "unseen_input_skills": [list(unseen_skills)] * 3
        })
        return fallback

    # Case 2: Partial match (some seen, some unseen)
    top_matches = df.sort_values(by="final_similarity", ascending=False).head(top_n).copy()
    top_matches["missing_skills"] = top_matches["skills_normalized"].apply(lambda x: get_missing_skills(seen_skills, x.split()))
    top_matches["unseen_input_skills"] = [list(unseen_skills)] * len(top_matches)

    return top_matches[["stream", "jobs_opportunities", "workshops_competitions", "final_similarity", "missing_skills", "unseen_input_skills"]]

# -----------------------------
# 5. Example Usage
# -----------------------------
user_input_skills = "Python, HTML, Communication, QuantumComputing, Leadership"
recommendations = recommend_for_student(user_input_skills, top_n=5)

print("\n✅ Top Recommendations for Student:")
print(recommendations)


Available columns after renaming: ['stream', 'skills', 'jobs_opportunities', 'workshops_competitions']

✅ Top Recommendations for Student:
     stream           jobs_opportunities   workshops_competitions  \
5      Arts  Junior Data Entry Assistant  Photography Competition   
19     Arts            Creative Designer         School Hackathon   
39     Arts         School Web Developer  Photography Competition   
1   Science   Student Research Assistant        Robotics Workshop   
3   Science   Student Research Assistant             Science Fair   

    final_similarity             missing_skills  \
5           1.000000                    [html,]   
19          1.000000                    [html,]   
39          1.000000                    [html,]   
1           0.324212  [modeling, python,, math]   
3           0.324212  [modeling, python,, math]   

                              unseen_input_skills  
5   [leadership, communication, quantumcomputing]  
19  [leadership, communication, qua

In [6]:
from sentence_transformers import SentenceTransformer
import pickle

# Define model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Save model with pickle
with open("school.pkl", "wb") as f:
    pickle.dump(model, f)

# Load model back
with open("school.pkl", "rb") as f:
    loaded_model = pickle.load(f)

print("Model loaded:", type(loaded_model))


  from .autonotebook import tqdm as notebook_tqdm


Model loaded: <class 'sentence_transformers.SentenceTransformer.SentenceTransformer'>
