In [5]:
from dotenv import load_dotenv
import os

import numpy as np
import pandas as pd
from huggingface_hub import InferenceClient
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Populate the process environment from a local .env file (if one is found).
load_dotenv()

# Report only whether the key is set -- the token value itself is never printed.
HF_TOKEN = os.environ.get("HUGGINGFACE_API_KEY")
print("Token loaded?", HF_TOKEN is not None)

Token loaded? True


In [7]:
# Skill vocabulary to look for. Order matters: downstream code pairs the
# i-th entry with the i-th embedding row / similarity column.
SKILLS = [
    # languages and querying
    "Python",
    "SQL",
    # analysis and BI tooling
    "Excel",
    "Tableau",
    "Pandas",
    "Power BI",
    "R",
    # infrastructure and engineering
    "AWS",
    "Git",
    "Spark",
    "Docker",
    "Linux",
    "REST APIs",
    # scientific Python libraries
    "NumPy",
    "Scikit-learn",
]

In [None]:
# Model id passed to the inference client for every feature-extraction call.
HF_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# The token is read from the environment again here (same variable as the
# earlier cell) and handed to the Hugging Face client.
HF_TOKEN = os.environ.get("HUGGINGFACE_API_KEY")
client = InferenceClient(token=HF_TOKEN)

def embed_texts(texts):
    """Embed text(s) via the Hugging Face client and L2-normalise each row.

    Parameters
    ----------
    texts : str or list of str
        Text(s) to embed. A single string is treated as a batch of one, so
        the result is always two-dimensional.

    Returns
    -------
    numpy.ndarray
        float32 array with one row per input text. Each row is divided by
        its L2 norm; rows whose norm is zero are left as all zeros instead
        of being divided by zero.
    """
    # A bare string would come back as a single 1-D vector and make the
    # row-wise norm (axis=1) below raise, so promote it to a one-item batch.
    if isinstance(texts, str):
        texts = [texts]

    vecs = client.feature_extraction(texts, model=HF_MODEL)
    # atleast_2d is a safety net in case a one-item batch is returned flat.
    vecs = np.atleast_2d(np.asarray(vecs, dtype="float32"))

    # Normalising (L2 norm); zero norms are replaced by 1 to avoid 0/0.
    norms = np.linalg.norm(vecs, axis=1, keepdims=True)
    norms[norms == 0.0] = 1.0
    return vecs / norms

# Embed the skill vocabulary up front; detect_skills compares chunk
# embeddings against this matrix (one normalised row per entry in SKILLS).
skill_emb = embed_texts(SKILLS)

In [9]:
# Sanity check: the first dimension of the embedding shape should equal the
# number of skills (one embedding row per skill).
print("Number of skills:", len(SKILLS))
print("Embedding shape:", skill_emb.shape)

Number of skills: 15
Embedding shape: (15, 384)


In [None]:
def detect_skills(text, threshold=0.45):
    """Return the skills from SKILLS that some chunk of `text` resembles.

    The text is cut into chunks at newlines and periods, each chunk is
    embedded, and a skill counts as present when its highest cosine
    similarity against any chunk is at least `threshold`.

    Parameters
    ----------
    text : str
        Free text to scan.
    threshold : float, default 0.45
        Minimum best-chunk similarity for a skill to be reported.

    Returns
    -------
    set of str
        Matching skill names; empty when the text yields no non-blank chunk.
    """
    # Newlines act as sentence breaks; blank fragments are discarded.
    fragments = (part.strip() for part in text.replace("\n", ". ").split("."))
    chunks = [fragment for fragment in fragments if fragment]
    if len(chunks) == 0:
        return set()

    sims = cosine_similarity(embed_texts(chunks), skill_emb)  # (n_chunks x n_skills)
    # Best score per skill across all chunks (column-wise maximum).
    best_per_skill = sims.max(axis=0)
    return {
        skill
        for idx, skill in enumerate(SKILLS)
        if best_per_skill[idx] >= threshold
    }