# In [3]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


# Load the risk-prediction table (presumably the output of the upstream
# classification step -- confirm against the pipeline) and expose the value of
# "Model_Prediction" under the "Risk_Category" column, which is the column the
# rest of this script reads the risk label from.
df = pd.read_csv("final_risk_predictions.csv").assign(
    Risk_Category=lambda frame: frame["Model_Prediction"]
)


# Build the three per-row feature views that are later compared pairwise:
# job-title text embeddings, one-hot domain indicators, and scaled numeric
# columns. Each view has one row per row of `df`, in the same order.

# --- Job-title embeddings ---
# Every job title is encoded with the "all-MiniLM-L6-v2" sentence-transformer.
# NOTE(review): the column is spelled "Job titiles" here; presumably this
# matches the CSV header -- confirm against the data before "fixing" it.
# NOTE(review): assumes every title is a non-missing string -- TODO confirm.
model = SentenceTransformer("all-MiniLM-L6-v2")
job_titles = df["Job titiles"].tolist()
job_embeddings = model.encode(job_titles, convert_to_numpy=True)


# --- Domain encoding ---
# One-hot encode the "Domain" column; sparse_output=False asks the encoder for
# a dense array rather than a sparse matrix.
encoder = OneHotEncoder(sparse_output=False)
domain_encoded = encoder.fit_transform(df[["Domain"]])


# --- Numeric features ---
# Standardize the three numeric columns with StandardScaler so that no single
# column dominates because of its scale.
# NOTE(review): assumes these columns are numeric with no missing values --
# TODO confirm.
numeric = df[["Tasks", "AI models", "AI_Workload_Ratio"]]
scaler = StandardScaler()
numeric_scaled = scaler.fit_transform(numeric)


# Pairwise cosine similarity between all rows, computed separately for each
# feature view (job-title embeddings, one-hot domain, scaled numeric columns).
job_sim = cosine_similarity(job_embeddings)
domain_sim = cosine_similarity(domain_encoded)
feat_sim = cosine_similarity(numeric_scaled)

# Blend weights for the three views; together they sum to 1.0.
JOB_WEIGHT = 0.45
DOMAIN_WEIGHT = 0.35
FEATURE_WEIGHT = 0.20

# Weighted sum of the three similarity matrices.
hybrid_sim = (
    JOB_WEIGHT * job_sim
    + DOMAIN_WEIGHT * domain_sim
    + FEATURE_WEIGHT * feat_sim
)

# Recommend lower-risk alternatives for every job whose Risk_Category is "High".
#
# For each high-risk row, all rows that are NOT "High" are ranked by hybrid
# similarity (highest first) and the best `top_k` are written out as
# (high-risk job, alternative job, similarity) records.
#
# The risk mask and the title/domain columns are extracted once as arrays so
# the loop does not build a pandas row object for every candidate of every
# high-risk job.
top_k = 3
output_columns = [
    "High_Risk_Job",
    "High_Risk_Domain",
    "Alternative_Job",
    "Alternative_Domain",
    "Similarity",
]

is_high_risk = (df["Risk_Category"] == "High").to_numpy()
title_values = df["Job titiles"].to_numpy()
domain_values = df["Domain"].to_numpy()

# Positions of the rows that may be offered as alternatives. A high-risk row
# is never in this set, so no separate "skip the job itself" check is needed.
candidate_idx = np.flatnonzero(~is_high_risk)

recommendations = []

for i in np.flatnonzero(is_high_risk):
    candidate_scores = hybrid_sim[i][candidate_idx]

    # Stable sort on the negated scores gives descending similarity while
    # keeping tied candidates in their original row order.
    best = np.argsort(-candidate_scores, kind="stable")[:top_k]

    for idx, score in zip(candidate_idx[best], candidate_scores[best]):
        recommendations.append({
            "High_Risk_Job": title_values[i],
            "High_Risk_Domain": domain_values[i],
            "Alternative_Job": title_values[idx],
            "Alternative_Domain": domain_values[idx],
            "Similarity": round(score, 4),
        })


# Passing the columns explicitly keeps the CSV header present even when there
# are no high-risk jobs (and therefore no recommendation rows).
rec_df = pd.DataFrame(recommendations, columns=output_columns)
rec_df.to_csv("final_hybrid_recommendations.csv", index=False)
