In [116]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

In [117]:
career_df = pd.read_csv("generated_career_paths.csv")
career_df.drop_duplicates(inplace=True)
career_df.head(3)

Unnamed: 0,path_id,level,role,required_skills,avg_salary,industry
0,Doctor Path,1,Medical Intern,"Anatomy,Clinical Practice,Patient Care",50000,Healthcare
2,Blockchain Path,1,Smart Contract Dev Intern,"Solidity,Remix IDE",40000,Web3
3,Film Production Path,1,Production Assistant,"Coordination,Script Reading,Equipment Handling",30000,Media


In [118]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [119]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Dwi Wahyu
[nltk_data]     Lestari\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Dwi Wahyu
[nltk_data]     Lestari\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [120]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_and_lemmatize(text):
    text = text.lower()

    # remove numbers and special characters
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()

    # remove stopwords
    words = [w for w in words if w not in stop_words]

    # lemmatize
    words = [lemmatizer.lemmatize(w) for w in words]
    return ' '.join(words)

In [121]:
career_df['required_skills'] = career_df['required_skills'].apply(clean_and_lemmatize)
career_df.head(3)

Unnamed: 0,path_id,level,role,required_skills,avg_salary,industry
0,Doctor Path,1,Medical Intern,anatomyclinical practicepatient care,50000,Healthcare
2,Blockchain Path,1,Smart Contract Dev Intern,solidityremix ide,40000,Web3
3,Film Production Path,1,Production Assistant,coordinationscript readingequipment handling,30000,Media


In [122]:
career_df["combined_text"] = (
    career_df["path_id"].astype(str) + " " +
    career_df["role"].astype(str) + " " +
    career_df["required_skills"].astype(str)
)

In [123]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(career_df["combined_text"])

In [124]:
model_data = {
    "vectorizer": vectorizer,
    "tfidf_matrix": tfidf_matrix,
    "dataframe": career_df
}

In [125]:
joblib.dump(model_data, "career_recommender_tfidf.joblib")

['career_recommender_tfidf.joblib']

In [126]:
model = joblib.load("career_recommender_tfidf.joblib")
vectorizer = model["vectorizer"]
tfidf_matrix = model["tfidf_matrix"]
career_df = model["dataframe"]

In [127]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_from_model(query_text: str, preferred_industries: list = None, top_n: int = 5):
    query_vec = vectorizer.transform([query_text])
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

    df_copy = career_df.copy()
    df_copy["similarity"] = similarities

    if preferred_industries:
        df_copy = df_copy[df_copy["industry"].isin(preferred_industries)]

    recommended = df_copy.sort_values(by=["similarity", "level"], ascending=[False, True])
    return recommended[["path_id", "level", "role", "required_skills", "avg_salary", "industry", "similarity"]].head(top_n)


In [128]:
recommend_from_model("data analysis, python, sql, statistics")

Unnamed: 0,path_id,level,role,required_skills,avg_salary,industry,similarity
32,Statistics Path,1,Data Assistant,excelrdata entry,40000,Science,0.519544
12,Cybersecurity Path,1,Security Analyst,networkingpythonlog analysis,65000,Tech,0.25491
113,Statistics Path,3,Senior Statistician,modelingpublicationsleadership,95000,Science,0.252109
75,Statistics Path,2,Statistician,hypothesis testingrspss,70000,Science,0.233821
7,Game Dev Path,1,Game Tester,bug reportinggameplay analysis,30000,Gaming,0.193119
