In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
import joblib
import os
# ---------------------------------------------------------------------------
# Train the career-recommendation model and persist it with its encoders.
# ---------------------------------------------------------------------------

# Absolute path of the training workbook.
DATA_PATH = "/content/Copy of gen-AI_with_job_titles____.xlsx"
# Directory that receives the fitted model and encoders.
MODEL_DIR = 'model'
# Delimiters of the free-text list columns: semicolon, comma or newline.
# Written as `\n` on purpose: inside a raw string `\\n` would mean
# "a backslash or the letter n" and would cut words such as "python"
# at every "n".
LIST_DELIMITERS = r'[;,\n]'
# Columns removed when present in the workbook.
OPTIONAL_DROP_COLUMNS = ['Certificate Title', 'Unnamed: 6']
# Rows lacking any of these columns are dropped before training.
REQUIRED_COLUMNS = ['Interests', 'Skills', 'Job Title']
# Placeholder class for blank cells in the label-encoded feature columns.
MISSING_CATEGORY = 'Unknown'


def split_multi_value(text):
    """Split a delimited free-text cell into clean, lower-case tokens.

    Any code that later feeds the fitted binarizers must tokenise with the
    same rule, otherwise its tokens will not match the learned vocabulary.

    Args:
        text: Cell value; it is converted with ``str`` before splitting.

    Returns:
        list[str]: Stripped, lower-cased, non-empty tokens in input order.
    """
    pieces = re.split(LIST_DELIMITERS, str(text).lower())
    return [piece.strip() for piece in pieces if piece.strip()]


def clean_category(series):
    """Prepare a categorical column for ``LabelEncoder``.

    Blank cells become ``MISSING_CATEGORY`` (a column mixing strings and NaN
    cannot be label-encoded), every value is cast to ``str`` and surrounding
    whitespace is removed so that "B.Tech " and "B.Tech" are a single class.

    Args:
        series: pandas Series holding one categorical feature.

    Returns:
        pandas Series of stripped strings without missing values.
    """
    return series.fillna(MISSING_CATEGORY).astype(str).str.strip()


# A notebook kernel runs cells with __name__ == "__main__", so the code below
# executes like ordinary top-level code and every variable stays a notebook
# global. The guard only lets the helpers above be imported without touching
# the file system.
if __name__ == "__main__":
    # ---- Load and filter --------------------------------------------------
    df = pd.read_excel(DATA_PATH)
    df = df.drop(columns=[col for col in OPTIONAL_DROP_COLUMNS if col in df.columns])
    # .copy() so the column assignments below act on an independent frame.
    df = df.dropna(subset=REQUIRED_COLUMNS).copy()

    # ---- Tokenise the multi-valued text columns -----------------------------
    df['Interests_list'] = df['Interests'].apply(split_multi_value)
    df['Skills_list'] = df['Skills'].apply(split_multi_value)

    # ---- Encode the single-valued categorical features ----------------------
    # NOTE(review): LabelEncoder assigns arbitrary integer codes to nominal
    # categories; one-hot encoding may be worth evaluating.
    le_ug_degree = LabelEncoder()
    X_deg = le_ug_degree.fit_transform(clean_category(df['UG degree'])).reshape(-1, 1)

    le_ug_specialization = LabelEncoder()
    X_spec = le_ug_specialization.fit_transform(
        clean_category(df['UG Specialization'])
    ).reshape(-1, 1)

    le_add_cert = LabelEncoder()
    X_cert = le_add_cert.fit_transform(
        clean_category(df['Additional certification'])
    ).reshape(-1, 1)

    # ---- Multi-hot encode interests and skills ------------------------------
    mlb_interests = MultiLabelBinarizer()
    X_interests = mlb_interests.fit_transform(df['Interests_list'])
    mlb_skills = MultiLabelBinarizer()
    X_skills = mlb_skills.fit_transform(df['Skills_list'])

    # Column order (degree, specialization, certification, interests, skills)
    # is part of the model's contract: prediction inputs must be stacked the
    # same way.
    X = np.hstack([X_deg, X_spec, X_cert, X_interests, X_skills])

    # ---- Target and model ---------------------------------------------------
    le_target = LabelEncoder()
    y = le_target.fit_transform(df['Job Title'])

    model = RandomForestClassifier(n_estimators=150, random_state=42)
    model.fit(X, y)

    # ---- Persist the model together with every fitted encoder ---------------
    os.makedirs(MODEL_DIR, exist_ok=True)
    artifacts = {
        'career_model.pkl': model,
        'mlb_skills.pkl': mlb_skills,
        'mlb_interests.pkl': mlb_interests,
        'le_ug_degree.pkl': le_ug_degree,
        'le_ug_specialization.pkl': le_ug_specialization,
        'le_additional_cert.pkl': le_add_cert,
        'le_target.pkl': le_target,
    }
    for filename, fitted in artifacts.items():
        joblib.dump(fitted, os.path.join(MODEL_DIR, filename))


['model/le_target.pkl']

In [None]:
import numpy as np
import re
import joblib
# ---------------------------------------------------------------------------
# Interactive job recommendation and skill-gap report from saved artifacts.
# ---------------------------------------------------------------------------

# Delimiters of the free-text answers: semicolon, comma or newline.
# Written as `\n` on purpose: inside a raw string `\\n` would mean
# "a backslash or the letter n" and would cut words such as "python"
# at every "n", so entered skills would never match the reference lists.
LIST_DELIMITERS = r'[;,\n]'
# Number of job titles shown to the user.
TOP_N = 5

# Course URL suggested for each skill (keys are lower-case skill names).
skills_to_links = {
    'python': 'https://www.coursera.org/learn/python',
    'r': 'https://www.coursera.org/learn/r-programming',
    'java': 'https://www.coursera.org/specializations/java-programming',
    'c++': 'https://www.coursera.org/learn/c-plus-plus-a',
    'sql': 'https://www.coursera.org/learn/learn-sql-basics-data-science',
    'pandas': 'https://www.coursera.org/learn/data-analysis-with-python',
    'numpy': 'https://www.coursera.org/learn/data-analysis-with-python',
    'matplotlib': 'https://www.coursera.org/learn/python-for-data-visualization',
    'seaborn': 'https://www.coursera.org/learn/python-for-data-visualization',
    'machine learning': 'https://www.coursera.org/learn/machine-learning',
    'deep learning': 'https://www.coursera.org/specializations/deep-learning',
    'natural language processing': 'https://www.coursera.org/specializations/natural-language-processing',
    'computer vision': 'https://www.coursera.org/specializations/computer-vision',
    'tensorflow': 'https://www.coursera.org/learn/deep-learning-tensorflow',
    'keras': 'https://www.coursera.org/specializations/deep-learning',
    'pytorch': 'https://www.coursera.org/learn/deep-neural-network',
    'scikit-learn': 'https://www.coursera.org/learn/machine-learning-with-python',
    'statistics': 'https://www.coursera.org/learn/probability-statistics',
    'probability': 'https://www.coursera.org/learn/probability-theory-for-data-science',
    'linear algebra': 'https://www.coursera.org/learn/linear-algebra-machine-learning',
    'calculus': 'https://www.coursera.org/learn/calculus-essentials',
    'tableau': 'https://www.coursera.org/learn/data-visualization-tableau',
    'power bi': 'https://www.coursera.org/learn/analytics-tools',
    'excel': 'https://www.coursera.org/learn/excel',
    'hadoop': 'https://www.coursera.org/learn/hadoop-platform',
    'spark': 'https://www.coursera.org/learn/scala-spark-big-data',
    'aws': 'https://www.coursera.org/learn/aws-cloud',
    'azure': 'https://www.coursera.org/learn/azure-fundamentals',
    'docker': 'https://www.coursera.org/learn/docker',
    'git': 'https://www.coursera.org/learn/introduction-git-github',
    'nlp': 'https://www.coursera.org/specializations/natural-language-processing',
    'data visualization': 'https://www.coursera.org/learn/python-for-data-visualization',
    'communication': 'https://www.coursera.org/learn/effective-communication',
    'leadership': 'https://www.coursera.org/learn/leadership-skills',
    'data analysis': 'https://www.coursera.org/learn/pandas-data-analysis'
}

# Skills expected for each job title (lower-case, compared with user input).
job_to_skills = {
    'Data Scientist': ['python','r','machine learning','data visualization','statistics','sql','pandas','numpy'],
    'Business Analyst': ['excel','sql','tableau','data visualization','communication'],
    'HR Manager': ['communication','leadership','recruitment'],
    'Financial Analyst': ['excel','statistics','accounting','financial modeling'],
    'Product Manager': ['leadership','communication','data analysis','agile'],
    'Software Engineer': ['python','java','c++','data structures','algorithms'],
    'Sales Representative': ['communication','crm','negotiation'],
    'Graphic Designer': ['photoshop','illustrator','creativity'],
    'Cyber Security Analyst': ['python','linux','networking','security','encryption'],
    'Marketing Executive': ['seo','google analytics','content marketing','social media']
}


def split_multi_value(text):
    """Split a delimited free-text answer into clean, lower-case tokens.

    Args:
        text: Raw answer; it is converted with ``str`` before splitting.

    Returns:
        list[str]: Stripped, lower-cased, non-empty tokens in input order.
    """
    pieces = re.split(LIST_DELIMITERS, str(text).lower())
    return [piece.strip() for piece in pieces if piece.strip()]


def encode_known_label(encoder, value, label):
    """Encode one categorical answer with a fitted label encoder.

    The answer is matched against ``encoder.classes_`` ignoring case and
    surrounding whitespace. When nothing matches, a warning is printed and
    the encoder's first class is used as a fallback so prediction can
    still proceed.

    Args:
        encoder: Fitted encoder exposing ``classes_`` and ``transform``.
        value: The user's answer.
        label: Field name used in the warning message.

    Returns:
        Array of shape (1, 1) holding the encoded class.
    """
    classes = list(encoder.classes_)
    by_normalised = {}
    for known in classes:
        # setdefault keeps the first class when two differ only by case.
        by_normalised.setdefault(str(known).strip().lower(), known)

    wanted = value.strip().lower()
    if wanted in by_normalised:
        chosen = by_normalised[wanted]
    else:
        print(f"Warning: {label} not seen in training, using fallback.")
        chosen = classes[0]
    return encoder.transform([chosen]).reshape(1, -1)


def keep_known_tokens(tokens, known_classes, label):
    """Drop tokens a fitted binarizer has never seen, telling the user which.

    The binarizer only recognises tokens produced by the same splitting rule
    at fit time. If every token is reported here, the saved artifacts were
    probably fitted with a different rule and need to be regenerated.

    Args:
        tokens: Tokens entered by the user.
        known_classes: The binarizer's ``classes_``.
        label: Field name used in the warning message.

    Returns:
        list[str]: The tokens present in ``known_classes``, in input order.
    """
    vocabulary = set(known_classes)
    unseen = [tok for tok in tokens if tok not in vocabulary]
    if unseen:
        print(f"Warning: {label} not seen in training, ignored: {', '.join(unseen)}")
    return [tok for tok in tokens if tok in vocabulary]


def describe_skill_gap(title, required, owned, links):
    """Build the report lines for the skills a user lacks for one job.

    Args:
        title: Job title shown in the header line.
        required: Skills expected for the job.
        owned: Skills the user entered.
        links: Mapping from skill to course URL.

    Returns:
        list[str]: A header followed by one line per missing skill, or an
        empty list when nothing is missing. Skills without a course link are
        still listed so the gap is never hidden.
    """
    owned_set = set(owned)
    missing = [skill for skill in required if skill not in owned_set]
    if not missing:
        return []
    lines = [f"Missing skills for {title}:"]
    for skill in missing:
        link = links.get(skill)
        if link:
            lines.append(f"- {skill}: {link}")
        else:
            lines.append(f"- {skill}: (no course link available)")
    return lines


# A notebook kernel runs cells with __name__ == "__main__", so the code below
# executes like ordinary top-level code and every variable stays a notebook
# global. The guard only lets the helpers above be imported without prompting
# for input or reading files.
if __name__ == "__main__":
    # ---- Load the fitted model and encoders ---------------------------------
    model = joblib.load('model/career_model.pkl')
    mlb_skills = joblib.load('model/mlb_skills.pkl')
    mlb_interests = joblib.load('model/mlb_interests.pkl')
    le_ug_degree = joblib.load('model/le_ug_degree.pkl')
    le_ug_specialization = joblib.load('model/le_ug_specialization.pkl')
    le_additional_cert = joblib.load('model/le_additional_cert.pkl')
    le_target = joblib.load('model/le_target.pkl')

    # ---- Collect the user's profile -----------------------------------------
    ug_degree = input("Enter UG degree: ").strip()
    ug_spec = input("Enter UG specialization: ").strip()
    cert = input("Enter Additional certification: ").strip()
    interests_input = input("Enter Interests (semicolon-separated): ")
    skills_input = input("Enter Skills (semicolon-separated): ")

    user_interests_list = split_multi_value(interests_input)
    user_skills_list = split_multi_value(skills_input)

    # ---- Encode the profile -------------------------------------------------
    deg_enc = encode_known_label(le_ug_degree, ug_degree, "UG degree")
    spec_enc = encode_known_label(le_ug_specialization, ug_spec, "UG specialization")
    cert_enc = encode_known_label(le_additional_cert, cert, "Additional certification")

    user_interests_enc = mlb_interests.transform(
        [keep_known_tokens(user_interests_list, mlb_interests.classes_, "Interests")]
    )
    user_skills_enc = mlb_skills.transform(
        [keep_known_tokens(user_skills_list, mlb_skills.classes_, "Skills")]
    )

    # Stacking order: degree, specialization, certification, interests,
    # skills. It has to equal the order of the model's training matrix.
    X_input = np.hstack([deg_enc, spec_enc, cert_enc, user_interests_enc, user_skills_enc])

    # ---- Rank job titles ----------------------------------------------------
    probs = model.predict_proba(X_input)[0]
    top_idx = np.argsort(probs)[::-1][:TOP_N]
    top_probs = probs[top_idx]
    # Probability columns follow model.classes_, so translate positions into
    # encoded labels before mapping them back to job titles.
    top_titles = le_target.inverse_transform(model.classes_[top_idx])

    print("Top job recommendations:")
    for title, prob in zip(top_titles, top_probs):
        print(f"{title}: {prob*100:.2f}% match")

    # ---- Skill-gap report ---------------------------------------------------
    # Uses everything the user typed, including skills the model ignores.
    user_skills_set = set(user_skills_list)
    for title in top_titles:
        report = describe_skill_gap(
            title, job_to_skills.get(title, []), user_skills_set, skills_to_links
        )
        for line in report:
            print(line)


Enter UG degree: B.Tech
Enter UG specialization: Computer Science
Enter Additional certification: Artificial Intelligence
Enter Interests (semicolon-separated): Machile Learning
Enter Skills (semicolon-separated): Python; Java; Machine Learning; c; c++
Top job recommendations:
Data Scientist: 68.67% match
Business Analyst: 16.00% match
Software Engineer: 6.67% match
Financial Analyst: 3.33% match
Graphic Designer: 2.67% match
Missing skills for Data Scientist:
- python: https://www.coursera.org/learn/python
- r: https://www.coursera.org/learn/r-programming
- machine learning: https://www.coursera.org/learn/machine-learning
- data visualization: https://www.coursera.org/learn/python-for-data-visualization
- statistics: https://www.coursera.org/learn/probability-statistics
- sql: https://www.coursera.org/learn/learn-sql-basics-data-science
- pandas: https://www.coursera.org/learn/data-analysis-with-python
- numpy: https://www.coursera.org/learn/data-analysis-with-python
Missing skills fo

