In [53]:
#importing needed libraries
import pandas as pd
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import MultiLabelBinarizer

In [54]:
# Sample data: one (role, comma-separated skills) record per job role.
role_skill_records = [
    ("Data Scientist", "Python, Statistics, Machine Learning, Data Visualization"),
    ("ML Engineer", "Python, Machine Learning, Deployment, Algorithms"),
    ("Data Analyst", "SQL, Python, Data Visualization, Excel"),
    ("Data Engineer", "Python, SQL, ETL, Cloud Computing"),
    ("AI Researcher", "Python, Deep Learning, Machine Learning, Algorithms"),
    ("Business Analyst", "Excel, SQL, Data Visualization, Business Intelligence"),
    ("NLP Engineer", "Python, NLP, Machine Learning, Deep Learning"),
]

# Two string columns: "Role" (job title) and "Skills" (comma-separated list).
data = pd.DataFrame(role_skill_records, columns=["Role", "Skills"])

In [55]:
data

Unnamed: 0,Role,Skills
0,Data Scientist,"Python, Statistics, Machine Learning, Data Vis..."
1,ML Engineer,"Python, Machine Learning, Deployment, Algorithms"
2,Data Analyst,"SQL, Python, Data Visualization, Excel"
3,Data Engineer,"Python, SQL, ETL, Cloud Computing"
4,AI Researcher,"Python, Deep Learning, Machine Learning, Algor..."
5,Business Analyst,"Excel, SQL, Data Visualization, Business Intel..."
6,NLP Engineer,"Python, NLP, Machine Learning, Deep Learning"


In [56]:
#converting to dictionary for faster lookup
# Maps each role name to the set of its skills. Splitting on "," and stripping
# each token tolerates missing or extra spaces around the separators; empty
# tokens (e.g. from a trailing comma) are dropped. zip() walks the two columns
# directly, which avoids building a Series per row as iterrows() does.
skills_data = {
    role: {skill.strip() for skill in skills.split(',') if skill.strip()}
    for role, skills in zip(data['Role'], data['Skills'])
}

In [57]:
skills_data

{'Data Scientist': {'Data Visualization',
  'Machine Learning',
  'Python',
  'Statistics'},
 'ML Engineer': {'Algorithms', 'Deployment', 'Machine Learning', 'Python'},
 'Data Analyst': {'Data Visualization', 'Excel', 'Python', 'SQL'},
 'Data Engineer': {'Cloud Computing', 'ETL', 'Python', 'SQL'},
 'AI Researcher': {'Algorithms',
  'Deep Learning',
  'Machine Learning',
  'Python'},
 'Business Analyst': {'Business Intelligence',
  'Data Visualization',
  'Excel',
  'SQL'},
 'NLP Engineer': {'Deep Learning', 'Machine Learning', 'NLP', 'Python'}}

<h2>Enter your role in the cell below</h2>

In [58]:
#taking input role
input_role = input("Enter Your Role: ")

#converting input to title case (the helper keeps its historical name)
def to_camel_case(text):
    """Return `text` with each whitespace-separated word capitalized.

    Runs of whitespace collapse to single spaces. Note that ``str.capitalize``
    also lowercases the rest of each word, so acronyms are not preserved
    (e.g. "ml engineer" -> "Ml Engineer").
    """
    return ' '.join(word.capitalize() for word in text.split())

# Resolve the typed text to the dataset's own spelling of the role, ignoring
# case and extra whitespace. Title-casing alone would turn "ML Engineer" into
# "Ml Engineer", which is not a key of skills_data. Unknown roles fall back to
# the title-cased text so the "not found" message stays readable.
role_by_key = {' '.join(role.split()).casefold(): role for role in skills_data}
typed_key = ' '.join(input_role.split()).casefold()
input_role = role_by_key.get(typed_key, to_camel_case(input_role))

Enter Rour Role:  Data Scientist


In [59]:
# Number of most-similar roles to recommend.
TOP_N = 3

if input_role not in skills_data:
    print(f"Role '{input_role}' not found. Please enter a valid role from the dataset.")
else:
    all_roles = list(skills_data.keys())

    # Vocabulary of every skill that appears for any role. Sorted so the column
    # order of the binary matrix is identical on every run; iterating a plain
    # set of strings can yield a different order in each kernel session.
    all_skills = sorted(set().union(*skills_data.values()))

    # Binary matrix: one row per role (same order as all_roles), one 0/1
    # column per skill in all_skills.
    mlb = MultiLabelBinarizer(classes=all_skills)
    encoded_skills = mlb.fit_transform(skills_data.values())

    # Row of the role the user asked about.
    input_index = all_roles.index(input_role)
    input_vector = encoded_skills[input_index]

    # Jaccard similarity between the input role and every other role.
    similarities = [
        (role, jaccard_score(input_vector, role_vector))
        for role, role_vector in zip(all_roles, encoded_skills)
        if role != input_role
    ]

    # Highest similarity first; sorted() is stable, so tied roles keep their
    # dataset order.
    ranked_roles = sorted(similarities, key=lambda pair: pair[1], reverse=True)

    # Keep only the names of the TOP_N best matches.
    top_roles = [role for role, _ in ranked_roles[:TOP_N]]

    print(f"Recommended roles for {input_role}: {top_roles}")

Recommended roles for Data Scientist: ['ML Engineer', 'Data Analyst', 'AI Researcher']
