In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the three input tables in one pass; the tuple order matches the
# unpacking order on the left-hand side.
employee_master, employee_experience, client_projects = map(
    pd.read_csv,
    (
        "data/employee_master.csv",
        "data/employee_experience.csv",
        "data/client_projects.csv",
    ),
)

In [3]:
# Preview the first five rows of the employee master table.
employee_master.head(5)

Unnamed: 0,Employee_ID,Name,Department,Years_Experience,Current_Project_ID,Location,Skills
0,E101,Linda Johnson,DevOps,10.7,,Delhi,"AWS, Real-time Data Processing, Serverless Arc..."
1,E102,Michael Stevens DDS,Full Stack Dev,11.7,,Chennai,"Data Governance, Scrum Methodology, Blockchain"
2,E103,Cody Lee,Cloud Engineering,10.3,,Mumbai,"Data Quality Management, Big Data, Data Govern..."
3,E104,Tammy Downs,AI Research,10.9,,Mumbai,"NoSQL Database Management, .Net, GraphQL"
4,E105,Richard Wilson,Cloud Engineering,7.4,,Chennai,"Data Mesh Architecture, Robotic Process Automa..."


In [4]:
# Preview the first five rows of the free-text experience table.
employee_experience.head(5)

Unnamed: 0,Employee_ID,Experience_Text
0,E101,"Worked on projects involving Microservices, Qu..."
1,E102,Worked on projects involving Data Privacy Comp...
2,E103,"Worked on projects involving Scrum, Data Wareh..."
3,E104,"Worked on projects involving CI/CD, Edge Compu..."
4,E105,Worked on projects involving Penetration Testi...


In [5]:
# NOTE(review): this repeats the employee_master preview from an earlier cell,
# while client_projects is never previewed — possibly that frame was intended
# here; confirm before changing.
employee_master.head(5)

Unnamed: 0,Employee_ID,Name,Department,Years_Experience,Current_Project_ID,Location,Skills
0,E101,Linda Johnson,DevOps,10.7,,Delhi,"AWS, Real-time Data Processing, Serverless Arc..."
1,E102,Michael Stevens DDS,Full Stack Dev,11.7,,Chennai,"Data Governance, Scrum Methodology, Blockchain"
2,E103,Cody Lee,Cloud Engineering,10.3,,Mumbai,"Data Quality Management, Big Data, Data Govern..."
3,E104,Tammy Downs,AI Research,10.9,,Mumbai,"NoSQL Database Management, .Net, GraphQL"
4,E105,Richard Wilson,Cloud Engineering,7.4,,Chennai,"Data Mesh Architecture, Robotic Process Automa..."


In [6]:
# Left-join the experience text onto the master table so every employee in
# employee_master is kept, even when no experience row exists for them.
employees = employee_master.merge(
    employee_experience,
    how="left",
    on="Employee_ID",
)

employees.head()

Unnamed: 0,Employee_ID,Name,Department,Years_Experience,Current_Project_ID,Location,Skills,Experience_Text
0,E101,Linda Johnson,DevOps,10.7,,Delhi,"AWS, Real-time Data Processing, Serverless Arc...","Worked on projects involving Microservices, Qu..."
1,E102,Michael Stevens DDS,Full Stack Dev,11.7,,Chennai,"Data Governance, Scrum Methodology, Blockchain",Worked on projects involving Data Privacy Comp...
2,E103,Cody Lee,Cloud Engineering,10.3,,Mumbai,"Data Quality Management, Big Data, Data Govern...","Worked on projects involving Scrum, Data Wareh..."
3,E104,Tammy Downs,AI Research,10.9,,Mumbai,"NoSQL Database Management, .Net, GraphQL","Worked on projects involving CI/CD, Edge Compu..."
4,E105,Richard Wilson,Cloud Engineering,7.4,,Chennai,"Data Mesh Architecture, Robotic Process Automa...",Worked on projects involving Penetration Testi...


In [7]:
def preprocess_text(text):
    """Normalize a raw text value for vectorization.

    Missing values (as detected by ``pd.isna``) become the empty string.
    Anything else is converted to ``str``, lower-cased, and has every comma
    replaced by a single space.
    """
    if not pd.isna(text):
        lowered = str(text).lower()
        # Map each comma to one space; no other character is altered.
        return lowered.translate({ord(","): " "})
    return ""

In [8]:
# Build one free-text "document" per employee and per project; these are the
# inputs to the TF-IDF vectorizer.
SEP = " "

department_part = employees["Department"].fillna("")
skills_part = employees["Skills"].fillna("")
experience_part = employees["Experience_Text"].fillna("")
# NOTE(review): the numeric years value is stringified into the text, so it
# reaches the vectorizer as a token (a missing value becomes "nan") — confirm
# this is intended, since experience is also weighted separately in scoring.
years_part = employees["Years_Experience"].astype(str)

employee_documents = (
    department_part + SEP + skills_part + SEP + experience_part + SEP + years_part
)
employees["profile_text"] = employee_documents.map(preprocess_text)

description_part = client_projects["Project_Description"].fillna("")
required_part = client_projects["Required_Skills"].fillna("")

project_documents = description_part + SEP + required_part
client_projects["project_text"] = project_documents.map(preprocess_text)

In [9]:
# Preview the merged table, now including the derived profile_text column.
employees.head(5)

Unnamed: 0,Employee_ID,Name,Department,Years_Experience,Current_Project_ID,Location,Skills,Experience_Text,profile_text
0,E101,Linda Johnson,DevOps,10.7,,Delhi,"AWS, Real-time Data Processing, Serverless Arc...","Worked on projects involving Microservices, Qu...",devops aws real-time data processing serverl...
1,E102,Michael Stevens DDS,Full Stack Dev,11.7,,Chennai,"Data Governance, Scrum Methodology, Blockchain",Worked on projects involving Data Privacy Comp...,full stack dev data governance scrum methodol...
2,E103,Cody Lee,Cloud Engineering,10.3,,Mumbai,"Data Quality Management, Big Data, Data Govern...","Worked on projects involving Scrum, Data Wareh...",cloud engineering data quality management big...
3,E104,Tammy Downs,AI Research,10.9,,Mumbai,"NoSQL Database Management, .Net, GraphQL","Worked on projects involving CI/CD, Edge Compu...",ai research nosql database management .net g...
4,E105,Richard Wilson,Cloud Engineering,7.4,,Chennai,"Data Mesh Architecture, Robotic Process Automa...",Worked on projects involving Penetration Testi...,cloud engineering data mesh architecture robo...


In [10]:
# Fit a single TF-IDF vocabulary over employee and project documents together
# so both sets of vectors live in the same feature space.
n_employees = len(employees)
corpus = [*employees["profile_text"], *client_projects["project_text"]]

vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(corpus)

# Employees were placed first in the corpus, projects after them.
employee_tfidf, project_tfidf = tfidf_matrix[:n_employees], tfidf_matrix[n_employees:]

In [11]:
# Pairwise cosine similarity with projects as rows and employees as columns.
similarity_matrix = cosine_similarity(X=project_tfidf, Y=employee_tfidf)

In [12]:
def compute_weighted_score(skill_score, emp_row, project_row,
                           w_skill=0.7, w_exp=0.2, w_location=0.1,
                           max_years=15.0):
    """Combine skill similarity, experience and location into one score.

    Parameters
    ----------
    skill_score : float
        Text similarity between the project and the employee.
    emp_row : mapping or pandas.Series
        Must support ``emp_row["Years_Experience"]`` and ``emp_row["Location"]``.
    project_row : mapping or pandas.Series
        Must support ``project_row["Location"]``.
    w_skill, w_exp, w_location : float
        Weights applied to the skill, experience and location components.
    max_years : float
        Years of experience at which the experience component saturates at 1.0.

    Returns
    -------
    float
        The weighted sum of the three components, rounded to 3 decimals.

    Raises
    ------
    ValueError
        If ``max_years`` is not positive.
    """
    if max_years <= 0:
        raise ValueError("max_years must be positive")

    def _clean_location(value):
        # Missing values (None/NaN) normalize to "" so they can never match.
        return "" if pd.isna(value) else str(value).strip().lower()

    # Base skill similarity.
    score = float(skill_score) * w_skill

    # Years of experience, clamped to [0, 1]. A missing value counts as zero
    # experience instead of turning the whole score into NaN.
    raw_years = emp_row["Years_Experience"]
    years = 0.0 if pd.isna(raw_years) else float(raw_years)
    exp_score = min(max(years / max_years, 0.0), 1.0)
    score += exp_score * w_exp

    # Location match: 1 only when both sides are present and equal, ignoring
    # case and surrounding whitespace.
    emp_location = _clean_location(emp_row["Location"])
    project_location = _clean_location(project_row["Location"])
    loc_score = 1.0 if emp_location and emp_location == project_location else 0.0
    score += loc_score * w_location

    return round(score, 3)

In [13]:
top_n = 3
recommendations = {}

# Materialize each employee row once so the scoring loop does not repeat a
# DataFrame row lookup for every (project, employee) pair.
employee_rows = [row for _, row in employees.iterrows()]

# similarity_matrix is a positional array (one row per project in frame
# order), so index it with enumerate() rather than the iterrows() index label,
# which only equals the position when the frame has a default RangeIndex.
for proj_pos, (_, project) in enumerate(client_projects.iterrows()):
    ranked = [
        (emp_pos, compute_weighted_score(skill_score, employee_rows[emp_pos], project))
        for emp_pos, skill_score in enumerate(similarity_matrix[proj_pos])
    ]

    # Keep the top_n employees by final weighted score; sorted() is stable,
    # so ties stay in employee-table order.
    ranked_sorted = sorted(ranked, key=lambda pair: pair[1], reverse=True)[:top_n]

    recommended_employees = []
    for emp_pos, score in ranked_sorted:
        emp_row = employee_rows[emp_pos]
        recommended_employees.append({
            "employee_id": emp_row["Employee_ID"],
            "employee_name": emp_row["Name"],
            "department": emp_row["Department"],
            "skills": emp_row["Skills"],
            "location": emp_row["Location"],
            "final_score": score
        })

    # Note: the "project_name" key holds the project's Client_Name value.
    recommendations[project["Project_ID"]] = {
        "project_name": project["Client_Name"],
        "required_skills": project["Required_Skills"],
        "recommended_employees": recommended_employees
    }

In [14]:
# Print a plain-text report: a three-line header per project followed by one
# line per recommended employee.
for proj_id in recommendations:
    rec = recommendations[proj_id]
    header_lines = (
        f"\nProject ID: {proj_id} | Client: {rec['project_name']}",
        f"Required Skills: {rec['required_skills']}",
        "Recommended Employees:",
    )
    print(*header_lines, sep="\n")
    for emp in rec["recommended_employees"]:
        print(f"  - {emp['employee_name']} ({emp['department']}, {emp['location']}) "
              f"| Skills: {emp['skills']} --> Final Score: {emp['final_score']}")


Project ID: P301 | Client: Alvarado, Anderson and Brown
Required Skills: Data Privacy Compliance, AWS, NLP
Recommended Employees:
  - William Smith (AI Research, Chennai) | Skills: Data Privacy Compliance, AWS, Data Warehousing --> Final Score: 0.446
  - Shawn Jones (Cybersecurity, Delhi) | Skills: Data Privacy Compliance, Cloud Security, Docker --> Final Score: 0.38
  - Megan Davis (Cybersecurity, Delhi) | Skills: Microservices, Data Privacy Compliance, Quantum Computing --> Final Score: 0.373

Project ID: P302 | Client: Cruz-Stokes
Required Skills: Microservices Architecture, Data Engineering, CI/CD Pipelines
Recommended Employees:
  - Brian Terry (Cybersecurity, Chennai) | Skills: Data Mesh Architecture, Data Quality Management, CI/CD Pipelines --> Final Score: 0.442
  - Scott Adams (Cloud Engineering, Chennai) | Skills: CI/CD, Data Engineering, Data Warehousing --> Final Score: 0.439
  - Eugene Hayes (Cloud Engineering, Mumbai) | Skills: CI/CD Pipelines, AWS, Data Engineering --> 


=== Evaluation Metrics ===
Precision@3: 0
Recall@3: 0
