# Linking the dataset

In [38]:
from google.colab import drive
import pandas as pd

# Mount Google Drive
drive.mount('/content/drive')

# Define the path to the CSV file
file_path = '/content/drive/MyDrive/CleanDatasets/merged_data.csv'

# Read the CSV file into a pandas DataFrame
try:
    df = pd.read_csv(file_path)
    print("DataFrame loaded successfully:")
    display(df.head())
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
DataFrame loaded successfully:


Unnamed: 0,source,Job Title,Job Description,Company Name,Location,Industry,Sector,Rating
0,business_analyst,Business Analyst - Clinical & Logistics Platform,Company Overview At Memorial Sloan Kettering (...,Memorial Sloan-Kettering,"New York, NY",Health Care Services & Hospitals,Health Care,3.9
1,business_analyst,Business Analyst,We are seeking for an energetic and collaborat...,Paine Schwartz Partners,"New York, NY",Venture Capital & Private Equity,Finance,3.8
2,business_analyst,Data Analyst,"For more than a decade, Asembia has been worki...",Asembia,"Florham Park, NJ",Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,3.6
3,business_analyst,"Information Security Analyst, Incident Response",Job Description Summary The Information Securi...,BD,"Franklin Lakes, NJ",Health Care Products Manufacturing,Manufacturing,3.6
4,business_analyst,Analyst - FP&A Global Revenue,Magnite is the world's largest independent sel...,Rubicon Project,"New York, NY",Internet,Information Technology,3.4


# Skills dictionary

In [39]:
ROLE_SKILLS = {
    "business_analyst": {
        "hard": [
            "excel", "sql", "tableau", "power bi", "sas", "r", "statistics",
            "financial modeling", "data visualization", "reporting"
        ],
        "domain": [
            "finance", "banking", "insurance", "healthcare", "supply chain",
            "marketing", "operations"
        ],
        "soft": [
            "communication", "stakeholder management", "problem solving",
            "analytical thinking", "documentation"
        ]
    },
    "data_analyst": {
        "hard": [
            "sql", "python", "pandas", "numpy", "excel", "tableau", "power bi",
            "looker", "r", "statistics", "data cleaning", "etl"
        ],
        "domain": [
            "retail", "e-commerce", "marketing", "healthcare", "operations"
        ],
        "soft": [
            "attention to detail", "visualization storytelling",
            "collaboration", "critical thinking"
        ]
    },
    "data_engineer": {
        "hard": [
            "python", "sql", "java", "scala", "spark", "hadoop", "kafka",
            "airflow", "dbt", "snowflake", "redshift", "bigquery",
            "aws", "azure", "gcp", "docker", "kubernetes", "etl",
            "data warehouse", "pipelines"
        ],
        "domain": [
            "cloud", "streaming", "data pipelines", "big data",
            "healthcare", "finance"
        ],
        "soft": [
            "problem solving", "teamwork", "ownership", "documentation"
        ]
    },
    "data_scientist": {
        "hard": [
            "python", "r", "sql", "machine learning", "deep learning",
            "nlp", "tensorflow", "pytorch", "scikit-learn",
            "statistics", "probability", "optimization", "time series",
            "computer vision"
        ],
        "domain": [
            "ai", "healthcare", "finance", "retail", "predictive analytics",
            "genai"
        ],
        "soft": [
            "critical thinking", "research mindset", "communication",
            "problem solving"
        ]
    }
}



# Extract Skills from Job Descriptions

In [40]:
import re

def extract_skills(job_desc, role):
    job_desc_lower = job_desc.lower()
    found_skills = {
        "hard": [],
        "domain": [],
        "soft": []
    }

    for category, skills in ROLE_SKILLS[role].items():
        for skill in skills:
            # simple substring search (you can improve with regex / NLP later)
            if re.search(r"\b" + re.escape(skill.lower()) + r"\b", job_desc_lower):
                found_skills[category].append(skill)

    return found_skills


In [41]:
row = df.iloc[0]  # first job posting
skills = extract_skills(row["Job Description"], row["source"])
print(skills)


{'hard': [], 'domain': ['insurance', 'healthcare'], 'soft': ['communication', 'documentation']}


# Add Extracted Skills into Dataset

In [42]:
df["Extracted Skills"] = df.apply(
    lambda row: extract_skills(row["Job Description"], row["source"]), axis=1
)


# Recommendation Part

## Sample Candidate Profile

In [43]:
candidates = [
    {
        "name": "Suga",
        "skills": ["excel", "sql", "tableau"],
        "domain": ["finance", "banking"],
        "soft": ["communication", "stakeholder management"],
        "location": "New York, NY",
        "role_interest": ["business analyst","data analyst"]
    },
    {
        "name": "Priya",
        "skills": ["python", "sql", "pandas", "numpy"],
        "domain": ["healthcare"],
        "soft": ["attention to detail", "critical thinking"],
        "location": "San Francisco, CA",
        "role_interest": ["data analyst", "data scientist"]
    },
    {
        "name": "Loga",
        "skills": ["python", "java", "spark", "hadoop", "airflow", "aws"],
        "domain": ["big data", "cloud"],
        "soft": ["teamwork", "problem solving"],
        "location": "Chicago, IL",
        "role_interest": ["data engineer"]
    },
    {
        "name": "Dhanusha",
        "skills": ["python", "r", "sql", "machine learning", "tensorflow"],
        "domain": ["ai", "healthcare"],
        "soft": ["critical thinking", "research mindset"],
        "location": "Boston, MA",
        "role_interest": ["data scientist"]
    },
    {
        "name": "Ashish",
        "skills": ["excel", "power bi", "sql"],
        "domain": ["marketing", "operations"],
        "soft": ["analytical thinking", "collaboration"],
        "location": "Remote",
        "role_interest": ["business analyst", "data analyst"]
    }
]



## Define a Scoring Logic



You need to assign weights (because hard skills are usually more important than soft skills).

Example weighting system:

Hard skill match = +3 points

Domain match = +2 points

Soft skill match = +1 point

Location match = +2 points (exact city), +1 (same state/country), +0.5 (remote)

Role interest match = +3 points

Bonus: Company rating ≥4.0 → +1 point

## Matching Function

In [44]:
def calculate_match_score(job_row, candidate, extracted_skills):
    score = 0

    # Skills overlap
    hard_overlap = set(candidate["skills"]) & set(extracted_skills["hard"])
    domain_overlap = set(candidate["domain"]) & set(extracted_skills["domain"])
    soft_overlap = set(candidate["soft"]) & set(extracted_skills["soft"])

    score += 3 * len(hard_overlap)
    score += 2 * len(domain_overlap)
    score += 1 * len(soft_overlap)

    # Location match
    if candidate["location"].lower() in str(job_row["Location"]).lower():
        score += 2
    elif "remote" in str(job_row["Location"]).lower():
        score += 0.5

    # Role interest match
    for role in candidate["role_interest"]:
        if role.lower() in job_row["Job Title"].lower():
            score += 3

    # Company rating bonus
    try:
        if float(job_row["Rating"]) >= 4.0:
            score += 1
    except:
        pass

    return score


## Apply to Your Dataset

In [45]:


# Then calculate score for each job vs candidate
df["Match Score"] = df.apply(
    lambda row: calculate_match_score(row, candidate, row["Extracted Skills"]), axis=1
)

# Get top 5 recommendations
top_matches = df.sort_values("Match Score", ascending=False).head(10)
top_matches[["Job Title", "Company Name", "Location", "Match Score"]]


Unnamed: 0,Job Title,Company Name,Location,Match Score
5533,"Healthcare Data Analyst, Qsight",Guidepoint,"New York, NY",19.0
5899,Data Analyst,Signal Sciences,"Los Angeles, CA",18.0
8217,Healthcare Data Analyst,Cricket Health,"Cambridge, MA",18.0
6942,Data Analyst,CyberCoders,"Columbus, OH",18.0
7923,Data Analyst,Gravy Analytics,"Sterling, VA",18.0
6747,Data Analyst,CyberCoders,"Redwood City, CA",18.0
7771,Data Analyst,CyberCoders,"Seattle, WA",18.0
6359,Data Analyst,University of Pennsylvania,"Philadelphia, PA",18.0
9151,Data Analyst,Signal Sciences,"Los Angeles, CA",18.0
7791,Senior Data Analyst,Adaptive Biotechnologies,"Seattle, WA",18.0


In [46]:
results = {}

for cand in candidates:
    df[f"{cand['name']}_score"] = df.apply(
        lambda row: calculate_match_score(row, cand, row["Extracted Skills"]), axis=1
    )
    results[cand["name"]] = df[["Job Title", "Company Name", "Location", f"{cand['name']}_score"]].sort_values(
        f"{cand['name']}_score", ascending=False
    ).head(5)  # top 3 for each candidate


# Example: view Aarav’s top matches
results["Suga"]



Unnamed: 0,Job Title,Company Name,Location,Suga_score
69,Business Analyst,Delv Global,"New York, NY",17.0
1564,Marketing Business Analyst,Freedom Mortgage,"Marlton, NJ",16.0
1341,Senior Data Analyst,Discount Tire,"Scottsdale, AZ",16.0
1927,Senior Data Analyst,Scaleworks,"San Antonio, TX",16.0
2080,Financial Business Analyst,Axos Bank,"San Diego, CA",16.0
