Extract realistic skills, education, countries of practice, and years of experience from real physician biography data.

In [1]:
import pandas as pd
import re
import spacy
from spacy.matcher import PhraseMatcher


In [2]:
# Load spaCy's small English pipeline. Its tokenizer builds the phrase-matcher
# patterns and its named-entity recognizer supplies the GPE entities.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Load the physician biographies; the free-text bio lives in `hard_text`.
df_physician = pd.read_csv("Physician_balanced_200.csv")
df_physician.head()

Unnamed: 0,hard_text,profession,gender
0,"In addition to English, he speaks Spanish. Dr....",physician,Male
1,"In his practice, he is particularly interested...",physician,Male
2,"Prior to this, he was a Consultant Anesthetist...",physician,Male
3,He has received a 4.0 out of 5 star rating by ...,physician,Male
4,He studied medicine at Philadelphia College of...,physician,Male


In [4]:
# Keyword vocabularies searched for in every biography.

# Clinical specialties and day-to-day practice skills.
skills = [
    'Cardiology',
    'Pediatrics',
    'Internal Medicine',
    'Family Medicine',
    'Dermatology',
    'Neurology',
    'Oncology',
    'Emergency Medicine',
    'Radiology',
    'Endocrinology',
    'Surgery (minor procedures)',
    'Patient Diagnosis',
    'Clinical Research',
    'Electronic Medical Records (EMR)',
    'Telemedicine',
    'Prescribing Medications',
    'Lab Test Interpretation',
]

# Degree abbreviations.
education = 'MD MBBS DO PhD BSc MSc'.split()

In [5]:
def extract_experience(text):
    """
    Extract the first "<number> year(s)" mention from a text string.

    Parameters
    ----------
    text : str
        Free-text biography. Any non-string value (``None``, or the float
        ``NaN`` pandas uses for an empty CSV cell) is treated as "no match"
        instead of raising ``AttributeError``.

    Returns
    -------
    int or None
        The number preceding the first "year"/"years" (case-insensitive),
        with an optional "+" between them (e.g. "10+ years" -> 10), or
        ``None`` when the text contains no such phrase.
    """
    if not isinstance(text, str):
        return None
    # IGNORECASE replaces lower-casing the text; `\+?` also accepts the
    # common "10+ years" phrasing.
    match = re.search(r'(\d+)\s*\+?\s+years?', text, flags=re.IGNORECASE)
    return int(match.group(1)) if match else None

In [6]:

# Create phrase matchers.
# Skills are compared on the LOWER token attribute so that "cardiology" and
# "Cardiology" in a biography both hit the 'Cardiology' pattern; the default
# attribute (ORTH) only matches the exact casing used in the keyword list.
# Matched spans keep the casing found in the biography text.
skill_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# Degree abbreviations stay case-sensitive: matching on lower-cased tokens
# would make 'DO' hit the ordinary verb "do".
edu_matcher = PhraseMatcher(nlp.vocab)

# make_doc only tokenizes, which is all the matcher needs for its patterns.
skill_patterns = [nlp.make_doc(skill) for skill in skills]
edu_patterns = [nlp.make_doc(edu) for edu in education]

skill_matcher.add("SKILL", skill_patterns)
edu_matcher.add("EDU", edu_patterns)


In [7]:
# ---------- Extraction loop ---------
all_skills = []
all_countries = []
all_experience = []
all_education = []

# An empty CSV cell is read by pandas as NaN (a float), which spaCy cannot
# process; treat missing biographies as empty text instead.
bios = df_physician['hard_text'].fillna("").astype(str)

# nlp.pipe processes the texts in batches and yields one Doc per bio, in order.
for bio, doc in zip(bios, nlp.pipe(bios)):
    # countries: spaCy's GPE label covers countries, cities and states alike
    countries = [ent.text for ent in doc.ents if ent.label_ == "GPE"]

    # skills / education: text of every phrase-matcher hit
    skills_found = [doc[start:end].text for _, start, end in skill_matcher(doc)]
    edu_found = [doc[start:end].text for _, start, end in edu_matcher(doc)]

    # experience
    experience = extract_experience(bio)

    # sorted(set(...)) de-duplicates with a stable order; list(set(...)) would
    # order strings differently from run to run, making the CSV irreproducible.
    all_skills.append(sorted(set(skills_found)))
    all_education.append(sorted(set(edu_found)))
    all_countries.append(sorted(set(countries)))
    all_experience.append(experience)

# ---------- Add to dataframe ----------
df_physician['skills_extracted'] = all_skills
df_physician['education_extracted'] = all_education
df_physician['countries_extracted'] = all_countries
df_physician['experience_years'] = all_experience

# ---------- Save enhanced dataset ----------
df_physician.to_csv("physicians_extracted.csv", index=False)
print("✅ Extraction complete. Saved as physicians_extracted.csv")

✅ Extraction complete. Saved as physicians_extracted.csv


In [9]:
# Reload the saved CSV. The list-valued columns come back as their string
# representation (e.g. "['Annapolis', 'Maryland']"), not as Python lists.
df = pd.read_csv("physicians_extracted.csv")
df

Unnamed: 0,hard_text,profession,gender,skills_extracted,education_extracted,countries_extracted,experience_years
0,"In addition to English, he speaks Spanish. Dr....",physician,Male,[],[],[],
1,"In his practice, he is particularly interested...",physician,Male,[],[],[],
2,"Prior to this, he was a Consultant Anesthetist...",physician,Male,[],[],"['Anesthesia', 'Obstetric', 'UK']",
3,He has received a 4.0 out of 5 star rating by ...,physician,Male,[],[],[],
4,He studied medicine at Philadelphia College of...,physician,Male,[],[],[],
...,...,...,...,...,...,...,...
195,"She practices in Annapolis, Maryland and has t...",physician,Female,[],[],"['Annapolis', 'Maryland']",
196,"Ms. Boone practices medicine in Mesa, AZ and 1...",physician,Female,[],[],[],
197,"Ms. Losado practices medicine in McAllen, TX a...",physician,Female,[],[],['McAllen'],
198,"Ms. Weisenborn practices medicine in Buffalo, ...",physician,Female,[],[],['Buffalo'],


In [10]:
# Collect every distinct GPE string found across all biographies
all_unique_countries = {
    name
    for names in df_physician['countries_extracted']  # one list per biography
    for name in names
}

print("All unique countries:\n", all_unique_countries)
print("Total:", len(all_unique_countries))


All unique countries:
 {'San Antonio', 'Hamlet', 'San Diego', 'Dickinson', 'Gulfport', 'New York-Presbyterian Hospital', 'NIAID', 'Bedok', 'Tempe', 'Salt Lake City', 'Trenton', 'Covina', 'Neck', 'Texas', 'Dzogchen', 'Edina', 'Biology', 'Pottstown', 'Jamnagar', 'Newark', 'VA', 'KY', 'New York-Presbyterian Hospital, Memorial Sloan-Kettering Cancer Center', 'Fort Mitchell', 'Greensboro', 'Purcellville', 'Pittsburgh', 'Buffalo', 'Pittsburg', 'Anesthesia', 'Naperville', 'Alexandria', 'Dubuque', 'NC', 'Birmingham', 'Vietnam', 'Richmond', 'MD', 'New Jersey', 'Taichung', 'St. Joseph‚Äôs Hospital', "St. Joseph's", 'Fresno', 'Asheville', 'Marysville', 'Minnesota', 'Santa Cruz', 'Bopal', 'Dublin', 'Rheumatology', 'Alaska', 'Syracuse', 'Louisville', 'Fairfield County', 'Nephrology', 'Redwood City', 'Massachusetts', 'Cincinnati', 'Los Angeles', 'Surgery', 'Dickson City', 'New Orleans', 'Saudi Arabia', 'Aberdeen', 'Hindi', 'Phoenix', 'Mansfield', 'Pediatrics', 'Colorado Springs', 'Obstetric', 'OH', 

In [11]:
from collections import Counter

# Tally how many biographies mention each extracted place name
country_counter = Counter(
    name
    for names in df_physician['countries_extracted']
    for name in names
)

print(country_counter.most_common())


[('NC', 9), ('MD', 8), ('Texas', 5), ('OH', 5), ('Arizona', 4), ('Charlotte', 4), ('IA', 4), ('New York', 3), ('California', 2), ('San Francisco', 2), ('Los Angeles', 2), ('Idaho', 2), ('Seattle', 2), ('Pocatello', 2), ('Tempe', 2), ('Hindi', 2), ('India', 2), ('New Jersey', 2), ('Syracuse', 2), ('Colorado', 2), ('Massachusetts', 2), ('New York-Presbyterian Hospital', 2), ('Oklahoma City', 2), ('Minnesota', 2), ('Edina', 2), ('Newark', 2), ('Pediatrics', 2), ('Illinois', 2), ('Maryland', 2), ('Colorado Springs', 2), ('Scottsdale', 2), ('Tagalog', 2), ('Anesthesia', 1), ('Obstetric', 1), ('UK', 1), ('San Diego', 1), ('Urdu', 1), ('Pune', 1), ('Missouri', 1), ('Chesterfield', 1), ('Missouri City', 1), ('Iowa', 1), ('St. Joseph‚Äôs Hospital', 1), ('Washington', 1), ('Phoenix', 1), ('Columbia', 1), ('Anchorage', 1), ('Alaska', 1), ('Munster', 1), ('Indiana', 1), ('Vietnam', 1), ('Neck', 1), ('St. Louis', 1), ('Baltimore', 1), ('Romeo', 1), ('Hamlet', 1), ('Bach', 1), ('Telemann', 1), ('Jul

Generation of Job Ad for Physician

In [None]:

from collections import Counter         # a special Python class from the collections module that makes it extremely easy to count the frequency of items in a list.
import random


def _parse_list_cell(cell):
    """Return one list-valued cell as a real Python list.

    Accepts an actual list/tuple/set (in-memory frame), a missing value
    (``None``/``NaN`` -> empty list), or the string form a list takes after a
    CSV round-trip. Strings are parsed with ``ast.literal_eval``, which only
    accepts Python literals, so cell contents can never execute code.

    Raises ``ValueError`` (or ``SyntaxError`` for unparsable text) when the
    cell is not a list literal.
    """
    import ast  # local import keeps this block self-contained

    if isinstance(cell, (list, tuple, set)):
        return list(cell)
    if cell is None or (isinstance(cell, float) and pd.isna(cell)):
        return []
    parsed = ast.literal_eval(cell)
    if not isinstance(parsed, (list, tuple, set)):
        raise ValueError(f"expected a list literal, got: {cell!r}")
    return list(parsed)


def aggregate_keywords(df, n_skills=5, n_education=2, n_countries=1,
                       default_experience=5):
    """Summarise the extracted columns of the physician frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``skills_extracted``, ``education_extracted``,
        ``countries_extracted`` (lists, or their string form as read back
        from CSV) and ``experience_years`` (number or missing).
    n_skills, n_education, n_countries : int
        How many of the most frequent items to return for each category.
    default_experience : int
        Value returned as the average when no row has an experience figure.

    Returns
    -------
    tuple
        ``(top_skills, top_edu, top_countries, avg_exp)``: three lists of the
        most frequent items (most frequent first) and the mean experience
        truncated to an int.
    """
    all_skills = Counter()
    all_education = Counter()
    all_countries = Counter()

    # Counter.update(iterable) adds one count per element.
    for cell in df['skills_extracted']:
        all_skills.update(_parse_list_cell(cell))
    for cell in df['education_extracted']:
        all_education.update(_parse_list_cell(cell))
    for cell in df['countries_extracted']:
        all_countries.update(_parse_list_cell(cell))

    # Rows without an extracted experience figure are skipped.
    experience_list = [
        int(years) for years in df['experience_years'] if not pd.isna(years)
    ]

    top_skills = [s for s, _ in all_skills.most_common(n_skills)]
    top_edu = [e for e, _ in all_education.most_common(n_education)]
    top_countries = [c for c, _ in all_countries.most_common(n_countries)]
    if experience_list:
        avg_exp = int(sum(experience_list) / len(experience_list))
    else:
        avg_exp = default_experience

    return top_skills, top_edu, top_countries, avg_exp

# NOTE(review): `skills` is rebound here, replacing the keyword list that the
# phrase-matcher patterns were built from; re-running the matcher cell after
# this one would build patterns from these top skills only.
skills, edu, countries, exp = aggregate_keywords(df)
print("Skills:", skills)
print("Education:", edu)
print("Country:", countries)
print("Experience:", exp)


Skills: ['Family Medicine', 'Internal Medicine', 'Pediatrics', 'Dermatology', 'Oncology']
Education: ['MD', 'MBBS']
Country: ['NC']
Experience: 16


In [None]:
def generate_job_ad(skills, education, country, experience):
    """Render the physician job advertisement as plain text.

    Parameters
    ----------
    skills : list of str
        Listed under "Preferred Skills"; the section is omitted when empty.
    education : list of str
        Degree abbreviations shown in parentheses after "Medical degree";
        the parentheses are omitted when empty.
    country : str
        Text shown on the "Location" line.
    experience : int
        Number of years shown as "<experience>+ years".

    Returns
    -------
    str
        The complete advertisement, starting and ending with a newline.
    """
    # Avoid rendering "Medical degree ()" when no degrees were extracted.
    degree_line = "- Medical degree"
    if education:
        degree_line += f" ({', '.join(education)})"

    # Avoid an empty "- " bullet when no skills were extracted.
    preferred_skills = ""
    if skills:
        preferred_skills = f"Preferred Skills\n- {', '.join(skills)}\n\n"

    job_ad = f"""
🏥 Job Title: Physician — General Medicine
📍 Location: {country}
🕒 Employment Type: Full-time

About Us
Our hospital is dedicated to providing compassionate and patient-centered medical care. We believe in supporting our staff and creating an environment built on trust, respect, and collaboration.

Position Overview
We are seeking a committed Physician with approximately {experience}+ years of clinical experience. The ideal candidate will bring strong diagnostic abilities, excellent communication skills, and a genuine passion for patient care.

Key Responsibilities
- Conduct patient examinations, evaluations, and treatment planning
- Manage acute and chronic medical conditions
- Collaborate with fellow healthcare professionals and hospital staff
- Maintain accurate and timely EMR/EHR documentation
- Educate patients and families on treatment plans and preventive care

Required Qualifications
{degree_line}
- Board-eligible or board-certified
- Active state medical license (or eligibility to obtain)
- Strong clinical and interpersonal skills

{preferred_skills}What We Offer
- Supportive work environment
- Competitive salary and benefits
- Professional development and CME opportunities
- A mission-focused hospital culture that values teamwork and compassion

How to Apply
Interested applicants may submit their resume and a brief cover letter. We look forward to welcoming a new member to our care team.
"""
    return job_ad

# `countries` is an empty list when no place name was extracted from any
# biography; use a placeholder rather than failing on countries[0].
location = countries[0] if countries else "Not specified"
final_job_ad = generate_job_ad(skills, edu, location, exp)
print(final_job_ad)

output_filename = "physician_job_ad.txt"

# Explicit UTF-8 so the emoji in the ad are written correctly on any platform.
with open(output_filename, "w", encoding="utf-8") as f:
    f.write(final_job_ad)


🏥 Job Title: Physician — General Medicine
📍 Location: NC
🕒 Employment Type: Full-time

About Us
Our hospital is dedicated to providing compassionate and patient-centered medical care. We believe in supporting our staff and creating an environment built on trust, respect, and collaboration.

Position Overview
We are seeking a committed Physician with approximately 16+ years of clinical experience. The ideal candidate will bring strong diagnostic abilities, excellent communication skills, and a genuine passion for patient care.

Key Responsibilities
- Conduct patient examinations, evaluations, and treatment planning
- Manage acute and chronic medical conditions
- Collaborate with fellow healthcare professionals and hospital staff
- Maintain accurate and timely EMR/EHR documentation
- Educate patients and families on treatment plans and preventive care

Required Qualifications
- Medical degree (MD, MBBS)
- Board-eligible or board-certified
- Active state medical license (or el