In [10]:
import pandas as pd
import numpy as np

# For reproducibility
np.random.seed(42)

# === 1. Load your main resume dataset ===
# Make sure this filename matches your actual CSV name/path
resumes = pd.read_csv("AI_Resume_Screening.csv")

# Columns you told me:
# Resume_ID, Name, Skills, Experience (Years), Education, Certifications,
# Job Role, Recruiter Decision, Salary Expectation ($), Projects Count, AI Score (0-100)

# === 2. Load your gender-by-name reference dataset ===
# Again, adjust filename if needed
gender_ref = pd.read_csv("gender_by_name.csv")

# Columns in this dataset:
# Name, Gender, Count, Probability

resumes.head()


Unnamed: 0,Resume_ID,Name,Skills,Experience (Years),Education,Certifications,Job Role,Recruiter Decision,Salary Expectation ($),Projects Count,AI Score (0-100)
0,1,Ashley Ali,"TensorFlow, NLP, Pytorch",10,B.Sc,,AI Researcher,Hire,104895,8,100
1,2,Wesley Roman,"Deep Learning, Machine Learning, Python, SQL",10,MBA,Google ML,Data Scientist,Hire,113002,1,100
2,3,Corey Sanchez,"Ethical Hacking, Cybersecurity, Linux",1,MBA,Deep Learning Specialization,Cybersecurity Analyst,Hire,71766,7,70
3,4,Elizabeth Carney,"Python, Pytorch, TensorFlow",7,B.Tech,AWS Certified,AI Researcher,Hire,46848,0,95
4,5,Julie Hill,"SQL, React, Java",4,PhD,,Software Engineer,Hire,87441,9,100


In [12]:
# --- Extract first and last names from full Name ---

# First name: first token
resumes["First_Name"] = resumes["Name"].str.strip().str.split().str[0]

# Last name: last token (can be same as first if only one part)
resumes["Last_Name"] = resumes["Name"].str.strip().str.split().str[-1]

# Lowercase versions for matching
resumes["First_Name_Lower"] = resumes["First_Name"].str.lower()
resumes["Last_Name_Lower"] = resumes["Last_Name"].str.lower()

# --- Normalize the gender reference dataset ---

gender_ref["Name_Lower"] = gender_ref["Name"].str.strip().str.lower()

# Standardize Gender labels (M/F or Male/Female → "Male"/"Female")
def normalize_gender_label(g):
    if pd.isna(g):
        return None
    g = str(g).strip().lower()
    if g in ["m", "male"]:
        return "Male"
    if g in ["f", "female"]:
        return "Female"
    return None

gender_ref["Gender_Norm"] = gender_ref["Gender"].apply(normalize_gender_label)

# Ensure Probability is between 0 and 1
if "Probability" in gender_ref.columns:
    # If Probability looks like 0–100 instead of 0–1, rescale
    if gender_ref["Probability"].max() > 1.0:
        gender_ref["Prob_Norm"] = gender_ref["Probability"] / 100.0
    else:
        gender_ref["Prob_Norm"] = gender_ref["Probability"]
else:
    gender_ref["Prob_Norm"] = None

gender_ref.head()


Unnamed: 0,Name,Gender,Count,Probability,Name_Lower,Gender_Norm,Prob_Norm
0,James,M,5304407,0.014517,james,Male,0.014517
1,John,M,5260831,0.014398,john,Male,0.014398
2,Robert,M,4970386,0.013603,robert,Male,0.013603
3,Michael,M,4579950,0.012534,michael,Male,0.012534
4,William,M,4226608,0.011567,william,Male,0.011567


In [14]:
def infer_gender(first_name_lower: str) -> str:
    """
    Infer gender using:
    1. gender_by_name reference dataset (probabilistic)
    2. fallback heuristic if not found
    """
    # 1) Try to use the gender reference dataset
    matches = gender_ref[gender_ref["Name_Lower"] == first_name_lower]
    
    if len(matches) > 0:
        # If multiple rows exist for the same name, pick the most frequent (highest Count)
        row = matches.sort_values(by="Count", ascending=False).iloc[0]
        g = row["Gender_Norm"]
        p = row["Prob_Norm"]
        
        if g in ["Male", "Female"]:
            # If probability is valid, sample probabilistically
            if pd.notna(p):
                p = float(p)
                p = max(min(p, 1.0), 0.0)  # clamp to [0, 1]
                if g == "Male":
                    return np.random.choice(["Male", "Female"], p=[p, 1 - p])
                else:  # g == "Female"
                    return np.random.choice(["Female", "Male"], p=[p, 1 - p])
            else:
                # No probability — deterministic fallback
                return g
    
    # 2) Fallback heuristic if name not present in reference file or invalid row
    fn = first_name_lower
    female_endings = ["a", "i", "y", "ette", "ine", "na", "la"]
    
    if any(fn.endswith(end) for end in female_endings):
        # Mostly female, with a bit of uncertainty
        return np.random.choice(["Female", "Male"], p=[0.8, 0.2])
    
    # Slightly more males overall in fallback
    return np.random.choice(["Male", "Female"], p=[0.52, 0.48])

# Apply to your resumes
resumes["Gender"] = resumes["First_Name_Lower"].apply(infer_gender)

resumes[["Name", "Gender"]].head()


Unnamed: 0,Name,Gender
0,Ashley Ali,Male
1,Wesley Roman,Female
2,Corey Sanchez,Female
3,Elizabeth Carney,Male
4,Julie Hill,Male


In [16]:
# Synthetic surname-based race probabilities (you can expand this)
surname_race_reference = {
    # Hispanic-leaning surnames
    "garcia":    {"White":0.05, "Black":0.01, "Asian":0.01, "Hispanic":0.90, "Native American":0.01, "Other":0.02},
    "rodriguez": {"White":0.05, "Black":0.01, "Asian":0.01, "Hispanic":0.88, "Native American":0.01, "Other":0.04},
    "sanchez":   {"White":0.04, "Black":0.01, "Asian":0.01, "Hispanic":0.90, "Native American":0.01, "Other":0.03},

    # South Asian
    "patel": {"White":0.02, "Black":0.01, "Asian":0.94, "Hispanic":0.01, "Native American":0.00, "Other":0.02},
    "khan":  {"White":0.02, "Black":0.02, "Asian":0.90, "Hispanic":0.02, "Native American":0.00, "Other":0.04},
    "ali":   {"White":0.03, "Black":0.02, "Asian":0.88, "Hispanic":0.02, "Native American":0.00, "Other":0.05},

    # East Asian
    "kim":  {"White":0.02, "Black":0.01, "Asian":0.94, "Hispanic":0.01, "Native American":0.00, "Other":0.02},
    "chen": {"White":0.02, "Black":0.01, "Asian":0.94, "Hispanic":0.01, "Native American":0.00, "Other":0.02},
    "wang": {"White":0.02, "Black":0.01, "Asian":0.95, "Hispanic":0.01, "Native American":0.00, "Other":0.01},

    # African American–leaning
    "washington": {"White":0.15, "Black":0.78, "Asian":0.01, "Hispanic":0.02, "Native American":0.02, "Other":0.02},
    "jackson":    {"White":0.35, "Black":0.55, "Asian":0.01, "Hispanic":0.05, "Native American":0.02, "Other":0.02},

    # Common White/European surnames
    "smith":   {"White":0.70, "Black":0.25, "Asian":0.01, "Hispanic":0.02, "Native American":0.01, "Other":0.01},
    "johnson": {"White":0.65, "Black":0.30, "Asian":0.01, "Hispanic":0.02, "Native American":0.01, "Other":0.01},
    "miller":  {"White":0.80, "Black":0.10, "Asian":0.02, "Hispanic":0.05, "Native American":0.01, "Other":0.02},
}

# Fallback population distribution (approx US)
fallback_races = ["White", "Black", "Asian", "Hispanic", "Native American", "Other"]
fallback_probs = [0.60, 0.13, 0.06, 0.18, 0.01, 0.02]

def infer_race(last_name_lower: str) -> str:
    if last_name_lower in surname_race_reference:
        probs_dict = surname_race_reference[last_name_lower]
        races = list(probs_dict.keys())
        probs = list(probs_dict.values())
        return np.random.choice(races, p=probs)
    
    # Fallback: sample from national distribution
    return np.random.choice(fallback_races, p=fallback_probs)

# Apply to resumes
resumes["Race"] = resumes["Last_Name_Lower"].apply(infer_race)

resumes[["Name", "Race"]].head()


Unnamed: 0,Name,Race
0,Ashley Ali,Asian
1,Wesley Roman,White
2,Corey Sanchez,Hispanic
3,Elizabeth Carney,Asian
4,Julie Hill,Hispanic


In [18]:
def estimate_age(row) -> int:
    exp_years = float(row["Experience (Years)"])
    edu = str(row["Education"])
    
    # Base "start of career" age depending on education
    # You can tweak these if you want
    if "High School" in edu:
        base_age = 18
    elif "B." in edu:  # B.Sc, B.Tech, etc.
        base_age = 22
    elif "MBA" in edu:
        base_age = 25
    elif "M." in edu or "Master" in edu:
        base_age = 24
    elif "PhD" in edu or "Ph.D" in edu:
        base_age = 28
    else:
        base_age = 22  # default
    
    # Age ≈ base + experience + small noise
    noise = np.random.normal(loc=0, scale=2)
    age = base_age + exp_years + noise
    
    # Clip to [18, 65] and cast to int
    age = int(np.clip(round(age), 18, 65))
    return age

resumes["Age"] = resumes.apply(estimate_age, axis=1)

resumes[["Experience (Years)", "Education", "Age"]].head()


Unnamed: 0,Experience (Years),Education,Age
0,10,B.Sc,30
1,10,MBA,33
2,1,MBA,26
3,7,B.Tech,30
4,4,PhD,34


In [20]:
def assign_disability(age: int) -> str:
    if age < 30:
        p_yes = 0.05
    elif age < 40:
        p_yes = 0.10
    elif age < 50:
        p_yes = 0.15
    elif age < 60:
        p_yes = 0.20
    else:
        p_yes = 0.25
    
    return np.random.choice(["Yes", "No"], p=[p_yes, 1 - p_yes])

resumes["Disability_Status"] = resumes["Age"].apply(assign_disability)

resumes[["Age", "Disability_Status"]].head()


Unnamed: 0,Age,Disability_Status
0,30,No
1,33,No
2,26,No
3,30,No
4,34,No


In [22]:
# Optional: Drop helper columns
resumes_final = resumes.drop(columns=[
    "First_Name", "Last_Name", "First_Name_Lower", "Last_Name_Lower"
])

# Save to a new file
output_path = "AI_Resume_Screening_with_demographics.csv"
resumes_final.to_csv(output_path, index=False)

output_path


'AI_Resume_Screening_with_demographics.csv'

In [24]:
print("Gender distribution:")
print(resumes_final["Gender"].value_counts(normalize=True).round(3), "\n")

print("Race distribution:")
print(resumes_final["Race"].value_counts(normalize=True).round(3), "\n")

print("Age summary:")
print(resumes_final["Age"].describe(), "\n")

print("Disability_Status distribution:")
print(resumes_final["Disability_Status"].value_counts(normalize=True).round(3))


Gender distribution:
Gender
Male      0.503
Female    0.497
Name: proportion, dtype: float64 

Race distribution:
Race
White              0.566
Hispanic           0.177
Black              0.158
Asian              0.065
Other              0.025
Native American    0.009
Name: proportion, dtype: float64 

Age summary:
count    1000.000000
mean       29.082000
std         4.245857
min        18.000000
25%        26.000000
50%        29.000000
75%        32.000000
max        43.000000
Name: Age, dtype: float64 

Disability_Status distribution:
Disability_Status
No     0.923
Yes    0.077
Name: proportion, dtype: float64


In [29]:
sorted(resumes["Job Role"].unique())


['AI Researcher',
 'Cybersecurity Analyst',
 'Data Scientist',
 'Software Engineer']

In [33]:
resumes["Job Role"].value_counts().reset_index()


Unnamed: 0,Job Role,count
0,AI Researcher,257
1,Data Scientist,255
2,Cybersecurity Analyst,255
3,Software Engineer,233
