In [1]:
# installing faker
!pip install faker

Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/1.9 MB ? eta -:--:--
   ---------- ----------------------------- 0.5/1.9 MB 985.5 kB/s eta 0:00:02
   ---------- ----------------------------- 0.5/1.9 MB 985.5 kB/s eta 0:00:02
   ---------------- ----------------------- 0.8/1.9 MB 762.0 kB/s eta 0:00:02
   --------------------- ------------------ 1.0/1.9 MB 868.0 kB/s eta 0:00:02
   --------------------------- ------------ 1.3/1.9 MB 919.0 kB/s eta 0:00:01
   -------------------------------- ------- 1.6/1.9 MB 975.2 kB/s eta 0:00:01
   -------------------------------------- - 1.8/1.9 MB 1.0 MB/s eta 0:00:01
   ---------------------------------------- 1.9/1.9 MB 1.0 MB/s eta 0:00:00
Installing collected packages: faker
Success



In [5]:
# importing packages
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import random
import numpy as np
import tensorflow as tf

random.seed(1693)
np.random.seed(1693)
tf.random.set_seed(1693)

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
#from keras.utils import to_categorical

import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split

In [7]:
import random
import csv
from faker import Faker

fake = Faker()

# tech_majors list
TECH_MAJORS = [
    "Computer Science", "Data Science", "Business Analytics", "Mathematics",
    "Statistics", "Information Systems", "Electrical Engineering", "Engineering"
]

# non_tech_majors list
NON_TECH_MAJORS = [
    "Art History", "Theater", "Dance", "Philosophy", "Religious Studies",
    "English Literature", "Fine Arts", "Music", "Culinary Arts", "Fashion Design", "Sports Recreation Management"
]

# degrees list
DEGREES = [
    "Bachelor of Science", "Bachelor of Arts", "Bachelor of Fine Arts",
    "Bachelor of Engineering", "Bachelor of Business Administration"
]

# good universities list
UNIVERSITIES_GOOD = [
    "UC Berkeley", "Carnegie Mellon University", "University of Michigan",
    "Georgia Tech", "Columbia University", "NYU", "Stanford University", "The College of William and Mary", "The University of Virginia", "James Madison University",
    "Virginia Tech", "Longwood", "Virginia Commonwealth University", "Hampton Sydney", "Old Dominion University"
]

# weak universities list
UNIVERSITIES_WEAK = [
    "Fake University1", "Fake University2",
    "Fake University3", "Fake University4", "Fake University5", "Fake University6", "Fake University7", "Fake University8", "Fake University9", "Fake University10",
    "Fake University11", "Fake University12", "Fake University13", "Fake University14", "Fake University15"
]

# strong expereience
STRONG_EXPERIENCE = [
    "Data Analyst Intern", "Software Developer", "Research Assistant",
    "Business Analyst", "Machine Learning Intern"
]

# weak expereience
WEAK_EXPERIENCE = [
    "Barista", "Retail Associate", "Fitness Trainer", "Dog Walker",
    "Receptionist", "Cashier", "Camp Counselor", "Pizza Delivery Driver"
]

# strong skills
STRONG_SKILLS = [
    "Python", "SQL", "Pandas", "Tableau", "Machine Learning", "Data Cleaning", "TensorFlow", "Scikit-learn"
]

# weak skills
WEAK_SKILLS = [
    "Public Speaking", "Teamwork", "Time Management", "Creativity", "Social Media", "Blogging", "Drawing"
]

# certifications
CERTIFICATIONS = [
    "Google Data Analytics", "AWS Cloud Practitioner", "None", "None", "None"
]

# extracurriculars
EXTRACURRICULARS = [
    "Hackathon Participant", "Research Club", "Volunteered at Animal Shelter",
    "None", "None"
]


# building the dataframe
def generate_resume_row():
    underqualified = random.random() < 0.75

    # Randomly override values to create mismatches
    mix_signal = random.random() < 0.4  # 40% of resumes have conflicting signals

    # randomly sampling from majors lists to get major
    major = random.choice(TECH_MAJORS if not underqualified else NON_TECH_MAJORS)
    # randomly sampling from universities lists to get university
    university = random.choice(UNIVERSITIES_GOOD if not underqualified else UNIVERSITIES_WEAK)
    # using random.uniform to get random gpas
    gpa = round(random.uniform(3.5, 4.0), 2) if not underqualified else round(random.uniform(2.2, 3.2), 2)
    # randombly sampling from expereinces lists to get job_title
    job_title = random.choice(STRONG_EXPERIENCE if not underqualified else WEAK_EXPERIENCE)
    #randomly sampling from the skills lists to get skills
    skills = random.sample(STRONG_SKILLS if not underqualified else WEAK_SKILLS, k=random.randint(3, 6))

    # creating noise in the dataframe variables so that underqualfied vs qualfied candidates have a mix of both attributes
    if mix_signal:
        # Randomly flip one or two attributes to make it noisy
        flip_attr = random.sample(["major", "university", "gpa", "job", "skills"], k=random.randint(1, 2))
        for attr in flip_attr:
            if attr == "major":
                major = random.choice(NON_TECH_MAJORS if not underqualified else TECH_MAJORS)
            elif attr == "university":
                university = random.choice(UNIVERSITIES_WEAK if not underqualified else UNIVERSITIES_GOOD)
            elif attr == "gpa":
                gpa = round(random.uniform(2.2, 3.2), 2) if not underqualified else round(random.uniform(3.5, 4.0), 2)
            elif attr == "job":
                job_title = random.choice(WEAK_EXPERIENCE if not underqualified else STRONG_EXPERIENCE)
            elif attr == "skills":
                skills = random.sample(WEAK_SKILLS if not underqualified else STRONG_SKILLS, k=random.randint(3, 6))
    # the return statement returns these variables
    return {
        "full_name": fake.name(),
        "email": fake.email(),
        "phone": fake.phone_number(),
        "location": f"{fake.city()}, {fake.state_abbr()}",
        "degree": random.choice(DEGREES),
        "major": major,
        "university": university,
        "gpa": gpa,
        "graduation_year": random.randint(2021, 2025),
        "job_title": job_title,
        "company": fake.company(),
        "start_date": f"{random.randint(2020, 2023)}-{random.randint(1,12):02d}",
        "end_date": f"{random.randint(2023, 2025)}-{random.randint(1,12):02d}",
        "skills": ", ".join(skills),
        "certifications": ", ".join(random.sample(CERTIFICATIONS, k=random.randint(0, 2))),
        "extracurriculars": ", ".join(random.sample(EXTRACURRICULARS, k=random.randint(1, 3))),
        "qualified": "No" if underqualified else "Yes"
    }

# defining the generate_resume_csv function that creates fake_resumes.csv
def generate_resume_csv(filename="fake_resumes.csv", num=500):
    data = [generate_resume_row() for _ in range(num)]
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)
    print(f"✅ {num} fake resumes saved to '{filename}'")

# using the function defined above to create a sample 500 fake resumes
if __name__ == "__main__":
    generate_resume_csv("fake_resumes.csv", num=500)


✅ 500 fake resumes saved to 'fake_resumes.csv'


In [9]:
# reading the csv into a dataframe called data
data = pd.read_csv("fake_resumes.csv")

# getting the head of the dataframe
data.head()

Unnamed: 0,full_name,email,phone,location,degree,major,university,gpa,graduation_year,job_title,company,start_date,end_date,skills,certifications,extracurriculars,qualified
0,Marc Skinner,adam53@example.org,001-555-649-5086x3518,"Lake Lindabury, OR",Bachelor of Business Administration,Mathematics,Fake University12,3.98,2025,Camp Counselor,Nelson Ltd,2020-05,2024-11,"Scikit-learn, Pandas, Machine Learning, SQL, D...","None, None","Hackathon Participant, None",Yes
1,James Taylor,kaylee16@example.org,+1-803-584-1534x0942,"Chanport, NJ",Bachelor of Science,Information Systems,Old Dominion University,3.74,2021,Software Developer,Phillips PLC,2022-05,2023-01,"SQL, Scikit-learn, TensorFlow, Data Cleaning, ...","Google Data Analytics, None","Hackathon Participant, Research Club, None",Yes
2,Eric Johnson,gaylinda@example.net,001-611-577-2318x6881,"East Susanfurt, ND",Bachelor of Engineering,Music,Stanford University,3.88,2023,Research Assistant,"Banks, Thompson and James",2022-11,2023-05,"Pandas, Python, Scikit-learn, Tableau",,,Yes
3,Jessica Luna,elewis@example.org,436-646-0618,"South Taylorbury, DC",Bachelor of Arts,Statistics,Fake University1,3.84,2022,Camp Counselor,Walls Inc,2022-09,2024-10,"Drawing, Teamwork, Time Management, Blogging, ...",,,No
4,Dr. Karen Roman,mathisrobert@example.org,416.264.1327,"North Hayden, FM",Bachelor of Business Administration,Sports Recreation Management,Fake University12,2.22,2023,Camp Counselor,Duarte Group,2020-08,2025-07,"Social Media, Blogging, Time Management, Publi...","AWS Cloud Practitioner, None",,No


In [11]:
# printing the value counts of the qualfied variable to see class disribution
print(data['qualified'].value_counts())

qualified
No     363
Yes    137
Name: count, dtype: int64


In [13]:
# seeing the info and data types
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   full_name         500 non-null    object 
 1   email             500 non-null    object 
 2   phone             500 non-null    object 
 3   location          500 non-null    object 
 4   degree            500 non-null    object 
 5   major             500 non-null    object 
 6   university        500 non-null    object 
 7   gpa               500 non-null    float64
 8   graduation_year   500 non-null    int64  
 9   job_title         500 non-null    object 
 10  company           500 non-null    object 
 11  start_date        500 non-null    object 
 12  end_date          500 non-null    object 
 13  skills            500 non-null    object 
 14  certifications    223 non-null    object 
 15  extracurriculars  441 non-null    object 
 16  qualified         500 non-null    object 
dt

In [15]:
# defining the x variable, which will include, degree, major, university, gpa, job title, skills, certifications, and extracurriculars from data
x = data[['major', 'university', 'gpa', 'job_title', 'skills', 'certifications', 'extracurriculars']]

# defining the y variable, which will include the underqualified_flag from data
y = data[['qualified']]

In [17]:
# viewing the x dataframe
x.head()

Unnamed: 0,major,university,gpa,job_title,skills,certifications,extracurriculars
0,Mathematics,Fake University12,3.98,Camp Counselor,"Scikit-learn, Pandas, Machine Learning, SQL, D...","None, None","Hackathon Participant, None"
1,Information Systems,Old Dominion University,3.74,Software Developer,"SQL, Scikit-learn, TensorFlow, Data Cleaning, ...","Google Data Analytics, None","Hackathon Participant, Research Club, None"
2,Music,Stanford University,3.88,Research Assistant,"Pandas, Python, Scikit-learn, Tableau",,
3,Statistics,Fake University1,3.84,Camp Counselor,"Drawing, Teamwork, Time Management, Blogging, ...",,
4,Sports Recreation Management,Fake University12,2.22,Camp Counselor,"Social Media, Blogging, Time Management, Publi...","AWS Cloud Practitioner, None",


In [19]:
# viewing the y dataframe
y.head()

Unnamed: 0,qualified
0,Yes
1,Yes
2,Yes
3,No
4,No


In [21]:
# seeing the x data types
print(x.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   major             500 non-null    object 
 1   university        500 non-null    object 
 2   gpa               500 non-null    float64
 3   job_title         500 non-null    object 
 4   skills            500 non-null    object 
 5   certifications    223 non-null    object 
 6   extracurriculars  441 non-null    object 
dtypes: float64(1), object(6)
memory usage: 27.5+ KB
None


In [23]:
# one hot encoding the categorical variables for the x dataframe
x1hot = pd.get_dummies(x, columns=['major', 'university', 'job_title', 'skills', 'certifications', 'extracurriculars'])

#printing the shape of x1hot
print(x1hot.shape)

(500, 602)


In [25]:
# seeing the xy data type
print(y.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   qualified  500 non-null    object
dtypes: object(1)
memory usage: 4.0+ KB
None


In [27]:
# one hot encoding the categorical variable in the y dataframe
y1hot = pd.get_dummies(y, columns=['qualified'])
#printing the shape of y1hot
print(y1hot.shape)

(500, 2)


In [29]:
# viewing the y1hot
y1hot.head()

Unnamed: 0,qualified_No,qualified_Yes
0,False,True
1,False,True
2,False,True
3,True,False
4,True,False


In [31]:
# splitting into train and test

X_train, X_test, Y_train, Y_test = train_test_split(x1hot, y1hot, test_size = 0.3, random_state = 1693)

In [33]:
# getting the shape of X_train for input size
print(X_train.shape)
# getting the shape of Y_train for output size
print(Y_train.shape)

(350, 602)
(350, 2)


In [35]:
# creating the model
model = Sequential()

# adding a dense input layer with 15 units, Y_train[1] as the input dim, and relu activation
model.add(Dense(units = 15,
                input_dim = X_train.shape[1],
                activation = 'relu'))

# adding a dese layer with 8 units, and relu activation
model.add(Dense(units = 8,
                activation = 'relu'))

# adding a dense output layer with 2 units, and softmax activation
model.add(Dense(units = 2,
                activation = 'softmax'))

# printing the model summary
print(model.summary())

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


None


In [37]:
# compiling the model
model.compile(loss = 'BinaryCrossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

In [39]:
# getting estimates
model.fit(X_train, Y_train, epochs = 100, verbose = 0)

<keras.src.callbacks.history.History at 0x1f6233abb90>

In [41]:
# evaluating the model accuracy
model.evaluate(X_test, Y_test)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8785 - loss: 0.2680  


[0.24643567204475403, 0.8933333158493042]