In [13]:
# Attach Google Drive to the Colab runtime; the dataset folder used below
# lives under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
# Folder (on the mounted Google Drive) that holds the project's dataset files;
# the CSV is read from here with pd.read_csv.
_dataset_path = '/content/drive/MyDrive/Celerates/Notebook/Final Project/datasets'

### **PRE-PROCESSING DATASET**

In [15]:
import pandas as pd
import numpy as np

In [16]:
# Dataset source (downloaded manually):
# https://huggingface.co/datasets/AzharAli05/Resume-Screening-Dataset

# Load the resume-screening CSV from the dataset folder and preview ten rows.
df = pd.read_csv(_dataset_path + '/acceptance_dataset.csv')
df.head(n=10)

Unnamed: 0,Role,Resume,Decision,Reason_for_decision,Job_Description
0,E-commerce Specialist,Here's a professional resume for Jason Jones:\...,reject,Lacked leadership skills for a senior position.,Be part of a passionate team at the forefront ...
1,Game Developer,Here's a professional resume for Ann Marshall:...,select,Strong technical skills in AI and ML.,Help us build the next-generation products as ...
2,Human Resources Specialist,Here's a professional resume for Patrick Mccla...,reject,Insufficient system design expertise for senio...,We need a Human Resources Specialist to enhanc...
3,E-commerce Specialist,Here's a professional resume for Patricia Gray...,select,Impressive leadership and communication abilit...,Be part of a passionate team at the forefront ...
4,E-commerce Specialist,Here's a professional resume for Amanda Gross:...,reject,Lacked leadership skills for a senior position.,We are looking for an experienced E-commerce S...
5,Mobile App Developer,"Here's a sample resume for Jose Hall, a skille...",reject,No experience in back-end development.,We need a Mobile App Developer to enhance our ...
6,UX Designer,Here's a professional resume for Rachael Newma...,reject,Insufficient system design expertise for senio...,Help us build the next-generation products as ...
7,Cloud Engineer,"Here's a sample resume for Jessica Hall, a can...",reject,Needs improvement in machine learning algorithms.,We're seeking a talented Cloud Engineer to wor...
8,Digital Marketing Specialist,Here's a sample resume for Jonathan Powers:\n\...,select,Impressive leadership and communication abilit...,"As a Digital Marketing Specialist, you'll lead..."
9,AI Researcher,Here's a sample resume for Zachary Ward:\n\nZa...,select,Excellent full-stack development experience.,If you're passionate about software engineerin...


In [17]:
# Quick sanity report on the raw frame: summary statistics, missing values
# per column, number of duplicated rows, and the column index.
_report = (
    ('describe df', df.describe()),
    ('null values', df.isna().sum()),
    ('duplicated values', df.duplicated().sum()),
    ('col values', df.columns),
)
for _title, _body in _report:
    print(f'{_title}:\n{_body}', end='\n\n')

# Class balance of the target column.
print(f'unique value count:\n{df["Decision"].value_counts()}')

describe df:
                  Role                                             Resume  \
count            10174                                              10174   
unique              45                                              10174   
top     Data Scientist  Here's a sample resume for Charlie Miller, a P...   
freq               538                                                  1   

       Decision                                Reason_for_decision  \
count     10174                                              10174   
unique        2                                                539   
top      reject  Insufficient system design expertise for senio...   
freq       5114                                                730   

                                          Job_Description  
count                                               10174  
unique                                               3446  
top     Join our team as a Product Manager and leverag...  
freq      

In [18]:
# Column names of the frame as a plain Python list
col = list(df.columns)
print(col)

['Role', 'Resume', 'Decision', 'Reason_for_decision', 'Job_Description']


### **Removing Stopwords**

In [19]:
import re

import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus into the local nltk_data directory.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [20]:
# English stop words as a set, so the per-word membership test in
# clean_text is a hash lookup.
stop = {*stopwords.words('english')}

def clean_text(text):
    """Normalise one raw resume string for tokenization.

    Steps, in order:
      1. drop anything that looks like a URL (``http`` followed by non-space
         characters);
      2. replace every character that is not an ASCII letter or a space with
         a space (digits, punctuation and newlines all become separators);
      3. lower-case the text;
      4. drop the words found in the module-level ``stop`` set and re-join
         the remaining words with single spaces.

    Args:
        text: The raw text. Non-string values (``None``, or the float NaN
            that pandas uses for an empty CSV cell) are accepted and treated
            as empty text instead of raising ``TypeError`` inside ``re.sub``.

    Returns:
        The cleaned, lower-cased, space-separated string; ``""`` when the
        input is not a string or nothing survives the cleaning.
    """
    # Missing cells arrive as None/NaN, not str; there is nothing to clean.
    if not isinstance(text, str):
        return ""

    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z ]', ' ', text)
    text = text.lower()
    # split() with no argument also collapses the runs of spaces created above.
    return " ".join(w for w in text.split() if w not in stop)

# Cleaned copy of every resume, stored next to the raw text.
df["resume_clean"] = df["Resume"].map(clean_text)

### **Labeling Data**

In [21]:
from sklearn.preprocessing import LabelEncoder

In [22]:
# Label mapping produced by the encoder (see the `label` column below):
#     0: "reject"
#     1: "select"  (the dataset uses "select" rather than "accept")

le = LabelEncoder().fit(df['Decision'])
df['label'] = le.transform(df['Decision'])

# Number of distinct target classes; sizes the model's output layer.
num_classes = df["label"].nunique()

df.head(n=10)

Unnamed: 0,Role,Resume,Decision,Reason_for_decision,Job_Description,resume_clean,label
0,E-commerce Specialist,Here's a professional resume for Jason Jones:\...,reject,Lacked leadership skills for a senior position.,Be part of a passionate team at the forefront ...,professional resume jason jones jason jones e ...,0
1,Game Developer,Here's a professional resume for Ann Marshall:...,select,Strong technical skills in AI and ML.,Help us build the next-generation products as ...,professional resume ann marshall ann marshall ...,1
2,Human Resources Specialist,Here's a professional resume for Patrick Mccla...,reject,Insufficient system design expertise for senio...,We need a Human Resources Specialist to enhanc...,professional resume patrick mcclain patrick mc...,0
3,E-commerce Specialist,Here's a professional resume for Patricia Gray...,select,Impressive leadership and communication abilit...,Be part of a passionate team at the forefront ...,professional resume patricia gray patricia gra...,1
4,E-commerce Specialist,Here's a professional resume for Amanda Gross:...,reject,Lacked leadership skills for a senior position.,We are looking for an experienced E-commerce S...,professional resume amanda gross amanda gross ...,0
5,Mobile App Developer,"Here's a sample resume for Jose Hall, a skille...",reject,No experience in back-end development.,We need a Mobile App Developer to enhance our ...,sample resume jose hall skilled mobile app dev...,0
6,UX Designer,Here's a professional resume for Rachael Newma...,reject,Insufficient system design expertise for senio...,Help us build the next-generation products as ...,professional resume rachael newman applying ro...,0
7,Cloud Engineer,"Here's a sample resume for Jessica Hall, a can...",reject,Needs improvement in machine learning algorithms.,We're seeking a talented Cloud Engineer to wor...,sample resume jessica hall candidate applying ...,0
8,Digital Marketing Specialist,Here's a sample resume for Jonathan Powers:\n\...,select,Impressive leadership and communication abilit...,"As a Digital Marketing Specialist, you'll lead...",sample resume jonathan powers jonathan powers ...,1
9,AI Researcher,Here's a sample resume for Zachary Ward:\n\nZa...,select,Excellent full-stack development experience.,If you're passionate about software engineerin...,sample resume zachary ward zachary ward ai res...,1


### **Train/Test Split**

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 20% of the rows for the final test; the fixed random_state keeps
# the split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    df.loc[:, 'resume_clean'],
    df.loc[:, 'label'],
    random_state=12,
    test_size=0.2,
)

### **Tokenization**

In [25]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [26]:
_max_words = 20000  # vocabulary size limit passed to the tokenizer / embedding
_max_len = 512      # every sequence is padded or cut to this many tokens

# The vocabulary is learned from the training texts only; words unseen at
# fit time are mapped to the '<OOV>' token.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=_max_words)
tokenizer.fit_on_texts(x_train)

# Texts -> lists of integer word ids.
x_train_seq, x_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (x_train, x_test)
)

# Integer id lists -> fixed-width arrays, zero-padded at the end.
# NOTE(review): `truncating` is left at its default, so over-long resumes
# are cut according to that default - confirm this is the intended side.
x_train_pad, x_test_pad = (
    pad_sequences(seqs, padding='post', maxlen=_max_len)
    for seqs in (x_train_seq, x_test_seq)
)

### **Build Model**

In [49]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.regularizers import l2
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

In [33]:
# Per-class loss weights ('balanced' mode) computed from the training labels.
# The dict keys are the positions in the weight array; they coincide with the
# class labels because the encoded labels are 0..k-1.
class_weights = {
    position: weight
    for position, weight in enumerate(
        compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    )
}
print(class_weights)

{0: np.float64(0.9979156449239823), 1: np.float64(1.002093080522039)}


In [50]:
# 1D-CNN text classifier over the padded token-id sequences.
#
# The input shape is declared with an explicit Input layer instead of the
# Embedding `input_length` argument: that argument is deprecated in Keras 3
# (it only triggers a warning and is otherwise ignored), which left the model
# unbuilt when summary() was called. With the Input layer the model is built
# immediately, so summary() reports output shapes and parameter counts.
CNN_model = Sequential([
    # One row of `_max_len` integer token ids per sample.
    tf.keras.Input(shape=(_max_len,), dtype='int32'),
    # Token id -> 128-dimensional learned vector.
    Embedding(input_dim=_max_words, output_dim=128),
    # 64 filters sliding over windows of 5 consecutive tokens.
    Conv1D(64, 5, activation='relu', kernel_regularizer=l2(0.001)),
    # Keep the strongest response of every filter over the whole sequence.
    GlobalMaxPooling1D(),
    Dense(32, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    # One probability per target class.
    Dense(num_classes, activation='softmax')
])

# Labels are integer class ids, hence the sparse categorical loss.
CNN_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

CNN_model.summary()

### **Train Model**

In [29]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [45]:
# Stop once validation loss has not improved for 3 epochs and roll the model
# back to the weights of its best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Halve the learning rate after 2 epochs without validation-loss improvement,
# never going below 1e-5.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-5)

In [52]:

# Train for at most 10 epochs, holding out 20% of the training data for
# validation; the callbacks may stop training early or lower the learning
# rate, and the class weights rescale the loss per class.
CNN_history = CNN_model.fit(
    x=x_train_pad,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
    class_weight=class_weights,
    callbacks=[early_stop, reduce_lr],
)

Epoch 1/10
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 148ms/step - accuracy: 0.5436 - loss: 0.7656 - val_accuracy: 0.5577 - val_loss: 0.6866 - learning_rate: 0.0010
Epoch 2/10
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 143ms/step - accuracy: 0.6037 - loss: 0.6690 - val_accuracy: 0.5682 - val_loss: 0.6703 - learning_rate: 0.0010
Epoch 3/10
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 141ms/step - accuracy: 0.8696 - loss: 0.4287 - val_accuracy: 0.5706 - val_loss: 0.8023 - learning_rate: 0.0010
Epoch 4/10
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 137ms/step - accuracy: 0.9727 - loss: 0.1464 - val_accuracy: 0.5768 - val_loss: 0.9013 - learning_rate: 0.0010
Epoch 5/10
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 139ms/step - accuracy: 0.9897 - loss: 0.0885 - val_accuracy: 0.5694 - val_loss: 0.8940 - learning_rate: 5.0000e-04


In [53]:
# Score the trained model on the held-out test split (loss first, then accuracy).
CNN_loss, CNN_acc = CNN_model.evaluate(x=x_test_pad, y=y_test)

[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.5472 - loss: 0.6876


In [54]:
# Report the test-set metrics, one per line.
for _metric, _score in (('acc', CNN_acc), ('loss', CNN_loss)):
    print(f'{_metric}: {_score}')

acc: 0.5449631214141846
loss: 0.6806753873825073
