In [1]:
from google.colab import drive

In [2]:
drive.mount('/content/drive')

_dataset_path = '/content/drive/MyDrive/Celerates/Notebook/Final Project/datasets/acceptance_dataset.csv'

Mounted at /content/drive


### **PRE-PROCESSING DATASET**

In [3]:
import pandas as pd
import numpy as np

In [4]:
# downloaded manually from -> https://huggingface.co/datasets/AzharAli05/Resume-Screening-Dataset

df = pd.read_csv(_dataset_path)
df.head(10)

Unnamed: 0,Role,Resume,Decision,Reason_for_decision,Job_Description
0,E-commerce Specialist,Here's a professional resume for Jason Jones:\...,reject,Lacked leadership skills for a senior position.,Be part of a passionate team at the forefront ...
1,Game Developer,Here's a professional resume for Ann Marshall:...,select,Strong technical skills in AI and ML.,Help us build the next-generation products as ...
2,Human Resources Specialist,Here's a professional resume for Patrick Mccla...,reject,Insufficient system design expertise for senio...,We need a Human Resources Specialist to enhanc...
3,E-commerce Specialist,Here's a professional resume for Patricia Gray...,select,Impressive leadership and communication abilit...,Be part of a passionate team at the forefront ...
4,E-commerce Specialist,Here's a professional resume for Amanda Gross:...,reject,Lacked leadership skills for a senior position.,We are looking for an experienced E-commerce S...
5,Mobile App Developer,"Here's a sample resume for Jose Hall, a skille...",reject,No experience in back-end development.,We need a Mobile App Developer to enhance our ...
6,UX Designer,Here's a professional resume for Rachael Newma...,reject,Insufficient system design expertise for senio...,Help us build the next-generation products as ...
7,Cloud Engineer,"Here's a sample resume for Jessica Hall, a can...",reject,Needs improvement in machine learning algorithms.,We're seeking a talented Cloud Engineer to wor...
8,Digital Marketing Specialist,Here's a sample resume for Jonathan Powers:\n\...,select,Impressive leadership and communication abilit...,"As a Digital Marketing Specialist, you'll lead..."
9,AI Researcher,Here's a sample resume for Zachary Ward:\n\nZa...,select,Excellent full-stack development experience.,If you're passionate about software engineerin...


In [5]:
print(f'describe df:\n{df.describe()}', end='\n\n')
print(f'null values:\n{df.isna().sum()}', end='\n\n')
print(f'duplicated values:\n{df.duplicated().sum()}', end='\n\n')
print(f'col values:\n{df.columns}', end='\n\n')


describe df:
                  Role                                             Resume  \
count            10174                                              10174   
unique              45                                              10174   
top     Data Scientist  Here's a sample resume for Charlie Miller, a P...   
freq               538                                                  1   

       Decision                                Reason_for_decision  \
count     10174                                              10174   
unique        2                                                539   
top      reject  Insufficient system design expertise for senio...   
freq       5114                                                730   

                                          Job_Description  
count                                               10174  
unique                                               3446  
top     Join our team as a Product Manager and leverag...  
freq      

In [6]:
# col names
col = df.columns.tolist()
print(col)

['Role', 'Resume', 'Decision', 'Reason_for_decision', 'Job_Description']


### **Removing Stopwords**

In [7]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
stop = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z ]', ' ', text)
    text = text.lower()
    text = " ".join([w for w in text.split() if w not in stop])
    return text

df["resume_clean"] = df["Resume"].apply(clean_text)

### **Labeling Data**

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
'''
label mapping:
    0: rejected
    1: selected (not accepted)
'''

le = LabelEncoder()
df['label'] = le.fit_transform(df['Decision'])
num_classes = df["label"].nunique()

df.head(10)

Unnamed: 0,Role,Resume,Decision,Reason_for_decision,Job_Description,resume_clean,label
0,E-commerce Specialist,Here's a professional resume for Jason Jones:\...,reject,Lacked leadership skills for a senior position.,Be part of a passionate team at the forefront ...,professional resume jason jones jason jones e ...,0
1,Game Developer,Here's a professional resume for Ann Marshall:...,select,Strong technical skills in AI and ML.,Help us build the next-generation products as ...,professional resume ann marshall ann marshall ...,1
2,Human Resources Specialist,Here's a professional resume for Patrick Mccla...,reject,Insufficient system design expertise for senio...,We need a Human Resources Specialist to enhanc...,professional resume patrick mcclain patrick mc...,0
3,E-commerce Specialist,Here's a professional resume for Patricia Gray...,select,Impressive leadership and communication abilit...,Be part of a passionate team at the forefront ...,professional resume patricia gray patricia gra...,1
4,E-commerce Specialist,Here's a professional resume for Amanda Gross:...,reject,Lacked leadership skills for a senior position.,We are looking for an experienced E-commerce S...,professional resume amanda gross amanda gross ...,0
5,Mobile App Developer,"Here's a sample resume for Jose Hall, a skille...",reject,No experience in back-end development.,We need a Mobile App Developer to enhance our ...,sample resume jose hall skilled mobile app dev...,0
6,UX Designer,Here's a professional resume for Rachael Newma...,reject,Insufficient system design expertise for senio...,Help us build the next-generation products as ...,professional resume rachael newman applying ro...,0
7,Cloud Engineer,"Here's a sample resume for Jessica Hall, a can...",reject,Needs improvement in machine learning algorithms.,We're seeking a talented Cloud Engineer to wor...,sample resume jessica hall candidate applying ...,0
8,Digital Marketing Specialist,Here's a sample resume for Jonathan Powers:\n\...,select,Impressive leadership and communication abilit...,"As a Digital Marketing Specialist, you'll lead...",sample resume jonathan powers jonathan powers ...,1
9,AI Researcher,Here's a sample resume for Zachary Ward:\n\nZa...,select,Excellent full-stack development experience.,If you're passionate about software engineerin...,sample resume zachary ward zachary ward ai res...,1


### **Train Splitting**

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
x_train, x_test, y_train, y_test = train_test_split(
    df['resume_clean'],
    df['label'],
    test_size=0.2,
    random_state=12
    )

### **Tokenization**

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
_max_words = 20000
_max_len = 512

tokenizer = Tokenizer(num_words=_max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(x_train)

x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

x_train_pad = pad_sequences(x_train_seq, maxlen=_max_len, padding='post')
x_test_pad = pad_sequences(x_test_seq, maxlen=_max_len, padding='post')

### **Build Model**

In [15]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras import layers, models

In [16]:
class Attention(layers.Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(
            name='attention_weight',
            shape=(input_shape[-1], input_shape[-1]),
            initializer='glorot_uniform',
            trainable=True
        )
        self.b = self.add_weight(
            name='attention_bias',
            shape=(input_shape[-1],),
            initializer='zeros',
            trainable=True
        )
        self.u = self.add_weight(
            name='attention_context',
            shape=(input_shape[-1],),
            initializer='glorot_uniform',
            trainable=True
        )
        super(Attention, self).build(input_shape)

    def call(self, x):
        # x shape: (batch_size, time_steps, features)
        # Calculate attention scores
        uit = tf.tanh(tf.tensordot(x, self.W, axes=1) + self.b)
        ait = tf.tensordot(uit, self.u, axes=1)

        # Apply softmax to get attention weights
        a = tf.nn.softmax(ait, axis=1)

        # Apply attention weights
        a = tf.expand_dims(a, -1)
        weighted_input = x * a

        # Sum over time steps
        output = tf.reduce_sum(weighted_input, axis=1)

        return output

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

In [17]:
# https://www.geeksforgeeks.org/nlp/adding-attention-layer-to-a-bi-lstm/

model = Sequential([
    Embedding(input_dim=_max_words, output_dim=128, input_length=_max_len),
    LSTM(128, return_sequences=True),
    Attention(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()



### **Train Model**

In [18]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [19]:
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    min_lr=0.00001
)

In [20]:
BI_LSTM_history = model.fit(
    x_train_pad,
    y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=32,
    callbacks=[early_stop, reduce_lr]
)

Epoch 1/20
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 45ms/step - accuracy: 0.5026 - loss: 0.6868 - val_accuracy: 0.5547 - val_loss: 0.6493 - learning_rate: 0.0010
Epoch 2/20
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 25ms/step - accuracy: 0.6582 - loss: 0.6008 - val_accuracy: 0.5504 - val_loss: 0.6540 - learning_rate: 0.0010
Epoch 3/20
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - accuracy: 0.8707 - loss: 0.3113 - val_accuracy: 0.5749 - val_loss: 0.7781 - learning_rate: 0.0010
Epoch 4/20
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 25ms/step - accuracy: 0.9676 - loss: 0.0938 - val_accuracy: 0.5577 - val_loss: 1.0066 - learning_rate: 5.0000e-04


In [21]:
loss, acc = model.evaluate(x_test_pad, y_test)

[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.5318 - loss: 0.6558


In [22]:
print(f'acc: {acc}')
print(f'loss: {loss}')

acc: 0.5547911524772644
loss: 0.6521677374839783


In [26]:
import json

# Save the tokenizer configuration to tokenizer_config.json
tokenizer_json = tokenizer.to_json()
with open('tokenizer_config.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

In [27]:
import pickle

# save tokenizer
with open('tokenizer.json', 'wb') as handle:
    pickle.dump(tokenizer, handle)

In [29]:
model.save('acceptance_classification_model.')

ValueError: Invalid filepath extension for saving. Please add either a `.keras` extension for the native Keras format (recommended) or a `.h5` extension. Use `model.export(filepath)` if you want to export a SavedModel for use with TFLite/TFServing/etc. Received: filepath=acceptance_classification_model.