# PDF Extraction

In [2]:
import os
from PyPDF2 import PdfReader
from tqdm import tqdm
import pandas as pd

In [23]:
pdfs_path = './data/pdfs'
data = []
for folder_name in os.listdir(pdfs_path):
    folder_path = os.path.join(pdfs_path, folder_name)

    if os.path.isdir(folder_path):
        for file_name in tqdm(os.listdir(folder_path), desc=f"Files in {folder_name}", leave=True):
            file_path = os.path.join(folder_path, file_name)
            
            if file_path.endswith('.pdf'):
                with open(file_path, 'rb') as file:
                    reader = PdfReader(file)
                    text = ''
                    for page in reader.pages:
                        text += page.extract_text().replace('\n', ' ')
                    data.append({'label': folder_name, 'resume': text})

Files in CONSULTANT: 100%|██████████| 115/115 [00:33<00:00,  3.45it/s]
Files in AUTOMOBILE: 100%|██████████| 36/36 [00:10<00:00,  3.59it/s]
Files in FINANCE: 100%|██████████| 118/118 [00:29<00:00,  4.05it/s]
Files in ACCOUNTANT: 100%|██████████| 118/118 [00:26<00:00,  4.40it/s]
Files in ADVOCATE: 100%|██████████| 118/118 [00:30<00:00,  3.89it/s]
Files in AGRICULTURE: 100%|██████████| 63/63 [00:15<00:00,  4.09it/s]
Files in PUBLIC-RELATIONS: 100%|██████████| 111/111 [00:28<00:00,  3.96it/s]
Files in FITNESS: 100%|██████████| 117/117 [00:27<00:00,  4.25it/s]
Files in APPAREL: 100%|██████████| 97/97 [00:24<00:00,  3.95it/s]
Files in AVIATION: 100%|██████████| 117/117 [00:29<00:00,  3.96it/s]
Files in CHEF: 100%|██████████| 118/118 [00:26<00:00,  4.52it/s]
Files in BPO: 100%|██████████| 22/22 [00:06<00:00,  3.35it/s]
Files in INFORMATION-TECHNOLOGY: 100%|██████████| 120/120 [00:34<00:00,  3.46it/s]
Files in ENGINEERING: 100%|██████████| 118/118 [00:29<00:00,  4.02it/s]
Files in HEALTHCARE:

In [None]:
df = pd.DataFrame(data)

In [6]:
df.to_csv('data/extract.csv', index=False)

# Data Preprocessing

## Setup

In [3]:
import pandas as pd
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction import text
import spacy
from tqdm import tqdm

In [4]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tsarivan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
df = pd.read_csv('data/extract.csv')
nlp = spacy.load('en_core_web_lg')
STEMMER = SnowballStemmer(language='english') 
SCIKIT_STOPWORDS = text.ENGLISH_STOP_WORDS
NLTK_STOPWORDS = stopwords.words('english')

## Researching

In [10]:
def is_stopword(token):
   if(token.is_stop or \
      token.text in SCIKIT_STOPWORDS or \
      token.text in NLTK_STOPWORDS):
      return True
   else: 
      return False

In [11]:

doc = nlp('I am a dog running to outer space in the galaxy. hr HR bpo BPO')

for token in doc:
    print(f'{token.text}: {is_stopword(token)}')

I: True
am: True
a: True
dog: False
running: False
to: True
outer: False
space: False
in: True
the: True
galaxy: False
.: False
hr: False
HR: False
bpo: False
BPO: False


In [4]:
print('Length of stop words')
print(f'Spacy: {len(nlp.Defaults.stop_words)}')
print(f'Scikit-learn: {len(SCIKIT_STOPWORDS)}')
print(f'NLTK: {len(NLTK_STOPWORDS)}')

Length of stop words
Spacy: 326
Scikit-learn: 318
NLTK: 179


In [None]:
doc = nlp('. , " ! $ % Andy    212    ')
for token in doc:
    print(f'{token.text}: {token.is_alpha}')

.: False
,: False
": False
!: False
$: False
%: False
Andy: True
   : False
212: False
   : False


SnowballStemmer automatically lowercase the word

In [63]:
words = ['Ant', 'B', 'Beekeeper', 'beekeeper', 'Zpp', 'Dog']
for word in words:
    print(f"{word} => {STEMMER.stem(word)}")

Ant => ant
B => b
Beekeeper => beekeep
beekeeper => beekeep
Zpp => zpp
Dog => dog


## Preprocessing

In [39]:
print(f"Num of resumes: {len(df)}")

Num of resumes: 2484


In [40]:
df.isnull().sum()

label     0
resume    1
dtype: int64

In [41]:
df.dropna(inplace=True)
print(f"After checking and dropping NA values: {len(df)}")

After checking and dropping NA values: 2483


In [42]:
def preprocess(txt):
   doc = nlp(txt)
   result = ''

   for token in doc:
      # Only non-entity words are allowed except stopwords
      if(token.is_alpha and \
         not is_stopword(token) and \
         not token.ent_type_):
         preprocessed_txt = STEMMER.stem(token.text)
         result += f'{preprocessed_txt} '

   return result

In [43]:
cleaned_arr = []
for index, row in tqdm(df.iterrows(), total=len(df), desc="Preprocessing Resumes"):
    txt = row['resume']
    cleaned_txt = preprocess(txt)
    cleaned_arr.append(cleaned_txt)

df['clean'] = cleaned_arr

Preprocessing Resumes: 100%|██████████| 2483/2483 [05:54<00:00,  7.00it/s]


In [66]:
df['clean'][0]

'consult educ train mph skill highlight microsoft word compani consult identifi growth monitor promot gmp program implement deficit suggest recommend improv review avail project document locat potenti program gap conduct multipl field visit district southwest observ provid technic input ongo child nutrit activ specif focus session locat gap program particip data report monitor present research find multipl point time relev stakehold program offici compani help organ initi prevent arsenicosi district help creat public awar arsenic poison encourag peopl drink arsenic free water prevent arsenicosi help organ arsenic free water deliveri meet communiti need collabor communiti stakehold help provid poor resid gather distribut medic suppli provid physic examin consult collabor physician public health campaign volunt particip campaign creat awar ant natal check educ mother import breastfeed child birth rural conduct physic examin provid treatment follow servic dissemin inform work program aim 

In [45]:
df.to_csv("data/cleaned.csv", index=False)

# Model Training

## Setup   

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("data/cleaned.csv")

### Encoding

In [12]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [4]:
x = df['clean']
y = df['label']
y_encoded = encoder.fit_transform(y)

### Splitting to training and testing set

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y_encoded, test_size = 0.2)

### Vectorizing 

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

## Models

### RFC

In [45]:
%%time
# using grid search for find optimal values for RF classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

RFC = RandomForestClassifier(random_state=42)

param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

grid = GridSearchCV(estimator=RFC, 
                    param_grid=param_grid, 
                    cv= 5, 
                    scoring='accuracy', 
                    return_train_score=False, 
                    verbose=3)
grid.fit(X_train_tfidf, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END criterion=gini, max_depth=4, max_features=sqrt, n_estimators=200;, score=0.548 total time=   0.7s
[CV 2/5] END criterion=gini, max_depth=4, max_features=sqrt, n_estimators=200;, score=0.516 total time=   0.6s
[CV 3/5] END criterion=gini, max_depth=4, max_features=sqrt, n_estimators=200;, score=0.521 total time=   0.6s
[CV 4/5] END criterion=gini, max_depth=4, max_features=sqrt, n_estimators=200;, score=0.542 total time=   0.6s
[CV 5/5] END criterion=gini, max_depth=4, max_features=sqrt, n_estimators=200;, score=0.519 total time=   0.6s
[CV 1/5] END criterion=gini, max_depth=4, max_features=sqrt, n_estimators=500;, score=0.570 total time=   1.5s
[CV 2/5] END criterion=gini, max_depth=4, max_features=sqrt, n_estimators=500;, score=0.529 total time=   1.6s
[CV 3/5] END criterion=gini, max_depth=4, max_features=sqrt, n_estimators=500;, score=0.549 total time=   1.6s
[CV 4/5] END criterion=gini, max_depth=4, max_feat

In [46]:
params = grid.best_params_
params

{'criterion': 'gini',
 'max_depth': 8,
 'max_features': 'sqrt',
 'n_estimators': 500}

In [47]:
from sklearn.ensemble import RandomForestClassifier
RF_Model = RandomForestClassifier(random_state=42, 
                                  max_features=params['max_features'], 
                                  n_estimators=params['n_estimators'], 
                                  max_depth=params['max_depth'], 
                                  criterion=params['criterion'])
RF_Model.fit(X_train_tfidf, y_train)

In [49]:
y_pred_encoded = RF_Model.predict(X_test_tfidf)
y_pred = encoder.inverse_transform(y_pred_encoded)
y_pred

array(['BUSINESS-DEVELOPMENT', 'TEACHER', 'INFORMATION-TECHNOLOGY',
       'CONSTRUCTION', 'ENGINEERING', 'INFORMATION-TECHNOLOGY', 'HR',
       'TEACHER', 'ACCOUNTANT', 'CONSTRUCTION', 'CHEF', 'TEACHER',
       'DIGITAL-MEDIA', 'PUBLIC-RELATIONS', 'TEACHER',
       'INFORMATION-TECHNOLOGY', 'HEALTHCARE', 'BUSINESS-DEVELOPMENT',
       'FINANCE', 'HEALTHCARE', 'CHEF', 'BANKING',
       'INFORMATION-TECHNOLOGY', 'HEALTHCARE', 'ACCOUNTANT', 'FITNESS',
       'AVIATION', 'PUBLIC-RELATIONS', 'CHEF', 'BUSINESS-DEVELOPMENT',
       'FITNESS', 'ENGINEERING', 'TEACHER', 'CONSULTANT', 'CHEF', 'CHEF',
       'DIGITAL-MEDIA', 'ENGINEERING', 'PUBLIC-RELATIONS', 'ENGINEERING',
       'BANKING', 'APPAREL', 'APPAREL', 'ENGINEERING', 'HR', 'FITNESS',
       'CONSTRUCTION', 'BUSINESS-DEVELOPMENT', 'PUBLIC-RELATIONS',
       'INFORMATION-TECHNOLOGY', 'FINANCE', 'BANKING', 'HEALTHCARE',
       'BUSINESS-DEVELOPMENT', 'AVIATION', 'SALES', 'TEACHER', 'CHEF',
       'FINANCE', 'TEACHER', 'HEALTHCARE', 'FINA

In [50]:
print("training Score: {:.2f}".format(RF_Model.score(X_train_tfidf, y_train)))
print("test Score: {:.2f}".format(RF_Model.score(X_test_tfidf, y_test)))

training Score: 0.88
test Score: 0.55


In [55]:
from sklearn import metrics
print("model report: %s: \n %s\n" % (RF_Model, metrics.classification_report(y_test, y_pred)))

model report: RandomForestClassifier(max_depth=8, n_estimators=500, random_state=42): 
                         precision    recall  f1-score   support

            ACCOUNTANT       0.50      1.00      0.67        21
              ADVOCATE       0.46      0.61      0.52        18
           AGRICULTURE       0.00      0.00      0.00        20
               APPAREL       1.00      0.31      0.47        26
                  ARTS       0.00      0.00      0.00        23
            AUTOMOBILE       0.00      0.00      0.00         5
              AVIATION       0.81      0.74      0.77        23
               BANKING       0.65      0.65      0.65        23
                   BPO       0.00      0.00      0.00         2
  BUSINESS-DEVELOPMENT       0.33      0.65      0.44        17
                  CHEF       0.59      0.94      0.72        18
          CONSTRUCTION       0.74      0.71      0.73        28
            CONSULTANT       0.50      0.03      0.06        30
              D

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Deep Learning 

### Transformer

#### Loading DistilBERT from Keras NLP

In [6]:
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [7]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from keras_nlp import models
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

2024-05-11 21:44:22.366127: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-11 21:44:22.369964: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-11 21:44:22.483378: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-11 21:44:22.843383: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# List to store lengths of sequences
sequence_lengths = []

# Iterate over the dataset and tokenize each sequence
for sequence in df['clean']:
    # Compute the length of the sequence in tokens
    sequence_length = len(sequence)
    
    # Append the length to the list
    sequence_lengths.append(sequence_length)

# Calculate statistics
max_length = max(sequence_lengths)
min_length = min(sequence_lengths)
avg_length = sum(sequence_lengths) / len(sequence_lengths)

# Print statistics
print("Maximum sequence length:", max_length)
print("Minimum sequence length:", min_length)
print("Average sequence length:", avg_length)

Maximum sequence length: 21035
Minimum sequence length: 246
Average sequence length: 3159.7120418848167


In [9]:
# Load a DistilBERT model.
preset= "distil_bert_base_en_uncased"

# Use a shorter sequence length.
preprocessor = models.DistilBertPreprocessor.from_preset(preset,
                                                        sequence_length=21035,
                                                        name="preprocessor_4_tweets")

# Pretrained classifier.
classifier = models.DistilBertClassifier.from_preset(preset,
                                                    preprocessor = preprocessor, 
                                                    num_classes=2)

classifier.summary()

Downloading from https://www.kaggle.com/api/v1/models/keras/distil_bert/keras/distil_bert_base_en_uncased/2/download/preprocessor.json...
Downloading from https://www.kaggle.com/api/v1/models/keras/distil_bert/keras/distil_bert_base_en_uncased/2/download/task.json...


#### Fine-tuning BERT

In [10]:
from keras_core.losses import SparseCategoricalCrossentropy
from keras import optimizers

Using TensorFlow backend


In [11]:
BATCH_SIZE = 32
NUM_TRAINING_EXAMPLES = X_train.shape[0]
TRAIN_SPLIT = 0.8
VAL_SPLIT = 0.2
STEPS_PER_EPOCH = int(NUM_TRAINING_EXAMPLES)*TRAIN_SPLIT // BATCH_SIZE

EPOCHS = 2
AUTO = tf.data.experimental.AUTOTUNE

In [12]:
# Compile
classifier.compile(
    loss=SparseCategoricalCrossentropy(from_logits=True), #'binary_crossentropy',
    optimizer=optimizers.Adam(1e-5),
    metrics= ["accuracy"]  
)

# Fit
history = classifier.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS
)

Epoch 1/2


2024-05-11 21:44:52.619480: E tensorflow/core/util/util.cc:131] oneDNN supports DT_INT64 only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.


: 

# Conclusion and Result