In [7]:
!pip install PyPDF2 nltk pandas google-colab


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting jedi>=0.16 (from ipython==7.34.0->google-colab)
  Downloading jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2, jedi
Successfully installed PyPDF2-3.0.1 jedi-0.19.1


In [73]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

from google.colab import drive



# Mount Google Drive
drive.mount('/content/drive')
# Load labeled dataset
labeled_data_path = '/content/drive/MyDrive/Research/My/PreprocessedData/labeled_ksa_dataset.csv'
data = pd.read_csv(labeled_data_path)

# Check the structure of the data
print(data.head())

# Preprocess the data
X = data['Text']  # Features (resume text)
y = data['Label']  # Labels (KSA categories)

# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the text data into numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train an SVM classifier
model = SVC(kernel='linear')
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print(classification_report(y_test, y_pred))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   Resume ID                                               Text      Label
0          6  70% Back-end (Java/Spring/SQL/NoSQL) 30% Front...     Skills
1          6  Knowledge of Java, JavaScript, TypeScript, C#,...  Knowledge
2          6  Participated in design and development of an e...  Abilities
3          6  Responsible for achieving target KPI during cu...  Abilities
4          7  2 years of experience in frontend development ...     Skills
              precision    recall  f1-score   support

   Abilities       0.57      0.57      0.57         7
   Knowledge       1.00      0.43      0.60         7
      Skills       0.72      0.93      0.81        14

    accuracy                           0.71        28
   macro avg       0.76      0.64      0.66        28
weighted avg       0.75      0.71      0.70        28



In [74]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for SVM
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(SVC(), param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train_tfidf, y_train)

# Get the best parameters
print(f"Best parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

# Evaluate the tuned model
y_pred = best_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
              precision    recall  f1-score   support

   Abilities       0.56      0.71      0.62         7
   Knowledge       1.00      0.43      0.60         7
      Skills       0.81      0.93      0.87        14

    accuracy                           0.75        28
   macro avg       0.79      0.69      0.70        28
weighted avg       0.80      0.75      0.74        28



In [75]:
import pickle

# Assuming you have the trained model and TF-IDF vectorizer
# Save the model to a file
with open('/content/drive/MyDrive/Research/My/Model/model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Save the TF-IDF vectorizer to a file
with open('/content/drive/MyDrive/Research/My/Model/vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)


In [76]:
import PyPDF2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import pickle
import re

# Load the trained model and vectorizer
with open('/content/drive/MyDrive/Research/My/Model/model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

with open('/content/drive/MyDrive/Research/My/Model/vectorizer.pkl', 'rb') as vec_file:
    vectorizer = pickle.load(vec_file)

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(reader.pages)):
            text += reader.pages[page_num].extract_text()
        return text

# Function to split text into sentences/paragraphs
def split_into_segments(text):
    # Splitting based on full stops, new lines, etc.
    segments = re.split(r'(?<=[.!?]) +|\n', text)
    return [segment.strip() for segment in segments if len(segment.strip()) > 0]

# Path to the new resume (PDF) you want to test
new_resume_path = '/content/drive/MyDrive/Research/My/Test/50.pdf'

# Step 1: Convert the new resume to text
new_resume_text = extract_text_from_pdf(new_resume_path)

# Step 2: Split the text into segments (sentences/paragraphs)
resume_segments = split_into_segments(new_resume_text)

# Step 3: Preprocess each segment using the trained TF-IDF vectorizer
resume_tfidf = vectorizer.transform(resume_segments)

# Step 4: Predict the KSAs for each segment
predicted_ksas = model.predict(resume_tfidf)

# Step 5: Output the results
for segment, ksa in zip(resume_segments, predicted_ksas):
    print(f"Text: {segment}\nPredicted KSA: {ksa}\n{'-'*50}")


Text: Full-time, on-site, remote or hybrid.
Predicted KSA: Skills
--------------------------------------------------
Text: DOB: 1984.
Predicted KSA: Skills
--------------------------------------------------
Text: Aliyah: .
Predicted KSA: Skills
--------------------------------------------------
Text: Residence: Kiryat Ono, Montefiore, 13.
Predicted KSA: Skills
--------------------------------------------------
Text: My name is Maria, Im a new repatriate from Russia, made an Aliyah in April.
Predicted KSA: Skills
--------------------------------------------------
Text: I work as a
Predicted KSA: Skills
--------------------------------------------------
Text: senior system
Predicted KSA: Abilities
--------------------------------------------------
Text: analyst, a senior business analyst, a team leader and a project manager.
Predicted KSA: Abilities
--------------------------------------------------
Text: Understanding the importance of language in my profession, Im actively studying Heb

## **Full Code to Fine-Tune BERT:
**

In [46]:
import pandas as pd
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv('/content/drive/MyDrive/Research/My/PreprocessedData/labeled_ksa_dataset.csv')

# Encode the labels (Knowledge: 0, Skills: 1, Abilities: 2)
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Label'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Label'], test_size=0.2, random_state=42)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dataset
def tokenize_function(texts):
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,  # You can adjust this based on your data
        return_tensors="pt"
    )

X_train_tokenized = tokenize_function(list(X_train))
X_test_tokenized = tokenize_function(list(X_test))




In [47]:
from transformers import BertForSequenceClassification

# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # 3 labels for KSA


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
from transformers import Trainer, TrainingArguments
import torch

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="epoch",     # evaluate each epoch
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    num_train_epochs=3,              # number of epochs
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

def convert_to_hf_dataset(tokenized_inputs, labels):
    return Dataset.from_dict({
        'input_ids': tokenized_inputs['input_ids'],
        'attention_mask': tokenized_inputs['attention_mask'],
        'labels': labels
    })

# Convert train and test sets to Hugging Face Dataset
train_dataset = convert_to_hf_dataset(X_train_tokenized, list(y_train))
test_dataset = convert_to_hf_dataset(X_test_tokenized, list(y_test))

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()




Epoch,Training Loss,Validation Loss
1,No log,0.812401
2,No log,0.732744
3,No log,0.717975


TrainOutput(global_step=21, training_loss=0.7318045752389091, metrics={'train_runtime': 214.8303, 'train_samples_per_second': 1.55, 'train_steps_per_second': 0.098, 'total_flos': 6845060008080.0, 'train_loss': 0.7318045752389091, 'epoch': 3.0})

In [60]:
!pip install scikit-learn




In [61]:
from sklearn.metrics import accuracy_score

# Define the compute_metrics function for accuracy
def compute_metrics(p):
    preds = p.predictions.argmax(-1)  # Get the predicted class with the highest score
    accuracy = accuracy_score(p.label_ids, preds)  # Calculate accuracy manually
    return {"accuracy": accuracy}

# Update the Trainer with the compute_metrics function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics  # Pass the manual accuracy function
)

# Train and evaluate the model
trainer.train()
results = trainer.evaluate()

# Print loss and accuracy
print(f"Test Loss: {results['eval_loss']}")
print(f"Test Accuracy: {results['eval_accuracy']}")


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.777147,0.678571
2,No log,0.65216,0.785714
3,No log,0.622135,0.785714


Test Loss: 0.6221352815628052
Test Accuracy: 0.7857142857142857


In [None]:
import PyPDF2
from transformers import BertTokenizer
import torch

# Load the fine-tuned model and tokenizer
model_path = "/path_to_your_fine_tuned_model"  # Path to your fine-tuned BERT model
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)  # Use PyPDF2 to read PDF
        text = ""
        for page_num in range(len(reader.pages)):
            text += reader.pages[page_num].extract_text()
        return text

# Function to split text into sentences/paragraphs
def split_into_segments(text):
    # Splitting based on full stops, new lines, etc.
    segments = re.split(r'(?<=[.!?]) +|\n', text)
    return [segment.strip() for segment in segments if len(segment.strip()) > 0]

# Step 1: Extract the resume from the PDF
new_resume_path = '/content/drive/MyDrive/Research/My/Test/50.pdf'
new_resume_text = extract_text_from_pdf(new_resume_path)

# Step 2: Split the text into segments (sentences or paragraphs)
resume_segments = split_into_segments(new_resume_text)

# Step 3: Preprocess each segment using the BERT tokenizer
tokenized_segments = tokenizer(
    resume_segments,
    padding=True,
    truncation=True,
    max_length=128,  # You can adjust this based on your data
    return_tensors="pt"
)

# Step 4: Predict the KSAs for each segment
model.eval()  # Set model to evaluation mode
with torch.no_grad():
    outputs = model(**tokenized_segments)
    predictions = outputs.logits.argmax(dim=-1)

# Map the predicted labels back to KSA classes
label_map = {0: "Knowledge", 1: "Skills", 2: "Abilities"}  # Ensure this matches your label mapping
predicted_ksas = [label_map[pred.item()] for pred in predictions]

# Step 5: Output the results
for segment, ksa in zip(resume_segments, predicted_ksas):
    print(f"Text: {segment}\nPredicted KSA: {ksa}\n{'-'*50}")
