In [1]:
from transformers import AlbertTokenizer, AlbertForSequenceClassification, Trainer
import pandas as pd
from datasets import DatasetDict, Dataset, load_metric
from sklearn.model_selection import train_test_split
import torch
from transformers import AlbertTokenizer, AlbertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset
from training import data_preprocessing
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/55068 [00:00<?, ? examples/s]

Map:   0%|          | 0/11800 [00:00<?, ? examples/s]

Map:   0%|          | 0/11801 [00:00<?, ? examples/s]

  metric = load_metric("accuracy")


## Load the model

In [2]:
# Load the tokenizer and model from the saved directory
saved_model_path = './albert-finetuned'
tokenizer = AlbertTokenizer.from_pretrained(saved_model_path)
model = AlbertForSequenceClassification.from_pretrained(saved_model_path)
model.eval()

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768,

In [3]:
ds = load_dataset("ganchengguang/resume_seven_class")
label_mapping = {'PI': 0, 'Exp': 1, 'Sum': 2, 'Edu': 3, 'QC': 4, 'Skill': 5, 'Obj': 6, '': -1}
train_dataset, val_dataset, test_dataset = data_preprocessing(ds)
inverse_label_mapping = {v: k for k, v in label_mapping.items()}

In [22]:
# Tokenize the test dataset
def preprocess_function(examples):
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding=True)

tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/11801 [00:00<?, ? examples/s]

In [23]:
# Define a function to compute metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    return metric.compute(predictions=predictions, references=torch.tensor(labels))


In [24]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [25]:
# Load the tokenizer and model from the saved directory
saved_model_path = './albert-finetuned'
tokenizer = AlbertTokenizer.from_pretrained(saved_model_path)
model = AlbertForSequenceClassification.from_pretrained(saved_model_path)

# Tokenize the test dataset
def preprocess_function(examples):
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding=True)

tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Create a new Trainer instance
trainer = Trainer(
    model=model,
    tokenizer=tokenizer
)

# Function to compute detailed metrics
def compute_detailed_metrics(predictions, labels):
    preds = np.argmax(predictions, axis=1)
    
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    accuracy = accuracy_score(labels, preds)
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Function to evaluate the model on the test set
def evaluate_model(trainer, tokenized_ds):
    predictions, labels, _ = trainer.predict(tokenized_ds['test'])
    
    metrics = compute_detailed_metrics(predictions, labels)
    
    print(classification_report(labels, np.argmax(predictions, axis=1), target_names=list(label_mapping.keys())[:-1]))
    
    return metrics

# Perform the evaluation
metrics = evaluate_model(trainer, {'test': tokenized_test})

# Print the computed metrics
print(f"Accuracy: {metrics['accuracy']:.4f}")
print(f"Precision: {metrics['precision']:.4f}")
print(f"Recall: {metrics['recall']:.4f}")
print(f"F1-Score: {metrics['f1']:.4f}")

Map:   0%|          | 0/11801 [00:00<?, ? examples/s]

              precision    recall  f1-score   support

          PI       0.92      0.89      0.91      1947
         Exp       0.88      0.92      0.90      6189
         Sum       0.67      0.61      0.64       970
         Edu       0.88      0.89      0.89      1463
          QC       0.63      0.44      0.52       160
       Skill       0.73      0.65      0.69       750
         Obj       0.87      0.85      0.86       322

    accuracy                           0.86     11801
   macro avg       0.80      0.75      0.77     11801
weighted avg       0.86      0.86      0.86     11801

Accuracy: 0.8583
Precision: 0.8551
Recall: 0.8583
F1-Score: 0.8560


# Testing the model

In [4]:
# Make sure to put the model in evaluation mode
model.eval()

# Example input sentences
input_texts = [
    "Highly motivated Data Science engineer with a strong passionfor data and technology. My studies in artificial intelligence andstatistical analysis have equipped me to tackle real-world datachallenges with different International companies working ondifferent projects using NLP, Predictive Maintenance andComputer Vision.",
    "Roukaia Khelifi",
    "Machine learning Deep learning Python SQL PLSQL  NLP Git/GIthub "
]

# Tokenize the input texts
inputs = tokenizer(input_texts, truncation=True, padding=True, return_tensors='pt')

# Get predictions from the model
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Convert logits to predicted class labels
predictions = torch.argmax(logits, dim=-1)

inverse_label_mapping = {v: k for k, v in label_mapping.items()}

# Convert predictions to label names
predicted_labels = [inverse_label_mapping[p.item()] for p in predictions]

# Print out the results
for text, label in zip(input_texts, predicted_labels):
    print(f"Input: {text}\nPredicted Label: {label}\n")


Input: Highly motivated Data Science engineer with a strong passionfor data and technology. My studies in artificial intelligence andstatistical analysis have equipped me to tackle real-world datachallenges with different International companies working ondifferent projects using NLP, Predictive Maintenance andComputer Vision.
Predicted Label: Sum

Input: Roukaia Khelifi
Predicted Label: PI

Input: Machine learning Deep learning Python SQL PLSQL  NLP Git/GIthub 
Predicted Label: Skill



## Testing the Model from an extracted PDF Resume

In [11]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

pdf_path = 'Roukaia Khelifi Cv.pdf'
resume_text = extract_text_from_pdf(pdf_path)

print(resume_text)


Roukaia Khelifi
Data Scientist
About me
Accomplished Projects
AchievementsHighly motivated Data Science engineer with a strong passion
for data and technology. My studies in artificial intelligence and
statistical analysis have equipped me to tackle real-world data
challenges with different International companies working on
different projects using NLP, Predictive Maintenance and
Computer Vision.
Technical Skills:
Machine learning
Deep learning
Python
SQL / PLSQL
NLP
Git/GIthub
LLMs
Education Background
The Private Higher School of
Engineering and Technology, ESPRIT
Computer Science
September 2019 - July 2024
Khaireddine Pacha Ariana High SchoolMy Contact
roukaia70@gmail.com
Ariana, Tunis+216 29 043 930
github.com/RoukaiaKHELIFI
Languages
English (Native)
French (B2 - Intermediate)www.linkedin.com/in/roukaia-
khelifi-046365205/
Hobbies
Continuous Learning: Reading,
Studying, Exploring New Topics, and
Documenting.
Print Design: Adobe Photoshop
(Flyers, Posters, Brochures, Business
Card

In [12]:
import re

# Split the text into sentences or lines
resume_lines = resume_text.splitlines()

# Clean up empty lines or unnecessary spaces
resume_lines = [line.strip() for line in resume_lines if line.strip()]

# Print the lines
for line in resume_lines:
    print(line)


Roukaia Khelifi
Data Scientist
About me
Accomplished Projects
AchievementsHighly motivated Data Science engineer with a strong passion
for data and technology. My studies in artificial intelligence and
statistical analysis have equipped me to tackle real-world data
challenges with different International companies working on
different projects using NLP, Predictive Maintenance and
Computer Vision.
Technical Skills:
Machine learning
Deep learning
Python
SQL / PLSQL
NLP
Git/GIthub
LLMs
Education Background
The Private Higher School of
Engineering and Technology, ESPRIT
Computer Science
September 2019 - July 2024
Khaireddine Pacha Ariana High SchoolMy Contact
roukaia70@gmail.com
Ariana, Tunis+216 29 043 930
github.com/RoukaiaKHELIFI
Languages
English (Native)
French (B2 - Intermediate)www.linkedin.com/in/roukaia-
khelifi-046365205/
Hobbies
Continuous Learning: Reading,
Studying, Exploring New Topics, and
Documenting.
Print Design: Adobe Photoshop
(Flyers, Posters, Brochures, Business
Card

In [13]:
# Tokenize and classify each line
for line in resume_lines:
    inputs = tokenizer(line, truncation=True, padding=True, return_tensors='pt')

    # Get predictions from the model
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # Convert logits to predicted class label
    prediction = torch.argmax(logits, dim=-1).item()
    predicted_label = inverse_label_mapping[prediction]

    # Print the line with its predicted label
    print(f"Line: {line}\nPredicted Label: {predicted_label}\n")

Line: Roukaia Khelifi
Predicted Label: PI

Line: Data Scientist
Predicted Label: Skill

Line: About me
Predicted Label: PI

Line: Accomplished Projects
Predicted Label: Exp

Line: AchievementsHighly motivated Data Science engineer with a strong passion
Predicted Label: Sum

Line: for data and technology. My studies in artificial intelligence and
Predicted Label: Sum

Line: statistical analysis have equipped me to tackle real-world data
Predicted Label: Exp

Line: challenges with different International companies working on
Predicted Label: Exp

Line: different projects using NLP, Predictive Maintenance and
Predicted Label: Exp

Line: Computer Vision.
Predicted Label: Skill

Line: Technical Skills:
Predicted Label: Skill

Line: Machine learning
Predicted Label: Skill

Line: Deep learning
Predicted Label: Skill

Line: Python
Predicted Label: Skill

Line: SQL / PLSQL
Predicted Label: Skill

Line: NLP
Predicted Label: Edu

Line: Git/GIthub
Predicted Label: Exp

Line: LLMs
Predicted Label: 