# Load Dependencies

Download and install dependencies

In [None]:
# !pip install assemblyai

In [None]:
# !pip install pytube

In [None]:
import pandas as pd

from pathlib import Path
from google.colab import drive

import spacy
import torch
import torch.nn as nn
from transformers import BertTokenizer, BartTokenizer, DistilBertTokenizer
from transformers import BertModel, DistilBertModel, BartForConditionalGeneration, BertForSequenceClassification
from transformers import pipeline

import logging

In [None]:
# import assemblyai as aai
# from pytube import YouTube

In [None]:
# Mount Google Drive at /content/drive so files stored on Drive can be read
# from this Colab session through the normal file system.
drive.mount('/content/drive')

Mounted at /content/drive


# Functions

### Video Transcription

In [None]:
def transcribe_from_url(url, output_path='path_to_save'):
    """
    Transcribe audio from the specified URL.

    Downloads the first audio-only stream of the video and passes the
    downloaded file to the transcriber.

    NOTE: relies on the names `YouTube` (pytube) and `aai` (assemblyai) being
    in scope. If they are not imported, the resulting NameError is reported
    through the error-message return value like any other failure.

    Args:
        url (str): The URL of the video containing the audio to transcribe.
        output_path (str): Directory the audio file is downloaded into.
            Defaults to 'path_to_save' (relative to the working directory).

    Returns:
        str: The transcribed text from the audio in the video. Returns an error
            message starting with "Error occurred:" if any step fails.
    """
    try:
        # Download the audio track of the video
        yt = YouTube(url)
        audio_stream = yt.streams.filter(only_audio=True).first()
        if audio_stream is None:
            raise ValueError("No audio stream found in the video.")
        # download() returns the path of the file it wrote; use that instead
        # of rebuilding the path by hand from the directory and file name.
        audio_path = audio_stream.download(output_path=output_path)

        # Transcribe audio
        transcriber = aai.Transcriber()
        transcript = transcriber.transcribe(audio_path)

        return transcript.text
    except Exception as e:
        # Documented contract: failures are reported as a message, not raised.
        return f"Error occurred: {str(e)}"



In [None]:
def transcribe_from_file(filepath):
    """
    Transcribe the audio stored in a local file.

    NOTE: relies on the name `aai` (assemblyai) being in scope.

    Args:
        filepath (str): The path to the audio file to transcribe.

    Returns:
        str: The transcript text. If anything goes wrong, a message of the
            form "Error occurred: <details>" is returned instead of raising.
    """
    try:
        # Build a transcriber, submit the file and read the text in one go.
        return aai.Transcriber().transcribe(data=filepath).text
    except Exception as exc:
        return f"Error occurred: {exc!s}"

##### Examples

In [None]:
# transcribe_from_file('/content/path_to_save/5 Area To Focus For Data Science Interviews🔥🔥 datascience.mp4')

In [None]:
# transcribe_from_url('https://www.youtube.com/shorts/yZMw2rOKYwE')

### Summarization

In [None]:
# Load pre-trained BART model and tokenizer
# Using the bart-base model can save resources; if we need better performance
# we can use "facebook/bart-large-cnn" instead.

def initialize_summarizer(model_name="facebook/bart-base"):
    """
    Initialize the BART model and tokenizer for text summarization.

    Args:
    - model_name (str): Name of the pre-trained BART model to use.

    Returns:
    - summarizer: Initialized text summarization pipeline.

    Raises:
    - Exception: If the tokenizer, model or pipeline cannot be created. The
      original error is attached as the exception's cause.
    """
    try:
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name)
        summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
        return summarizer
    except Exception as e:
        # Chain explicitly so the traceback shows the root cause as the cause.
        raise Exception(f"Error initializing summarizer: {str(e)}") from e

In [None]:
def summarize_text(text, summarizer):
    """
    Summarize the input text using the provided summarizer pipeline.

    The summarizer is called with max_length=130, min_length=30 and
    do_sample=False, and the 'summary_text' of its first result is returned.

    Args:
    - text (str): Input text to be summarized.
    - summarizer: Initialized text summarization pipeline (any callable with
      the same call signature and result shape works).

    Returns:
    - summary (str): Summarized text.

    Raises:
    - Exception: If the summarizer fails or returns an unexpected result. The
      original error is attached as the exception's cause.
    """
    try:
        results = summarizer(text, max_length=130, min_length=30, do_sample=False)
        return results[0]['summary_text']
    except Exception as e:
        raise Exception(f"Error summarizing text: {str(e)}") from e

### Skill extraction

In [None]:
# Function to load the skill extraction pipeline
def load_skill_extraction_pipeline():
    """
    Load a BERT-based Named Entity Recognition (NER) model for skills extraction.

    Returns:
        pipeline: A pipeline object for skill extraction using the loaded NER model.

    Raises:
        Exception: If the pipeline cannot be loaded. The original error is
            attached as the exception's cause.
    """
    try:
        model_name = "GalalEwida/LLM-BERT-Model-Based-Skills-Extraction-from-jobdescription"
        ner = pipeline(task="ner", model=model_name)
        print("NER for skills extraction loaded loaded successflly.")
        return ner
    except Exception as e:
        # Build one readable message (passing `e` as a second argument made the
        # exception text render as a tuple) and keep the root cause chained.
        raise Exception(f"Error occurred while loading the skill extraction pipeline: {e}") from e

In [None]:
# Function to extract skills from text using the NER model
def extract_skills(input, ner):
    """
    Extract skills from the given input text using the provided NER model.

    Args:
        input (str): The input text from which skills are to be extracted.
        ner (pipeline): The NER model pipeline for skill extraction; it is
            called with the text as its only argument.

    Returns:
        The raw predictions exactly as returned by `ner`.

    Raises:
        Exception: If the NER call fails. The original error is attached as
            the exception's cause.
    """
    try:
        predictions = ner(input)
        print("Skills extracted successfully.")
        return predictions
    except Exception as e:
        # Build one readable message (passing `e` as a second argument made the
        # exception text render as a tuple) and keep the root cause chained.
        raise Exception(f"Error occurred while extracting skills: {e}") from e

In [None]:
# Function to turn the raw NER output into a list of skills
def skill_preprocessing(input):
    """
    Preprocess the output of the skill extraction model.

    Tokens whose label starts with 'B' open a new skill; tokens whose label
    starts with 'I' extend the skill currently being built. Word pieces
    containing '##' are glued to the previous piece, other continuation tokens
    are appended after a space. Entries lacking 'word' or 'entity' are skipped.

    Args:
        input (list): A list of dictionaries representing the output of the skill extraction model.
                      Each dictionary contains 'word' and 'entity' keys representing a predicted token and its entity label.

    Returns:
        list: The unique skills extracted from the input, in order of first
              appearance (so results are reproducible between runs).
              Returns None if an error occurs during preprocessing.
    """
    try:
        # A dict keeps insertion order, so it de-duplicates without shuffling.
        skills = {}
        current_skill = None
        for element in input:
            word = element.get('word')
            entity_label = element.get('entity')
            if word is None or entity_label is None:
                continue
            if entity_label.startswith('B'):
                # A new skill begins: store the one built so far.
                if current_skill:
                    skills[current_skill] = None
                current_skill = word
            elif entity_label.startswith('I') and current_skill:
                if '##' in word:
                    current_skill += word.replace('##', '')
                else:
                    current_skill += ' ' + word
        # Flush the skill still being built when the input ends.
        if current_skill:
            skills[current_skill] = None
        print("Skill set extraced successfully.")
        return list(skills)
    except Exception as e:
        print("Error occurred during skill preprocessing:", e)
        return None

### Sentence Extraction

In [None]:
# Collect, for every skill, the sentences of the text that mention it
def sentence_extraction(doc, skill_set):
    """
    Map each skill to the sentences of `doc` that contain it.

    The text is split into sentences with the module-level spaCy pipeline
    `nlp`. Matching is a case-insensitive substring test, so a skill also
    matches inside longer words.

    Args:
        doc (str): The text to search.
        skill_set (iterable of str): Skills to look for. None is treated as
            an empty collection.

    Returns:
        dict: skill -> list of sentence texts (original casing) containing it.
              Skills without any matching sentence map to an empty list.
    """
    parsed = nlp(doc)
    # Lower-case every sentence once instead of once per skill.
    sentences = [(sent.text, sent.text.lower()) for sent in parsed.sents]
    topic_sentences = {}
    for skill in skill_set or ():
        # Lower-case the skill too, otherwise skills with capitals never match.
        needle = skill.lower()
        topic_sentences[skill] = [text for text, lowered in sentences if needle in lowered]
    print("Entities related to skills extracted successfully.")
    return topic_sentences

##### Examples

In [None]:
# predictions = extract_skills(positive_answer , ner)

In [None]:
# skill_list = skill_preprocessing(predictions)

In [None]:
# topic_sentences = sentence_extraction(positive_answer , skill_list)

In [None]:
# topic_sentences

In [None]:
# topic_sentences['machine learning'][0]

In [None]:
# topic_sentences['neural networks'][1]

In [None]:
# ner = load_skill_extraction_pipeline()

In [None]:
# skill_preprocessing(extract_skills(NLP_fundamentals, ner))

In [None]:
# skill_preprocessing(extract_skills(models, ner))

### Skills Classification

In [None]:
class ScreenBert(nn.Module):
    """Encoder followed by dropout and a single-output linear layer.

    The encoder class is read from the module-level name `model_family`, which
    must be defined before an instance is created.
    """

    def __init__(self, model_name, droput_prob=0.1):
        """Build the encoder from `model_name` and attach the dropout/linear head.

        Args:
            model_name: Identifier passed to `model_family.from_pretrained`.
            droput_prob: Probability used for the dropout layer.
        """
        super().__init__()
        self.bert = model_family.from_pretrained(model_name)
        self.dropout = nn.Dropout(droput_prob)
        # The head expects 768 input features - presumably the encoder's
        # hidden size; confirm before switching to a different encoder.
        self.fc = nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        """Return the linear head's output for the encoder's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoded.pooler_output))

In [None]:
def load_model():
    """
    Build the ScreenBert classifier and load its trained weights.

    Reads the module-level settings `model_name` (encoder to instantiate) and
    `model_path` (checkpoint holding the trained state dict).

    Returns:
        tuple: (model, device) - the model placed on `device` and switched to
               evaluation mode, and the torch device chosen (CUDA if available,
               otherwise CPU).
    """
    # Prepare for model loading. Use the GPU when CUDA is available.
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f'GPU computation device: {device}')
    else:
        device = torch.device('cpu')
        print(f'Computation device: {device}')

    # Instantiate model
    print('Loading model ...')
    model = ScreenBert(model_name).to(device)
    # Load trained parameters into the model.
    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # from a trusted source (consider weights_only=True if the installed
    # torch version supports it).
    model.load_state_dict(torch.load(model_path, map_location=device))
    # Modules start in training mode; switch to evaluation mode so dropout is
    # disabled and predictions are deterministic.
    model.eval()
    print('Model successfully loaded.')

    # Return the loaded model and device
    return model, device

In [None]:
def load_tokenizer():
    """
    Create the tokenizer selected by the module-level settings.

    Calls `tokenizer_family.from_pretrained(tokenizer_name)`, printing a
    progress message before and after.

    Returns:
        The tokenizer instance returned by `from_pretrained`.
    """
    print('Loading tokenizer ...')
    loaded = tokenizer_family.from_pretrained(tokenizer_name)
    print("Tokenizer successfully loaded.")
    return loaded

In [None]:
def classify_this(text):
    """
    Classify one piece of text with the loaded model.

    Uses the module-level `tokenizer`, `model` and `device`.

    Args:
        text (str): The text (typically one sentence) to classify.

    Returns:
        tuple: (prediction, probability) where probability is the sigmoid of
               the model output as a float and prediction is 1 if that
               probability is above 0.5, otherwise 0.
    """
    # Truncate to the tokenizer's maximum length so that long inputs do not
    # exceed the encoder's input limit.
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        truncation=True,
        return_tensors='pt',
    )
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Inference must run in evaluation mode, otherwise dropout stays active
    # and the same text can receive different scores on every call.
    model.eval()
    with torch.no_grad():
        raw_scores = model(input_ids, attention_mask)
        # NOTE: `.item()` below requires a single-element tensor, so despite
        # the wording of this message there is one score per text.
        print(f'Raw scores for both classes (positive and negative) = {raw_scores}')

    probabilities = torch.sigmoid(raw_scores).item()

    # here we could introduce multi-classification
    prediction = 1 if probabilities > 0.5 else 0

    return prediction, probabilities

# Candidate Scoring

In [None]:
def score_candidate(n_questions, n_1=0):
    """
    Compute the fraction of questions that were classified as positive.

    Args:
        n_questions (int): Total number of questions.
        n_1 (int): Number of questions with a positive (1) prediction.
            Defaults to 0.

    Returns:
        float: n_1 / n_questions, or 0.0 when n_questions is 0.
    """
    # Guard against division by zero when nothing was asked.
    if not n_questions:
        return 0.0
    return n_1 / n_questions

# Pipeline

In [None]:
def run_pipeline(text):
    """
    Run skill extraction, sentence extraction and classification on a text.

    Skills are extracted with the module-level `ner` pipeline, every sentence
    mentioning a skill is classified, and the candidate score is the
    percentage of classified sentences with a positive prediction. Progress
    and per-sentence results are printed.

    Args:
        text (str): The candidate's answer.

    Returns:
        int: The candidate score in percent (0-100); 0 when no sentence could
             be classified.
    """
    predictions = extract_skills(text, ner)
    # skill_preprocessing returns None on failure; treat that as "no skills".
    skill_list = skill_preprocessing(predictions) or []

    topic_sentences = sentence_extraction(text, skill_list)

    print("Classifying ...")

    n_positive = 0
    n_classified = 0
    score = 0

    for skill, sentences in topic_sentences.items():
        for sentence in sentences:
            prediction, probability = classify_this(sentence)
            n_classified += 1
            if prediction == 1:
                n_positive += 1
            print('Skill:', skill)
            print('Sentence:', sentence)
            print('Prediction:', prediction)
            print('Probability:', probability, '\n')

    # Divide by the number of classified sentences. Using the number of skills
    # mixes units with the sentence-based numerator: the score could exceed
    # 100% or be diluted by skills without any sentence.
    if n_classified:
        print('Calculating score ...')
        score = round(n_positive / n_classified * 100)
        print(f'n_topic_sentences = {n_classified}')
        print(f'Candidate score: {score}%')

    return score

# Plug And Play

In [None]:
# AssemblyAI key: never hard-code the key in the notebook. Read it from an
# environment variable or the Colab secrets store instead, and rotate any key
# that has been committed in plain text.
# aai.settings.api_key = os.environ["ASSEMBLYAI_API_KEY"]  # needs `import os`

# Location of the trained checkpoint on the mounted Drive; keep one line active.
# Sebastian's path
# model_path = Path('/content/drive/MyDrive/Screening/screen_bert_v0_2_64.pth')
# Paul's Path
model_path = Path('/content/drive/MyDrive/Projects/Screening/screen_bert_1fc_v0_3_8.pth')

# Settings read by ScreenBert / load_model (model_*) and load_tokenizer (tokenizer_*)
model_family = BertModel
model_name = 'bert-base-uncased'
tokenizer_family = BertTokenizer
tokenizer_name = 'bert-base-uncased'

# Initialize the global objects the pipeline functions rely on:
#   ner       -> used by run_pipeline for skill extraction
#   nlp       -> used by sentence_extraction for sentence splitting
#   model, device, tokenizer -> used by classify_this
# summariser = initialize_summarizer(model_name="facebook/bart-base") # if needed
ner = load_skill_extraction_pipeline()
nlp = spacy.load("en_core_web_sm")
model, device = load_model()
tokenizer = load_tokenizer()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/817 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

NER for skills extraction loaded loaded successflly.
Computation device: cpu
Loading model ...


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Model successfully loaded.
Loading tokenizer ...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizer successfully loaded.


In [None]:
# Examples: two sample candidate answers used to try out run_pipeline below.
# positive_answer describes hands-on experience with the topic.
positive_answer = '''
Yes, I'm familiar with machine learning algorithms.
They're commonly used in image recognition and classification tasks due to their ability to automatically learn hierarchical features from data.
For example, in a project I worked on during my studies, I used a CNN to classify different types of fruits from images.
'''
# negative_answer admits a lack of knowledge about the topic.
negative_answer = '''
I've heard the term machine learning before, but I must admit that I don't have extensive knowledge about them.
Could you please provide more context or explain what LLMs are?
I'm eager to learn more about them.
'''

In [None]:
# Run the full pipeline on the sample answer that shows experience.
run_pipeline(positive_answer)

Skills extracted successfully.
Skill set extraced successfully.
Entities related to skills extracted successfully.
Classifying ...
Raw scores for both classes (positive and negative) = tensor([[8.3977]])
Skill: cnn
Sentence: For example, in a project I worked on during my studies, I used a CNN to classify different types of fruits from images.

Prediction: 1
Probability: 0.9997746348381042 

Raw scores for both classes (positive and negative) = tensor([[7.4319]])
Skill: algorithms
Sentence: 
Yes, I'm familiar with machine learning algorithms.

Prediction: 1
Probability: 0.9994082450866699 

Raw scores for both classes (positive and negative) = tensor([[8.3865]])
Skill: image recognition
Sentence: They're commonly used in image recognition and classification tasks due to their ability to automatically learn hierarchical features from data.

Prediction: 1
Probability: 0.9997721314430237 

Raw scores for both classes (positive and negative) = tensor([[7.6417]])
Skill: machine learning
Sen

In [None]:
# Run the full pipeline on the sample answer that admits a lack of knowledge.
run_pipeline(negative_answer)

Skills extracted successfully.
Skill set extraced successfully.
Entities related to skills extracted successfully.
Classifying ...
Raw scores for both classes (positive and negative) = tensor([[-8.8566]])
Skill: machine learning
Sentence: 
I've heard the term machine learning before, but I must admit that I don't have extensive knowledge about them.

Prediction: 0
Probability: 0.00014241528697311878 

Calculating score ...
1
Candidate score: 0


In [None]:
# Sample answers that mix skills the candidate claims with skills the
# candidate says they lack; collected in example_lst for batch runs.
example_a = "Yes, I'm well-versed in machine learning and neural networks, which are crucial components of data science. However, I haven't delved into HTML extensively."
example_b = "Yes, I'm proficient in Java and C++, and I have experience with web development. While I'm comfortable with back-end development, I haven't focused extensively on front-end technologies. I'm eager to learn and contribute to projects that require front-end expertise."
example_c = "While I have some experience with SQL databases, I haven't had much exposure to cloud computing, cybersecurity, UX/UI design, or mobile app development. However, I'm eager to learn and adapt to new technologies as required for the position."
example_d = "While I have extensive experience in branding and print design, I haven't focused as much on motion graphics, user experience design, 3D modeling, or augmented reality. Nevertheless, I'm open to learning and integrating these aspects into my skill set to excel in a senior graphic design role."
example_lst = [example_a, example_b, example_c, example_d]

In [None]:
# Run the pipeline on every sample answer, printing a separator after each run.
for example in example_lst:
    run_pipeline(example)
    print('---')

Skills extracted successfully.
Skill set extraced successfully.
Entities related to skills extracted successfully.
Classifying ...
Raw scores for both classes (positive and negative) = tensor([[7.6658]])
Skill: data science
Sentence: Yes, I'm well-versed in machine learning and neural networks, which are crucial components of data science.
Prediction: 1
Probability: 0.999531626701355 

Raw scores for both classes (positive and negative) = tensor([[-9.0674]])
Skill: html
Sentence: However, I haven't delved into HTML extensively.
Prediction: 0
Probability: 0.00011535565136000514 

Raw scores for both classes (positive and negative) = tensor([[7.7226]])
Skill: neural networks
Sentence: Yes, I'm well-versed in machine learning and neural networks, which are crucial components of data science.
Prediction: 1
Probability: 0.9995574355125427 

Raw scores for both classes (positive and negative) = tensor([[7.7231]])
Skill: machine learning
Sentence: Yes, I'm well-versed in machine learning and 

In [None]:
# More examples: minimal inputs to compare how single words and the combined
# phrase are handled by the skill extraction step.
text1 = '''i know computer'''
text2 = '''i know science'''
text3 = '''i know computer science'''
# Each print shows whatever run_pipeline returns, after the pipeline's own output.
print(run_pipeline(text1))
print('---')
print(run_pipeline(text2))
print('---')
print(run_pipeline(text3))

Skills extracted successfully.
Skill set extraced successfully.
Entities related to skills extracted successfully.
Classifying ...
None
---
Skills extracted successfully.
Skill set extraced successfully.
Entities related to skills extracted successfully.
Classifying ...
None
---
Skills extracted successfully.
Skill set extraced successfully.
Entities related to skills extracted successfully.
Classifying ...
Raw scores for both classes (positive and negative) = tensor([[7.1724]])
Skill: computer science
Sentence: i know computer science
Prediction: 1
Probability: 0.9992331266403198 

Calculating score ...
n_topic_sentences = 1
Candidate score: 100%
None
