In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [2]:
!pip install transformers



In [3]:
import os
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, Trainer, TrainingArguments

In [4]:
data_path = "data.csv"
df = pd.read_csv(data_path)

In [5]:
dataset = Dataset.from_pandas(df)

In [6]:
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [7]:
def preprocess_function(examples):
    inputs = ["extract skills: " + desc for desc in examples["Job Description"]]
    targets = [skill for skill in examples["Skills Required"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)

Map:   0%|          | 0/503 [00:00<?, ? examples/s]



In [9]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [10]:
training_args = TrainingArguments(
    output_dir="./fine_tuned_t5_small",
    overwrite_output_dir=True,
    report_to="none",
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=5,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
)



In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

In [12]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
50,3.7635
100,3.1692
150,2.9653
200,2.564
250,2.5676
300,2.3567
350,2.4055
400,2.1338
450,2.1978
500,2.1624


TrainOutput(global_step=1010, training_loss=2.3253708187896427, metrics={'train_runtime': 118.9697, 'train_samples_per_second': 42.28, 'train_steps_per_second': 8.49, 'total_flos': 62663254081536.0, 'train_loss': 2.3253708187896427, 'epoch': 10.0})

In [24]:
trainer.save_model("./fine_tuned_t5_small")
tokenizer.save_pretrained("fine_tuned_t5_small/")
print("Model fine-tuning complete and saved to './fine_tuned_t5_small'")

Model fine-tuning complete and saved to './fine_tuned_t5_small'


In [14]:
def generate_skills(job_description, tokenizer, model):
    input_text = "extract skills: " + job_description
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)

    input_ids = input_ids.to(model.device)

    outputs = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    generated_skills = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_skills

In [15]:
job_desc_example = (
    "Proven experience in React Native and Tailwind CSS Solid Knowledge of state management patterns, such as Redux Strong proficiency in JavaScript and ES6+ Experience with Node.js and Express for server-side development Familiarity with MongoDB or similar NoSQL databases Expertise in RESTful APIs and integration of third-party libraries Solid understanding of the mobile development life cycle Ability to produce well-documented, clean, and efficient code Strong problem-solving and communication skills Bachelors degree in Computer Science or a related field (preferred)"
)

In [16]:
predicted_skills = generate_skills(job_desc_example, tokenizer, model)
print("Predicted Skills for the Job Description:", predicted_skills)

Predicted Skills for the Job Description: [RESTful APIs, RESTful APIs, RESTful APIs, RESTful APIs]


In [25]:
extraction_model_dir = "fine_tuned_t5_small/"
tokenizer_extraction = T5Tokenizer.from_pretrained(extraction_model_dir)
model_extraction = T5ForConditionalGeneration.from_pretrained(extraction_model_dir)
model_extraction.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [17]:
#!zip -r dataset.zip /content/fine_tuned_t5_small

In [18]:
#from google.colab import files
#files.download('dataset.zip')