In [None]:
# Install the notebook's dependencies.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q datasets
%pip install -q fastapi
%pip install -q torchserve
%pip install -q evaluate
%pip install -q rouge_score

In [None]:
# Upgrade accelerate and transformers in the running kernel's environment.
%pip install -q -U accelerate
%pip install -q -U transformers

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration, AutoTokenizer
from datasets import load_dataset, Dataset
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline
from huggingface_hub import notebook_login
import torch
import evaluate
import os
import numpy as np

In [None]:
# Log in to the Hugging Face Hub from the notebook; the training cell is
# configured with push_to_hub=True and calls trainer.push_to_hub().
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
def lire_fichier(fichier):
  """Return the whole content of a text file as one string.

  The file is decoded explicitly as UTF-8 so the result does not depend on
  the default encoding of the machine running the notebook.

  Args:
    fichier: Path of the text file to read.

  Returns:
    The complete file content.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the file is not valid UTF-8.
  """
  with open(fichier, "r", encoding="utf-8") as f:
    return f.read()

def read_dataset(articles, resumes):
  """Load article/summary file pairs as a list of records.

  Args:
    articles: Sequence of paths to the article text files.
    resumes: Sequence of paths to the summary files, in the same order as
      ``articles`` (``resumes[i]`` is the summary of ``articles[i]``).

  Returns:
    A list with one ``{"summary": ..., "text": ...}`` dict per pair.
    Every pair is read; pairing stops at the shorter of the two sequences.
  """
  # The list is returned only once all pairs have been read. Returning from
  # inside the loop would yield a single record.
  return [
      {
          "summary": lire_fichier(resume),
          "text": lire_fichier(article),
      }
      for article, resume in zip(articles, resumes)
  ]

def read_dataset_bis(articles, resumes):
  """Load article/summary file pairs in column-oriented form.

  Args:
    articles: Sequence of paths to the article text files.
    resumes: Sequence of paths to the summary files, in the same order as
      ``articles`` (``resumes[i]`` is the summary of ``articles[i]``).

  Returns:
    A dict ``{"text": [...], "summary": [...]}`` whose two lists are aligned
    by index. Every pair is read; pairing stops at the shorter of the two
    sequences, so the number of files is not fixed.
  """
  columns = {"text": [], "summary": []}
  for article, resume in zip(articles, resumes):
    columns["summary"].append(lire_fichier(resume))
    columns["text"].append(lire_fichier(article))

  return columns

In [None]:
def preprocess_function(examples):
  """Tokenize a batch of examples for seq2seq training.

  Each entry of ``examples["text"]`` is prepended with the global ``prefix``
  and tokenized with a 1024-token limit; each entry of ``examples["summary"]``
  is tokenized as a target with a 128-token limit.

  Args:
    examples: Batch mapping with a "text" list and a "summary" list.

  Returns:
    The tokenizer output for the inputs, with the target token ids stored
    under the "labels" key.
  """
  prefixed_docs = []
  for doc in examples["text"]:
    prefixed_docs.append(prefix + doc)

  encoded = tokenizer(prefixed_docs, max_length=1024, truncation=True)
  encoded_targets = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

  encoded["labels"] = encoded_targets["input_ids"]
  return encoded

In [None]:
# Model parameters
checkpoint = "t5-small"  # identifier passed to every from_pretrained(...) call in this notebook
prefix = "summarize: "  # prepended to each input text (training and inference)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Load the dataset

# Ten article/summary pairs: article_01.txt ... article_10.txt and the
# matching resume_01.txt ... resume_10.txt.
articles = [f"/content/articles/article_{i:02d}.txt" for i in range(1, 11)]
resumes = [f"/content/resumes/resume_{i:02d}.txt" for i in range(1, 11)]

raw_columns = read_dataset_bis(articles, resumes)
full_dataset = Dataset.from_dict(raw_columns)
# Fixed seed so the train/test split is the same on every run.
datasets = full_dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Preprocess the dataset: apply preprocess_function to each split in batches
tokenized_dataset = datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 8
    })
    test: Dataset({
        features: ['text', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
})


In [None]:
# Data collator handed to the Seq2SeqTrainer below.
# NOTE(review): `model` receives `checkpoint`, which is the string "t5-small",
# not a model instance — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
# Evaluation: load the ROUGE metric used by compute_metrics
rouge = evaluate.load("rouge")

In [None]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        A dict with the values returned by ``rouge.compute`` plus ``gen_len``
        (mean number of non-padding tokens per prediction), each rounded to
        4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks positions to ignore and is not a valid token id, so it must be
    # replaced by the pad token before decoding. Predictions can carry -100
    # padding as well as labels (the official run_summarization example guards
    # against this), so both arrays are cleaned.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Length is measured on the cleaned predictions so padding is never counted.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Load the seq2seq model for the configured checkpoint; it is fine-tuned by the trainer below
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Training configuration: evaluate once per epoch, generate summaries during
# evaluation so compute_metrics can score them, and push the result to the Hub.
training_args = Seq2SeqTrainingArguments(
    output_dir="HAR_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=15,
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; fall back to full precision
    # otherwise so the cell also runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune, then upload the trained model to the Hub.
trainer.train()
trainer.push_to_hub()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.957941,0.312,0.0738,0.2003,0.2003,75.0
2,No log,2.885518,0.312,0.0738,0.2003,0.2003,75.0
3,No log,2.838079,0.3376,0.0808,0.205,0.205,77.5
4,No log,2.792886,0.3383,0.0903,0.2018,0.2018,74.5
5,No log,2.738932,0.3383,0.0903,0.2018,0.2018,74.5
6,No log,2.664049,0.3383,0.0903,0.2018,0.2018,74.5
7,No log,2.633348,0.3422,0.0916,0.1961,0.1961,72.0
8,No log,2.611012,0.3383,0.0903,0.2018,0.2018,74.5
9,No log,2.595095,0.3529,0.1071,0.2263,0.2263,86.0
10,No log,2.582576,0.3529,0.1071,0.2263,0.2263,86.0


Non-default generation parameters: {'max_length': 200, 'min_length': 30, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

events.out.tfevents.1708002561.35a56fc0d5cb.732.0:   0%|          | 0.00/7.72k [00:00<?, ?B/s]

Upload 8 LFS files:   0%|          | 0/8 [00:00<?, ?it/s]

events.out.tfevents.1708003121.35a56fc0d5cb.732.1:   0%|          | 0.00/13.4k [00:00<?, ?B/s]

events.out.tfevents.1708004348.35a56fc0d5cb.17046.1:   0%|          | 0.00/13.4k [00:00<?, ?B/s]

events.out.tfevents.1708003698.35a56fc0d5cb.17046.0:   0%|          | 0.00/13.4k [00:00<?, ?B/s]

events.out.tfevents.1708004387.35a56fc0d5cb.17046.2:   0%|          | 0.00/13.6k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Yuss68/HAR_model/commit/f99e4640ea20e4f6057fdf9aaaf1794d517e4392', commit_message='End of training', commit_description='', oid='f99e4640ea20e4f6057fdf9aaaf1794d517e4392', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Build the inference input: the task prefix followed by the content of an
# article file that is not in the training list above. The bare `text`
# expression displays the resulting string.
text = prefix + lire_fichier("/content/articles/article_2_01.txt")
text

'summarize: C#: A Versatile Language for Modern Development\nIntroduction\nC# (pronounced "C sharp") is a powerful and versatile programming language developed by Microsoft in the early 2000s. Positioned as part of the .NET framework, C# has gained widespread popularity for its simplicity, strong typing, and extensive features. It has become a cornerstone for developing a variety of applications across different domains.\n\nObject-Oriented Foundation\nC# is deeply rooted in the principles of object-oriented programming (OOP). It supports encapsulation, inheritance, and polymorphism, providing developers with a robust foundation for building modular and maintainable code. The OOP paradigm in C# enhances code organization and reuse, making it well-suited for large-scale applications.\n\nSyntax and Readability\nC# boasts a clean and readable syntax, drawing inspiration from languages like C++ and Java. Its syntax is designed to be intuitive and developer-friendly, reducing the likelihood 

In [None]:
# Wrap the fine-tuned model and its tokenizer in a summarization pipeline and
# display the first result for `text`.
# NOTE(review): the recorded run warned that this input is 818 tokens while the
# tokenizer's maximum is 512 — confirm whether the input should be truncated.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
summarizer(text)[0]

Token indices sequence length is longer than the specified maximum sequence length for this model (818 > 512). Running this sequence through the model will result in indexing errors


{'summary_text': 'Object-Oriented Foundation C# is rooted in the principles of object-oriented programming (OOP) it supports encapsulation, inheritance, and polymorphism, providing developers with a robust foundation for building modular and maintainable code . syntax and Readability C# boasts a clean and readable syntax, drawing inspiration from languages like C++ and Java .'}