In [1]:
!pip install -U transformers
!pip install datasets
!pip install -U accelerate

Collecting transformers
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
Collecting huggingface-hub<1.0,>=0.19.3
  Downloading huggingface_hub-0.20.2-py3-none-any.whl (330 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.3/330.3 KB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting filelock
  Downloading filelock-3.13.1-py3-none-any.whl (11 kB)
Collecting regex!=2019.12.17
  Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m774.0/774.0 KB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
Collecting safetensors>=0.3.1
  Downloading safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━

In [2]:
import numpy as np
import tempfile
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the DailyDialog dataset
dataset = load_dataset('daily_dialog')

Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.61M/3.61M [00:00<00:00, 5.69MB/s]
Downloading data: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 334k/334k [00:00<00:00, 1.36MB/s]
Downloading data: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 331k/331k [00:00<00:00, 677kB/s]
Generating train split: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11118/11118 [00:00<00:00, 105018.18 examples/s]
Generating validation split: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 99308.72 examples/s]
Generating test split: 100%|████████████████████████████████

In [4]:
raw_train_dataset = dataset["train"]
raw_train_dataset[0]

{'dialog': ['Say , Jim , how about going for a few beers after dinner ? ',
  ' You know that is tempting but is really not good for our fitness . ',
  ' What do you mean ? It will help us to relax . ',
  " Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? ",
  " I guess you are right.But what shall we do ? I don't feel like sitting at home . ",
  ' I suggest a walk over to the gym where we can play singsong and meet some of our friends . ',
  " That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . ",
  ' Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . ',
  " Good.Let ' s go now . ",
  ' All right . '],
 'act': [3, 4, 2, 2, 2, 3, 4, 1, 3, 4],
 'emotion': [0, 0, 0, 0, 0, 0, 4, 4, 4, 4]}

In [5]:
# Concatenate all utterances within a dialogue and map to 'dialog' key
def concatenate_utterances(example):
    example['dialog'] = " ".join(example['dialog'])
    return example

# Apply the function to all examples in the dataset
dataset = dataset.map(concatenate_utterances)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11118/11118 [00:01<00:00, 9646.18 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 12063.38 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 11943.46 examples/s]


In [8]:
dataset["train"][0]

{'dialog': "Say , Jim , how about going for a few beers after dinner ?   You know that is tempting but is really not good for our fitness .   What do you mean ? It will help us to relax .   Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ?   I guess you are right.But what shall we do ? I don't feel like sitting at home .   I suggest a walk over to the gym where we can play singsong and meet some of our friends .   That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them .   Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too .   Good.Let ' s go now .   All right . ",
 'act': [3, 4, 2, 2, 2, 3, 4, 1, 3, 4],
 'emotion': [0, 0, 0, 0, 0, 0, 4, 4, 4, 4]}

# make my own dataset

In [None]:
# https://huggingface.co/learn/nlp-course/chapter5/5

## scrapping the datasets

In [9]:
pip install tabula-py

Collecting tabula-py
  Downloading tabula_py-2.9.0-py3-none-any.whl (12.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting distro
  Downloading distro-1.9.0-py3-none-any.whl (20 kB)
Installing collected packages: distro, tabula-py
Successfully installed distro-1.9.0 tabula-py-2.9.0
Note: you may need to restart the kernel to use updated packages.


In [10]:
import tabula as tb

In [None]:
path = "data/0000400206.pdf"
data = tb.read_pdf(path, area = (300, 0, 600, 800), pages = Ƈ')

In [11]:
pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.10.3-py3-none-any.whl (48 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.0/49.0 KB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pypdfium2>=4.18.0
  Downloading pypdfium2-4.26.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
[?25hCollecting pdfminer.six==20221105
  Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting Pillow>=9.1
  Downloading pillow-10.2.0-cp310-cp310-manylinux_2_28_x86_64.whl (4.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m48.9 MB/s[0m et

In [53]:
import pdfplumber
path = "data/0000400206.pdf"
# path = "data/DL320_2002.pdf"
with pdfplumber.open(path) as pdf:
    first_page = pdf.pages
    print(first_page[0].chars[0])

{'matrix': (11, 0, 0, 11, 39.5272, 770.0113), 'fontname': 'LGNJCG+Arial-BoldMT', 'adv': 0.722, 'upright': True, 'x0': 39.5272, 'y0': 767.6903, 'x1': 47.4692, 'y1': 778.6903, 'width': 7.942, 'height': 11.0, 'size': 11.0, 'mcid': None, 'tag': None, 'object_type': 'char', 'page_number': 1, 'ncs': 'DeviceRGB', 'text': 'N', 'stroking_color': (0, 0, 0), 'stroking_pattern': None, 'non_stroking_color': (0, 0, 0), 'non_stroking_pattern': None, 'top': 63.30970000000002, 'bottom': 74.30970000000002, 'doctop': 63.30970000000002}


In [55]:
"".join([e["text"] for e in first_page[0].chars])

'N.º 20 29 de janeiro de 2021 Pág. 4Diário da República, 1.ª série PRESIDÊNCIA DO CONSELHO DE MINISTROSDecreto-Lei n.º 9/2021de 29 de janeiroSumário: Aprova o Regime Jurídico das Contraordenações Económicas.No interesse da maximização do bem -estar, da segurança e da proteção dos direitos dos con-sumidores, para o regular funcionamento dos mercados e a competitividade da economia e para a promoção da concorrência, é exigida a intervenção reguladora do Estado.Com o objetivo de promover e defender esses valores, tem vindo a assistir -se a uma disper-são legislativa resultante da multiplicação de legislação enquadradora e reguladora do acesso e do exercício de atividades económicas, a qual, cominando com coima a violação das respetivas disposições, constitui -se como uma fonte de direito contraordenacional em matéria económica.Não obstante o enquadramento comum fixado no Regime Geral do Ilícito de Mera Ordena-ção Social, constante do Decreto -Lei n.º 433/82, de 27 de outubro, alterado pel

In [64]:
import pdfplumber
path = "data/0000400206.pdf"
path = "data/DL320_2002.pdf"
with pdfplumber.open(path) as pdf:
    pages = pdf.pages

    text = pages[0].extract_text_simple(x_tolerance=3, y_tolerance=3)
    text1 = pages[0].extract_text(x_tolerance=3, y_tolerance=3, layout=False, x_density=7.25, y_density=13)


In [65]:
text1

'8160 DIÁRIODAREPÚBLICA—ISÉRIE-A N.o300—28deDezembrode2002\nCAPÍTULOIV Maria Manuela Dias Ferreira Leite—Carlos Manuel\nTavaresdaSilva.\nDisposiçõesfinaisetransitórias\nPromulgadoem20deDezembrode2002.\nArtigo49.o\nPublique-se.\nDisposiçõesrevogadas\nOPresidentedaRepública,JORGESAMPAIO.\n1 — Revoga-se expressamente o Decreto-Lei\nn.o 433/91, de 7 de Novembro, alterado pelos Decre- Referendadoem20deDezembrode2002.\ntos-Leis n.os 175/94, de 27 de Junho, e 230/98, de 22\nO Primeiro-Ministro, José Manuel Durão Barroso.\nde Julho, o Decreto-Lei n.o 58/99, de 2 de Março, e\na alínea p) do n.o 1.o da Portaria n.o 95/94, de 9 de\nFevereiro.\n2—Asremissõesfeitasparapreceitosrevogadospor\nMINISTÉRIODASCIDADES,ORDENAMENTO\neste decreto-lei devem entender-se como substituídas\nDOTERRITÓRIOEAMBIENTE\nporremissõesfeitasparaascorrespondentesdisposições\ndestediploma.\nDecreto-Lein.o320/2002\nArtigo50.o\nde28deDezembro\nDisposiçõestransitórias\nNas últimas décadas, a opção por edifícios de habi-\n1—As 

![image.png](attachment:e9b61e21-1774-47c4-99d7-0d22d065b967.png)

https://github.com/jsvine/pdfplumber/issues/244