In [2]:
from datasets import load_dataset

raw_datasets = load_dataset("ag_news")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [3]:
filtered_datasets = raw_datasets.filter(lambda example: example["label"] == 2)
filtered_datasets = filtered_datasets.remove_columns("label")

Filter:   0%|          | 0/120000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [4]:
from transformers import AutoModelForCausalLM,AutoTokenizer
import torch 

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token=(
    tokenizer.eos_token
)
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
def tokenize_function(batch):
    return tokenizer(batch["text"],truncation = True)

tokenized_datasets = filtered_datasets.map(tokenize_function,batched = True,remove_columns = ["text"])

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1900 [00:00<?, ? examples/s]

In [6]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 30000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1900
    })
})

In [7]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer,mlm = False)

In [8]:
samples = [tokenized_datasets["train"][i] for i in range(3)]

for sample in samples:
    print(f"input_ids shape: { len(sample['input_ids'])}")

input_ids shape: 37
input_ids shape: 55
input_ids shape: 51


In [9]:
out = data_collator(samples)
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([3, 55])
attention_mask shape: torch.Size([3, 55])
labels shape: torch.Size([3, 55])


In [10]:

from transformers import TrainingArguments

training_args = TrainingArguments(
    "business-news-generator",
    push_to_hub = True,
    per_device_train_batch_size=8,
    weight_decay = 0.1,
    lr_scheduler_type = "cosine",
    learning_rate = 5e-4,
    num_train_epochs = 2,
    eval_strategy = "steps",
    eval_steps = 200,
    logging_steps = 200,
    
)

In [11]:

from transformers import Trainer

trainer = Trainer(
    model = model,
    tokenizer = tokenizer,
    args = training_args,
    data_collator = data_collator,
    train_dataset = tokenized_datasets["train"].select(range(5000)),
    eval_dataset = tokenized_datasets["test"]
)

In [12]:
trainer.train()

Step,Training Loss,Validation Loss
200,3.6549,3.595136
400,3.3091,3.483602
600,3.1019,3.363487
800,2.1456,3.465546
1000,1.9671,3.410426
1200,1.9107,3.403467


TrainOutput(global_step=1250, training_loss=2.6515588439941404, metrics={'train_runtime': 141.2697, 'train_samples_per_second': 70.787, 'train_steps_per_second': 8.848, 'total_flos': 467451445248000.0, 'train_loss': 2.6515588439941404, 'epoch': 2.0})

In [None]:
trainer.push_to_hub()

In [2]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("business-news-generator/checkpoint-1250")
tokenizer = AutoTokenizer.from_pretrained("business-news-generator/checkpoint-1250")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at business-news-generator/checkpoint-1250 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from huggingface_hub import HfApi, HfFolder

# Set the repository name. It will be created under your username on Hugging Face Hub.
repo_name = "business_news_generator"

# Upload the model and tokenizer
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/rahul004/business_news_generator/commit/83fd8d523ae459fab6d62f5f950c7da90d11c5f0', commit_message='Upload tokenizer', commit_description='', oid='83fd8d523ae459fab6d62f5f950c7da90d11c5f0', pr_url=None, pr_revision=None, pr_num=None)

In [4]:
from transformers import pipeline
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

pipe = pipeline("text-generation",model = "rahul004/business_news_generator",device = device )

pipe.tokenizer.pad_token_id = 50256

print(pipe("Q1",pad_token_id=tokenizer.eos_token_id)[0]["generated_text"])
print(pipe("WALL",pad_token_id=tokenizer.eos_token_id)[0]["generated_text"])
print(pipe("Google",pad_token_id=tokenizer.eos_token_id)[0]["generated_text"])

config.json:   0%|          | 0.00/956 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/585 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

Q1 profit almost doubles on record sales WASHINGTON -- U.S. wine and spirits group H2 earnings rose 8 percent in 2003 as demand for more robust international wines and spirits boosted purchases for beer and imported spirits.  quot;In... less
WALLEN THE FUTURES NEWS: Australia #39;s two leading banks have agreed to pay \$75 million back a \$23 million grant to settle allegations of misconduct in their investment banks.  #39;The settlement with the
Google faces scrutiny in China IPO BANGKOK: Google Inc. faces civil scrutiny this month in China, with a key international antitrust court ordering it to go public on the Internet.  The Securities and Exchange Commission has reportedly ordered that Web search engine


In [5]:
print(pipe("Q1",pad_token_id=tokenizer.eos_token_id)[0]["generated_text"])

Q1 profit slips, shares slide BHP Billiton, the world #39;s biggest miner, said its first-half profit slipped nearly 30 percent after a massive backtax bill lifted pressure on the firm, sending shares down more than 20 percent


In [6]:
print(pipe("WALL",pad_token_id=tokenizer.eos_token_id)[0]["generated_text"])

WALL STOCKS LOS ANGELES, Aug. 19 : Stent maker Boston Scientific Corporation (BSX.N: Quote, Profile, Research) was founded by the top US maker of Botox chips, makers of gels and


In [7]:
print(pipe("Google",pad_token_id=tokenizer.eos_token_id)[0]["generated_text"])

Google #39;s going gold right Now that we can all #39;t seem to agree on pace or price, but Google #39;s going gold right now. It #39;s going to be a decade or more, with
