In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import tqdm as tqdm

# Importing Datasets || Adding Keywords

In [2]:
from datasets import load_dataset

ds = load_dataset("athena-ml/gpt4_short_stories_with_tokens")

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'tokens'],
        num_rows: 2119719
    })
    test: Dataset({
        features: ['text', 'tokens'],
        num_rows: 21990
    })
})

In [4]:
train_data = ds["train"]
test_data = ds["test"]

In [5]:
train_texts = train_data.select(range(500000))['text']
len(train_texts)

500000

In [None]:
from keybert import KeyBERT
kw_model = KeyBERT()

RuntimeError: Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [9]:
import csv
from tqdm import tqdm

keywords = []
for text in tqdm(train_texts, desc="Extracting Keywords"):
    list1 = kw_model.extract_keywords(text)
    keywords.append(list1)

Extracting Keywords: 100%|██████████| 500000/500000 [3:03:52<00:00, 45.32it/s]  


In [10]:
with open("extracted_keywords2.csv", "w", newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["Text", "Keywords"]) 
    for text, kw in zip(train_texts, keywords):
        keyword_str = ", ".join([k[0] for k in kw])
        writer.writerow([text, keyword_str])

# text preprocessing 

In [2]:
import pandas as pd

df = pd.read_csv("/kaggle/input/keywords/extracted_keywords2.csv")
df.head()

Unnamed: 0,Text,Keywords
0,"One day, a little girl named Lily found a need...","needle, sewed, lily, sew, shirt"
1,"Once upon a time, there was a little car named...","beep, beeped, fuel, car, drove"
2,"One day, a little fish named Fin was swimming ...","crab, fish, swam, fin, sun"
3,"Once upon a time, in a land full of trees, the...","cherries, cherry, trees, tree, grew"
4,"Once upon a time, there was a little girl name...","spider, cobweb, lily, cat, princess"


In [3]:
from tqdm import tqdm
tqdm.pandas()

s = df['Keywords'].progress_apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))

100%|██████████| 500000/500000 [00:00<00:00, 1695032.63it/s]


In [4]:
s2 = s.apply(lambda x: x.replace(",",""))

In [5]:
df["Keywords"]=s2
df.head()

Unnamed: 0,Text,Keywords
0,"One day, a little girl named Lily found a need...",needle sewed lily sew shirt
1,"Once upon a time, there was a little car named...",beep beeped fuel car drove
2,"One day, a little fish named Fin was swimming ...",crab fish swam fin sun
3,"Once upon a time, in a land full of trees, the...",cherries cherry trees tree grew
4,"Once upon a time, there was a little girl name...",spider cobweb lily cat princess


In [None]:
import pandas as pd
from transformers import BertTokenizer
from tqdm import tqdm

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def is_valid_text(value):
    return isinstance(value, str) or isinstance(value, list)

valid_rows = []

for index, row in tqdm(df.iterrows(), total=len(df)):
    if is_valid_text(row['Text']) and is_valid_text(row['Keywords']):
        valid_rows.append(row)
    else:
        print(f"Invalid row at index {index}: {row}") 

df_valid = pd.DataFrame(valid_rows)

  8%|▊         | 39015/500000 [00:01<00:17, 26555.50it/s]

Invalid row at index 33810: Text        NaN
Keywords    nan
Name: 33810, dtype: object
Invalid row at index 33813: Text        NaN
Keywords    nan
Name: 33813, dtype: object
Invalid row at index 33814: Text        NaN
Keywords    nan
Name: 33814, dtype: object
Invalid row at index 33815: Text        NaN
Keywords    nan
Name: 33815, dtype: object
Invalid row at index 33816: Text        NaN
Keywords    nan
Name: 33816, dtype: object
Invalid row at index 33818: Text        NaN
Keywords    nan
Name: 33818, dtype: object
Invalid row at index 33820: Text        NaN
Keywords    nan
Name: 33820, dtype: object
Invalid row at index 33824: Text        NaN
Keywords    nan
Name: 33824, dtype: object
Invalid row at index 33828: Text        NaN
Keywords    nan
Name: 33828, dtype: object
Invalid row at index 33830: Text        NaN
Keywords    nan
Name: 33830, dtype: object
Invalid row at index 33831: Text        NaN
Keywords    nan
Name: 33831, dtype: object
Invalid row at index 33832: Text        NaN

 20%|█▉        | 97899/500000 [00:03<00:15, 26098.91it/s]

Invalid row at index 95142: Text        NaN
Keywords    nan
Name: 95142, dtype: object
Invalid row at index 95147: Text        NaN
Keywords    nan
Name: 95147, dtype: object
Invalid row at index 95148: Text        NaN
Keywords    nan
Name: 95148, dtype: object
Invalid row at index 95150: Text        NaN
Keywords    nan
Name: 95150, dtype: object
Invalid row at index 95153: Text        NaN
Keywords    nan
Name: 95153, dtype: object


 23%|██▎       | 114295/500000 [00:04<00:16, 23276.81it/s]

Invalid row at index 111176: Text        NaN
Keywords    nan
Name: 111176, dtype: object
Invalid row at index 111177: Text        NaN
Keywords    nan
Name: 111177, dtype: object
Invalid row at index 111179: Text        NaN
Keywords    nan
Name: 111179, dtype: object
Invalid row at index 111180: Text        NaN
Keywords    nan
Name: 111180, dtype: object
Invalid row at index 111181: Text        NaN
Keywords    nan
Name: 111181, dtype: object
Invalid row at index 111186: Text        NaN
Keywords    nan
Name: 111186, dtype: object
Invalid row at index 111187: Text        NaN
Keywords    nan
Name: 111187, dtype: object
Invalid row at index 111190: Text        NaN
Keywords    nan
Name: 111190, dtype: object
Invalid row at index 111191: Text        NaN
Keywords    nan
Name: 111191, dtype: object
Invalid row at index 111193: Text        NaN
Keywords    nan
Name: 111193, dtype: object
Invalid row at index 111195: Text        NaN
Keywords    nan
Name: 111195, dtype: object


 34%|███▍      | 170344/500000 [00:07<00:13, 25121.84it/s]

Invalid row at index 170284: Text        NaN
Keywords    nan
Name: 170284, dtype: object
Invalid row at index 170285: Text        NaN
Keywords    nan
Name: 170285, dtype: object
Invalid row at index 170288: Text        NaN
Keywords    nan
Name: 170288, dtype: object
Invalid row at index 170289: Text        NaN
Keywords    nan
Name: 170289, dtype: object
Invalid row at index 170291: Text        NaN
Keywords    nan
Name: 170291, dtype: object
Invalid row at index 170293: Text        NaN
Keywords    nan
Name: 170293, dtype: object
Invalid row at index 170294: Text        NaN
Keywords    nan
Name: 170294, dtype: object
Invalid row at index 170296: Text        NaN
Keywords    nan
Name: 170296, dtype: object
Invalid row at index 170300: Text        NaN
Keywords    nan
Name: 170300, dtype: object
Invalid row at index 170301: Text        NaN
Keywords    nan
Name: 170301, dtype: object
Invalid row at index 170303: Text        NaN
Keywords    nan
Name: 170303, dtype: object


 40%|████      | 201860/500000 [00:09<00:12, 24482.56it/s]

Invalid row at index 197996: Text        NaN
Keywords    nan
Name: 197996, dtype: object
Invalid row at index 198001: Text        NaN
Keywords    nan
Name: 198001, dtype: object
Invalid row at index 198003: Text        NaN
Keywords    nan
Name: 198003, dtype: object
Invalid row at index 198004: Text        NaN
Keywords    nan
Name: 198004, dtype: object
Invalid row at index 198005: Text        NaN
Keywords    nan
Name: 198005, dtype: object
Invalid row at index 198008: Text        NaN
Keywords    nan
Name: 198008, dtype: object


 44%|████▍     | 219621/500000 [00:09<00:11, 25281.86it/s]

Invalid row at index 215999: Text        NaN
Keywords    nan
Name: 215999, dtype: object


 72%|███████▏  | 360927/500000 [00:16<00:05, 25794.71it/s]

Invalid row at index 357531: Text        NaN
Keywords    nan
Name: 357531, dtype: object
Invalid row at index 357538: Text        NaN
Keywords    nan
Name: 357538, dtype: object
Invalid row at index 357539: Text        NaN
Keywords    nan
Name: 357539, dtype: object
Invalid row at index 357540: Text        NaN
Keywords    nan
Name: 357540, dtype: object
Invalid row at index 357545: Text        NaN
Keywords    nan
Name: 357545, dtype: object
Invalid row at index 357547: Text        NaN
Keywords    nan
Name: 357547, dtype: object
Invalid row at index 357549: Text        NaN
Keywords    nan
Name: 357549, dtype: object
Invalid row at index 357554: Text        NaN
Keywords    nan
Name: 357554, dtype: object


 97%|█████████▋| 483688/500000 [00:22<00:00, 22815.54it/s]

Invalid row at index 479920: Text        NaN
Keywords    nan
Name: 479920, dtype: object


100%|██████████| 500000/500000 [00:23<00:00, 21448.46it/s]


In [8]:
df_valid.head()

Unnamed: 0,Text,Keywords
0,"One day, a little girl named Lily found a need...",needle sewed lily sew shirt
1,"Once upon a time, there was a little car named...",beep beeped fuel car drove
2,"One day, a little fish named Fin was swimming ...",crab fish swam fin sun
3,"Once upon a time, in a land full of trees, the...",cherries cherry trees tree grew
4,"Once upon a time, there was a little girl name...",spider cobweb lily cat princess


# model Training

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast
import random

In [None]:
# EMBED_SIZE = 256
# HIDDEN_SIZE = 512
# BATCH_SIZE = 16
# NUM_EPOCHS = 10
# MAX_LEN = 64
# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "facebook/bart-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

2025-04-22 12:21:56.945145: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745324517.133409      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745324517.197785      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    inputs = examples['Keywords']
    targets = examples['Text']
    
    tokenized_inputs = []
    tokenized_labels = []
    attention_masks = []  
    for inp, tgt in tqdm(zip(inputs, targets), total=len(inputs), desc="Tokenizing batch"):
        tokenized_inp = tokenizer(inp, max_length=32, truncation=True, padding="max_length")
        tokenized_tgt = tokenizer(tgt, max_length=256, truncation=True, padding="max_length")
        
        tokenized_inputs.append(tokenized_inp["input_ids"])
        attention_masks.append(tokenized_inp["attention_mask"])  
        tokenized_labels.append(tokenized_tgt["input_ids"])
    
    return {
        "input_ids": tokenized_inputs,
        "attention_mask": attention_masks,
        "labels": tokenized_labels
    }


In [12]:
tokenized_train_dataset = preprocess_function(df_valid[:450000])
tokenized_eval_dataset = preprocess_function(df_valid[450000:])

Tokenizing batch: 100%|██████████| 450000/450000 [06:49<00:00, 1100.04it/s]
Tokenizing batch: 100%|██████████| 49945/49945 [00:48<00:00, 1030.64it/s]


In [48]:
type(tokenized_train_dataset)

dict

In [13]:
from datasets import Dataset

train_dataset = Dataset.from_dict(tokenized_train_dataset)

In [14]:
eval_dataset = Dataset.from_dict(tokenized_eval_dataset)

In [15]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 450000
})

In [52]:
eval_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 49945
})

In [None]:
from transformers import Trainer, TrainingArguments
from tqdm import tqdm

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    report_to="none", 
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm

batch_size = 8  
train_loader = DataLoader(
    tokenized_train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,       
    pin_memory=True      
)

for batch in tqdm(train_loader, desc="Training Batches", leave=False):
    pass

hello


# Uploading To Hugging_Face

* Vasusurii/Shortstories-with-keywords-dataset

In [20]:
from datasets import load_dataset, Dataset
from huggingface_hub import login

login(token="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")

train_dataset.push_to_hub("Vasusurii/Shortstories-with-keywords-dataset")

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/225 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/225 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Vasusurii/Shortstories-with-keywords-dataset/commit/849e6bfb2ae1c5f66e472b2a65379fb590defb95', commit_message='Upload dataset', commit_description='', oid='849e6bfb2ae1c5f66e472b2a65379fb590defb95', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Vasusurii/Shortstories-with-keywords-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Vasusurii/Shortstories-with-keywords-dataset'), pr_revision=None, pr_num=None)

* Vasusurii/shortstories_eval_dataset

In [1]:
from datasets import load_dataset, Dataset
from huggingface_hub import login

login(token="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")

eval_dataset.push_to_hub("Vasusurii/shortstories_eval_dataset")

NameError: name 'eval_dataset' is not defined

In [None]:
import random

word_list = ["hello", "bye"]

prompt = " ".join(random.sample(word_list, min(len(word_list), 2)))
prompt = prompt + " " + " ".join(random.sample(word_list, min(len(word_list), 2)))

print(prompt)


hello byevasu hello bye
