In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!unzip "/content/drive/MyDrive/Articles.csv.zip"

Archive:  /content/drive/MyDrive/Articles.csv.zip
  inflating: Articles.csv            


In [4]:
def cleaning(s):
    s = str(s)  # Ensure the input is a string
    s = re.sub(r'\s\W', ' ', s)  # Replace whitespace followed by a non-word character with a space
    s = re.sub(r'\W,\s', ' ', s)  # Replace a non-word character followed by a comma and whitespace with a space
    s = re.sub(r"\d+", "", s)  # Remove all digits
    s = re.sub(r'\s+', ' ', s)  # Replace multiple whitespace characters with a single space
    s = re.sub(r'[!@#$_]', '', s)  # Remove specific special characters (!, @, #, $, _)
    s = s.replace("co", "")  # Remove occurrences of the substring "co"
    s = s.replace("https", "")  # Remove occurrences of the substring "https"
    s = s.replace("[\w*", " ")  # Replace the literal string "[\w*" with a space
    return s  # Return the cleaned string


In [5]:
df = pd.read_csv("/content/Articles.csv", encoding="ISO-8859-1")
df = df.dropna()

In [6]:
df.sample(5)

Unnamed: 0,Article,Date,Heading,NewsType
172,Singapore: Oil prices fell in Asia Monday afte...,6/8/2015,oil down after opec meeting eyes on iran,business
32,London: Oil prices fell further Thursday as th...,2/5/2015,oil prices extend,business
2254,LONDON: Pakistan´s Yasir Shah made a useful un...,7/16/2016,Yasir stars with bat despite Woakes heroi,sports
2248,strong>LORDS: Pakistan were dismissed for 339 ...,7/15/2016,England dismiss Pakistan for 339 in first inning,sports
267,KARACHI: A strike called by the Oil Tanker Own...,9/1/2015,petrol shortage in several parts of paki,business


In [7]:
text_data = open("/content/Articles.txt", 'w')
for idx, item in df.iterrows():
  article = cleaning(item["Article"])
  text_data.write(article)
text_data.close()

In [8]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [9]:
# Loading Dataset

def load_dataset(file_path, tokenizer, block_size = 128):
    dataset = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size,
    )
    return dataset

In [10]:
# Loading Data Collator

def load_data_collator(tokenizer, mlm = False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )
    return data_collator

In [11]:
def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
  train_dataset = load_dataset(train_file_path, tokenizer)
  data_collator = load_data_collator(tokenizer)

  tokenizer.save_pretrained(output_dir)

  model = GPT2LMHeadModel.from_pretrained(model_name)

  model.save_pretrained(output_dir)

  training_args = TrainingArguments(
          output_dir=output_dir,
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
      )

  trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
  )

  trainer.train()
  trainer.save_model()

In [12]:
# Setting parameters

train_file_path = "/content/Articles.txt"
model_name = 'gpt2'
output_dir = '/content/results'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 3.0
save_steps = 500

In [13]:
# Training
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Step,Training Loss
500,3.6909
1000,3.4251
1500,3.1911
2000,3.1443
2500,3.0392
3000,3.0155


In [14]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [15]:
# Testing

def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model


def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer


def generate_text(sequence, max_length):
    model_path = "/content/results"
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [16]:
sequence = input()
max_len = int(input())
generate_text(sequence, max_len)

oil
20


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


oil will help to reduce mmercial gas imports as well as support the enomy.The government has
