In [None]:
import pandas as pd
import numpy as np
import re

## Data Preprocessing

In [None]:
def clean(s):
    """Normalise one article into plain text for the training corpus.

    The input is coerced to ``str`` and then, in order:

    1. a whitespace character followed by a non-word character is replaced
       by a single space;
    2. a non-word character followed by the literal text ``,]s`` is replaced
       by a space;
    3. runs of digits are removed;
    4. runs of whitespace are collapsed to one space;
    5. the characters ``! @ # $ _`` are removed;
    6. the stand-alone tokens ``https`` and ``co`` (URL residue) are removed.

    Args:
        s: The raw value to clean; any object is accepted and stringified.

    Returns:
        The cleaned string.
    """
    s = str(s)
    # Raw strings keep the regex escapes (\s, \W, \d) out of Python's own
    # string-escape processing, which warns about them otherwise.
    s = re.sub(r'\s\W', ' ', s)
    s = re.sub(r'\W,]s', ' ', s)
    s = re.sub(r'\d+', '', s)
    s = re.sub(r'\s+', ' ', s)
    s = re.sub(r'[!@#$_]', '', s)
    # Remove "https" / "co" only as whole tokens. A plain substring replace of
    # "co" also ate the letters inside ordinary words ("company" -> "mpany").
    s = re.sub(r'\b(?:https|co)\b', '', s)
    # NOTE(review): this is a literal (non-regex) replacement of the four
    # characters `[\w*`; it looks like an unfinished regex - confirm intent.
    s = s.replace("[\\w*", " ")
    return s
    

In [None]:
# Read the news-article CSV, decoding it as ISO-8859-1.
df = pd.read_csv('/kaggle/input/news-articles/Articles.csv', encoding='ISO-8859-1')
# Discard every row that has a missing value in any column.
df = df.dropna()

# Write the cleaned articles into a single plain-text training corpus.
# The context manager closes the file even if cleaning or writing raises,
# and the explicit encoding makes the output independent of the locale.
# NOTE(review): the training cell reads '/kaggle/working/Articles.txt'; this
# relative path only matches when the working directory is /kaggle/working.
with open('Articles.txt', 'w', encoding='utf-8') as text_data:
    for article in df["Article"]:
        # A newline after each article keeps the last word of one article
        # from fusing with the first word of the next.
        text_data.write(clean(article) + "\n")

## Model Training

In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [None]:
def load_dataset(file_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` over a plain-text file.

    Args:
        file_path: Path of the text file to load.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Block size handed to ``TextDataset`` (default 128).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)

def load_data_collator(tokenizer, mlm=False):
    """Create the batch collator used during language-model training.

    Args:
        tokenizer: Tokenizer handed to ``DataCollatorForLanguageModeling``.
        mlm: Forwarded as the collator's ``mlm`` flag (default ``False``).

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

def train(train_file_path, model_name, output_dir,
          overwrite_output_dir, per_device_train_batch_size, num_train_epochs, save_steps):
    """Fine-tune a GPT-2 checkpoint on a plain-text corpus and save the result.

    Args:
        train_file_path: Path of the text file used as training data.
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer, the model and the
            trainer's output.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``; controls how often
            the trainer writes a checkpoint.

    Returns:
        None. The trained model is written to ``output_dir``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # The tokenizer is saved next to the model so that inference can load
    # both from the same directory.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)
    # This writes the pretrained (not yet fine-tuned) weights; the fine-tuned
    # model is saved by trainer.save_model() at the end.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Previously accepted by train() but never passed on, so the caller's
        # value was silently ignored.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [None]:
# Settings for the fine-tuning run; each name matches a parameter of train().
train_file_path = '/kaggle/working/Articles.txt'  # plain-text training corpus
model_name = 'gpt2'                               # checkpoint to start from
output_dir = '/kaggle/working'                    # where tokenizer/model are saved
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 5.0
save_steps = 500

In [None]:
# Run the fine-tuning with the settings defined in the configuration cell.
# Keyword arguments keep each value tied to its parameter name in train().
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

## Model Inference

In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [None]:
def load_model(model_path):
    """Load a ``GPT2LMHeadModel`` from ``model_path``.

    Args:
        model_path: Name or directory passed to ``from_pretrained``.

    Returns:
        The loaded ``GPT2LMHeadModel``.
    """
    return GPT2LMHeadModel.from_pretrained(model_path)

def load_tokenizer(tokenizer_path):
    """Load a ``GPT2Tokenizer`` from ``tokenizer_path``.

    Args:
        tokenizer_path: Name or directory passed to ``from_pretrained``.

    Returns:
        The loaded ``GPT2Tokenizer``.
    """
    return GPT2Tokenizer.from_pretrained(tokenizer_path)

def generate_text(sequence, max_length, model_path='/kaggle/working'):
    """Sample a continuation of ``sequence`` from a saved model and print it.

    Args:
        sequence: Prompt to continue; it is formatted into a string before
            tokenisation.
        max_length: Passed to ``model.generate`` as ``max_length``.
        model_path: Directory holding both the saved model and tokenizer.
            Defaults to the directory the training step writes to.

    Returns:
        None. The decoded text is printed.
    """
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, which is passed on explicitly because the pad token is set to the
    # EOS token below.
    encoded = tokenizer(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))


    

    
    

In [42]:
# Prompt and length limit for a sample generation from the fine-tuned model.
sequence = "Japanese yen today"
max_len = 100
generate_text(sequence=sequence, max_length=max_len)

Japanese yen today is also down to $1.099 on the S&P 500.

For every dollar lost a year ago, that dollar gain could rise further. As the U.S. market shrinks, Japanese companies, which use the yen to secure investments, could gain a foothold. That kind of money has more potential to boost corporate profits, since the yen is no longer a currency denominated in U.S. dollars.

However, the yen still has plenty
