In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<url-encoded download URL>" entries; the
# download loop splits each entry on ':' and URL-decodes the second part.
# NOTE(review): the URL carries X-Goog-Date / X-Goog-Expires / X-Goog-Signature
# query parameters, so it is presumably a time-limited signed link - confirm it
# is still valid before relying on this cell.
DATA_SOURCE_MAPPING = 'amazon-fine-food:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4839828%2F8176199%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240422%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240422T121600Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D9f0d3e032c52e7d5029f3a84be9293be574eefe7588ef707b134cec7f4d31f3d990ebb4081d1e31a2c5bba7067c455498b4153249641efd85b9e90f2a459331513ddb996a08eda0261a19078bc80bd01d5f1f6b96640a3e13ddb7e9f2543e82d98dc36592ede02badf6fc097f9a472ff616223f81236321c39eeee845c5a65754fa380c54150603167015e9b162cd66e0bb5df03cba36dc47e747bbc6f430474d1ad255f4d3192f8035a2705d47bca4df81ed6c579da42eea21cba573ce0f186afaa7877003d37221ef619594888520c6054de130059fea7214d5952533c6748397e35f3ec3912a6f38d37ed21508cafb62544671e54d3b202faec5423bf9bd6'

# Root directory that each data source is extracted under.
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created alongside the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced anywhere in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into KAGGLE_INPUT_PATH/<directory> and
# unpack it there (.zip via ZipFile, anything else via tarfile).
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<url-encoded URL>". Split on the first colon
    # only so a colon inside the URL part cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The header may be absent (e.g. chunked responses); in that case
            # the size is reported as-is and the progress bar stays empty
            # instead of crashing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else None
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # tarfile.open() below reopens the file by *name*, so buffered
            # bytes must reach the disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; `as tarfile` would rebind
                # the imported `tarfile` module for the rest of the notebook.
                # NOTE(review): extractall() trusts member paths in the archive.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is an OSError subclass, so it has to be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
import numpy as np
import pandas as pd
import re

In [None]:
def cleaning(s):
    """Normalise one review and append the summarisation prompt.

    Steps, in order: cast to ``str``; drop whole http(s) URLs; replace a
    whitespace character followed by a non-word character with one space;
    replace "<non-word>, " sequences with one space; delete digit runs;
    collapse whitespace runs to a single space; delete the characters
    ``! @ # $ _``.

    Args:
        s: Raw review text (any object; it is converted with ``str``).

    Returns:
        The cleaned text followed by the literal suffix ``'\\nTL;DR:\\n'``.
    """
    s = str(s)
    # Remove links as a unit. The previous approach deleted the substrings
    # "co" and "https" everywhere, which mangled ordinary words
    # ("confection" -> "nfection", "coffee" -> "ffee").
    s = re.sub(r'https?://\S+', ' ', s)
    # Raw strings keep the regex escapes (\s, \W, \d) away from Python's own
    # string-escape handling.
    s = re.sub(r'\s\W', ' ', s)
    s = re.sub(r'\W,\s', ' ', s)
    s = re.sub(r'\d+', '', s)
    s = re.sub(r'\s+', ' ', s)
    s = re.sub(r'[!@#$_]', '', s)
    # Plain (non-regex) replacement of the literal four characters "[\w*".
    s = s.replace("[\\w*", " ")
    return s + '\nTL;DR:\n'

In [None]:
# Read the reviews, keep only rows that have both a review body and a summary,
# and replace each review body with its cleaned, prompt-suffixed version.
df = (
    pd.read_csv("/kaggle/input/amazon-fine-food/Reviews.csv", encoding="ISO-8859-1")
    .dropna(subset=['Text', 'Summary'])
    .assign(Text=lambda frame: frame['Text'].apply(cleaning))
)

In [None]:
!pip install transformers



In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [None]:
# Work on the first 30,000 cleaned reviews only.
subdf = df.iloc[:30000]
subdf

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a nfection that has been around a few ...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wide...
...,...,...,...,...,...,...,...,...,...,...
29995,29996,B000DZFMEQ,AU714FVNMGW4E,"Anita L. Burnham ""Anita B""",1,1,5,1253750400,Gluten Free Goodness,This is the best gf bread mix I have found by ...
29996,29997,B000DZFMEQ,A3CZKBRQYTW7W0,Denise Estep,1,1,5,1252713600,GLUTEN FREE BREAD,THIS BREAD MIX IS THE CLOSEST THING TO REGULAR...
29997,29998,B000DZFMEQ,A2LQTTTXBLFFAO,"lovereading ""Gin""",1,1,5,1250208000,Pamela's bread mix,Delicious and easy to make. An excellent bread...
29998,29999,B000DZFMEQ,A2A0UL2OFEIPH4,"Jo ""Jo""",1,1,5,1248048000,Great bread!,"I bought this mix for my daughter's boyfriend,..."


In [None]:
from sklearn.model_selection import train_test_split
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments


# Hold out a quarter of the sampled reviews for evaluation (fixed seed).
training_data, testing_data = train_test_split(
    subdf,
    test_size=0.25,
    random_state=42,
)

# Pretrained 'gpt2' tokenizer and language-model head used for fine-tuning.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')


In [None]:
from torch.utils.data import Dataset

class DatasetTokenizer(Dataset):
    """Dataset that encodes (review, summary) pairs for language-model training.

    Each item is produced by passing the row's ``Text`` and ``Summary`` values
    to ``tokenizer.encode_plus`` as a pair, padded/truncated to ``max_length``.

    Args:
        tokenizer: Object exposing ``pad_token``, ``eos_token``,
            ``add_special_tokens`` and ``encode_plus``. Its ``pad_token`` is
            set in place when it is missing.
        dataframe: pandas DataFrame with ``Text`` and ``Summary`` columns.
        max_length: Fixed sequence length requested from the tokenizer.
    """

    # Label value assigned to padded positions so they are excluded from the
    # loss (the ignore index documented for Hugging Face LM heads).
    IGNORE_INDEX = -100

    def __init__(self, tokenizer, dataframe, max_length):
        # Reuse EOS as padding when no pad token exists; only if there is no
        # EOS token either is a dedicated '[PAD]' token registered.
        if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
        if tokenizer.pad_token is None: tokenizer.add_special_tokens({'pad_token': '[PAD]'})

        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def ReturnStatement(self, input_ids, attention_mask, labels=None):
        """Assemble the per-item dict.

        Args:
            input_ids: Token-id tensor for the item.
            attention_mask: Mask tensor for the item.
            labels: Optional label tensor. When omitted, ``input_ids`` itself
                is used as the labels (the historical behaviour).

        Returns:
            Dict with the keys ``input_ids``, ``attention_mask`` and ``labels``.
        """
        if labels is None:
            labels = input_ids
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

    def flattenvectors(self, lst, inputs):
        """Flatten the requested tokenizer outputs to 1-D.

        Args:
            lst: Iterable of field names. ``'input_ids'`` and ``'labels'``
                both read ``inputs['input_ids']``; any other name is read from
                ``inputs`` and treated as the attention mask.
            inputs: Mapping of field name to tensor.

        Returns:
            Tuple ``(input_ids, attention_mask, labels)``; an entry is ``None``
            when its field was not requested.
        """
        input_ids = None
        attention_mask = None
        labels = None
        for item in lst:
            if item == 'input_ids' or item == 'labels':
                # The tokenizer output has no 'labels' entry; labels are
                # derived from the input ids.
                item = 'input_ids'
                input_ids = inputs[item].flatten()
                labels = inputs[item].flatten()
            else:
                attention_mask = inputs[item].flatten()
        return input_ids, attention_mask, labels

    def _encode(self, review, summary):
        """Encode the review/summary pair, padded and truncated to ``max_length``."""
        inputs = self.tokenizer.encode_plus(
            review, summary, add_special_tokens=True,
            max_length=self.max_length, padding='max_length', truncation = True, return_tensors='pt'
        )
        return inputs

    def __getitem__(self, index):
        """Return the encoded item at positional ``index``.

        Labels equal ``input_ids`` except at padded positions
        (``attention_mask == 0``), which are set to ``IGNORE_INDEX`` so the
        padding does not contribute to the training loss.
        """
        row = self.data.iloc[index]
        review = str(row['Text'])
        summary = str(row['Summary'])
        inputs = self._encode(review, summary)
        input_ids, attention_mask, labels = self.flattenvectors(['input_ids', 'attention_mask', 'labels'], inputs)
        # flatten() can return a view that shares storage with input_ids, so
        # clone before masking to leave the input ids untouched.
        labels = labels.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX
        return self.ReturnStatement(input_ids, attention_mask, labels)

In [None]:
from transformers import Trainer, TrainingArguments

def set_training_arguments():
    """Return the ``TrainingArguments`` used for this fine-tuning run.

    Outputs go to ``content/results`` and logs to ``content/logs`` (both
    relative paths); training runs for 3 epochs with per-device batch sizes
    of 4 (train) and 8 (eval), 500 warmup steps, weight decay 0.01 and an
    evaluation strategy of ``'epoch'``.
    """
    return TrainingArguments(
        output_dir='content/results',
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='content/logs',
        evaluation_strategy='epoch',
    )

def create_trainer(model, train_dataset, test_dataset, training_args):
    """Build a ``Trainer`` for ``model``.

    Args:
        model: Model to train.
        train_dataset: Dataset passed as ``train_dataset``.
        test_dataset: Dataset passed as ``eval_dataset``.
        training_args: Arguments object passed as ``args``.

    Returns:
        The constructed ``Trainer``.
    """
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )
    return Trainer(**trainer_kwargs)

def train_model(trainer, resume_from_checkpoint="/kaggle/working/content/results/checkpoint-7000"):
    """Run ``trainer.train``, resuming from a checkpoint when one is available.

    Args:
        trainer: Object exposing ``train(resume_from_checkpoint=...)``.
        resume_from_checkpoint: Checkpoint directory to resume from. If it is
            a path that does not exist as a directory, a notice is printed and
            training starts from scratch instead of failing on the missing
            checkpoint. ``None`` always starts from scratch; non-path values
            are forwarded unchanged.
    """
    # Local import: this cell does not import os itself.
    import os

    is_path = isinstance(resume_from_checkpoint, (str, os.PathLike))
    if is_path and not os.path.isdir(resume_from_checkpoint):
        print(f"Checkpoint {resume_from_checkpoint!r} not found; training from scratch.")
        resume_from_checkpoint = None
    trainer.train(resume_from_checkpoint=resume_from_checkpoint)

def train_model_with_parameters(model, train_dataset, test_dataset):
    """Build the training arguments and trainer for ``model``, then train.

    Args:
        model: Model handed to ``create_trainer``.
        train_dataset: Training dataset.
        test_dataset: Evaluation dataset.
    """
    train_model(
        create_trainer(model, train_dataset, test_dataset, set_training_arguments())
    )

# Wrap both splits in DatasetTokenizer (sequences of 512 tokens) and start
# the training run.
train_dataset, test_dataset = (
    DatasetTokenizer(tokenizer, split_frame, max_length=512)
    for split_frame in (training_data, testing_data)
)
train_model_with_parameters(model, train_dataset, test_dataset)

In [None]:
# Persist the model with save_pretrained into the relative directory "model".
model.save_pretrained("model")