In [1]:
import csv
import os
import argparse

In [2]:
import torch
torch.cuda.empty_cache()

In [3]:




from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_cosine_with_hard_restarts_schedule_with_warmup
import warnings
warnings.filterwarnings('ignore')

class MyDataset(Dataset):
	def __init__(self, data_file_name, data_dir='.data/'):
		super().__init__()

		data_path = os.path.join(data_file_name)

		self.data_list = []
		self.end_of_text_token = " <|endoftext|> "
		
		with open(data_path) as csv_file:
			csv_reader = csv.reader(csv_file, delimiter='\t')
			
			for row in csv_reader:
				data_str = f"{row[0]}: {row[1]}{self.end_of_text_token}"
				self.data_list.append(data_str)
		
	def __len__(self):
		return len(self.data_list)

	def __getitem__(self, item):
		return self.data_list[item]

def get_data_loader(data_file_name):
	dataset = MyDataset(data_file_name)
	data_loader = DataLoader(dataset, batch_size=1, shuffle=True)
	return data_loader

def save_model(model, name):
	"""
	Summary:
		Saving model to the Disk
	Parameters:
		model: Trained model object
		name: Name of the model to be saved
	"""
	print ("Saving model to Disk")
	torch.save(model.state_dict(), f"{name}.pt")
	return

def load_models():
	"""
	Summary:
		Loading Pre-trained model
	"""
	print ('Loading/Downloading GPT-2 Model')
	tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
	model = GPT2LMHeadModel.from_pretrained('distilgpt2')
	return tokenizer, model






def train(epochs, data_loader, batch_size, tokenizer, model, device):	
	batch_counter = 0
	sum_loss = 0.0

	for epoch in range(epochs):
		print (f'Running {epoch+1} epoch')

		for idx, txt in enumerate(data_loader):
			txt = torch.tensor(tokenizer.encode(txt[0]))
			txt = txt.unsqueeze(0).to(device)
			outputs = model(txt, labels=txt)
			loss, _ = outputs[:2]
			loss.backward()
			sum_loss += loss.data

			if idx%batch_size==0:
				batch_counter += 1
				optimizer.step()
				scheduler.step()
				optimizer.zero_grad()
				model.zero_grad()

			if batch_counter == 10:
				print(f"Total Loss is {sum_loss}") #printed after every 10*batch_size
				batch_counter = 0
				sum_loss = 0.0

	return model


In [4]:
torch.cuda.memory_summary(device=None, abbreviated=False)



In [5]:

#! TRAIN

epoch = 3
warmup = 300
model_name = 'mymodel'
data_file = 'SMSSpamCollection'
batch = 32
# batch = 1
learning_rate = 3e-5
max_len = 200
# max_len = 50

#?
# torch.cuda.empty_cache()


BATCH_SIZE = batch
EPOCHS = epoch
LEARNING_RATE = learning_rate
WARMUP_STEPS = warmup
MAX_SEQ_LEN = max_len
MODEL_NAME = model_name
DATA_FILE = data_file

TOKENIZER, MODEL = load_models()
LOADER = get_data_loader(DATA_FILE)


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



model = MODEL.to(DEVICE)
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=-1)

model = train(EPOCHS, LOADER, BATCH_SIZE, TOKENIZER, MODEL, DEVICE)
save_model(model, MODEL_NAME)


Loading/Downloading GPT-2 Model
Running 1 epoch
Total Loss is 1873.1612548828125
Total Loss is 2096.28466796875
Total Loss is 2023.836669921875
Total Loss is 1943.64697265625
Total Loss is 1903.0921630859375
Total Loss is 1806.6973876953125
Total Loss is 1745.9862060546875
Total Loss is 1707.542724609375
Total Loss is 1627.1785888671875
Total Loss is 1597.7047119140625
Total Loss is 1552.2327880859375
Total Loss is 1482.6981201171875
Total Loss is 1468.332275390625
Total Loss is 1437.518310546875
Total Loss is 1433.548095703125
Total Loss is 1395.8909912109375
Total Loss is 1398.34814453125
Running 2 epoch
Total Loss is 1275.405517578125
Total Loss is 1378.5164794921875
Total Loss is 1380.6341552734375
Total Loss is 1378.30615234375
Total Loss is 1343.72607421875
Total Loss is 1329.816650390625
Total Loss is 1336.9169921875
Total Loss is 1339.1822509765625
Total Loss is 1327.9140625
Total Loss is 1354.6986083984375
Total Loss is 1324.5086669921875
Total Loss is 1322.94921875
Total Loss

In [6]:
torch.cuda.memory_summary(device=None, abbreviated=False)

