<a href="https://www.kaggle.com/code/subram/formality-style-transfer-finetuned?scriptVersionId=208471382" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
file_path = '/kaggle/input/gyafcdatae/Entertainment_Music/train/informal'   # Replace 'your_text_file.txt' with the path to your text file
with open(file_path, 'r') as file:
    X_train = file.readlines()

for i  in range(len(X_train)):
  X_train[i] = X_train[i].split("\n")[0]
  X_train[i] = X_train[i].lower()

In [None]:
print(len(X_train))
X_train[30000:30010]

In [None]:
# Read text from a file
file_path = '/kaggle/input/gyafcdatae/Entertainment_Music/train/formal'
with open(file_path, 'r') as file:
    y_train = file.readlines()

for i  in range(len(y_train)):
  y_train[i] = y_train[i].split("\n")[0]
  y_train[i] = y_train[i].lower()

In [None]:
print(len(y_train))
y_train[30000:30010]

In [None]:
# Read text from a file
file_path = '/kaggle/input/gyafcdatae/Entertainment_Music/test/informal'
with open(file_path, 'r') as file:
    X_test = file.readlines()

for i  in range(len(X_test)):
  X_test[i] = X_test[i].split("\n")[0]
  X_test[i] = X_test[i].lower()

print(len(X_test))
X_test[:10]

In [None]:
# Read text from a file
file_path = '/kaggle/input/gyafcdatae/Entertainment_Music/test/formal.ref2'
with open(file_path, 'r') as file:
    y_test = file.readlines()

for i  in range(len(y_test)):
  y_test[i] = y_test[i].split("\n")[0]
  y_test[i] = y_test[i].lower()

print(len(y_test))
y_test[:10]


In [None]:
from torch.utils.data import Dataset
import torch

class ChatData(Dataset):
    def __init__(self, tokenizer, X_train, y_train, subset_range=(0, None), max_length=64):
        """
        Initializes the dataset for fine-tuning GPT-2.
        
        Args:
        - tokenizer: The tokenizer to use for encoding data.
        - X_train: List of informal sentences.
        - y_train: List of corresponding formal sentences.
        - subset_range: Tuple specifying the range of data to use (start_idx, end_idx).
        - max_length: Maximum length for tokenization (default=64).
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.X = X_train
        self.y = y_train
        
        # Apply subset range if specified
        start, end = subset_range
        end = end if end is not None else len(self.X)
        self.X = self.X[start:end]
        self.y = self.y[start:end]
        
        # Format the data with prefixes and suffixes
        formatted_data = []
        for idx, (i, target) in enumerate(zip(self.X, self.y)):
            try:
                formatted_data.append(
                    f"<|startoftext|>\n[Informal]: {i}\n [Formal]: {target} <|endoftext|>"
                )
            except Exception as e:
                print(f"Error processing index {idx}: {e}")
        
        self.X = formatted_data
        print(f"Sample input after formatting: {self.X[0]}")  # Debugging: print first example

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """
        Tokenizes the sample on-the-fly to avoid memory issues.
        """
        text = self.X[idx]
        encoded = self.tokenizer(
            text, 
            max_length=self.max_length, 
            truncation=True, 
            padding="max_length", 
            return_tensors="pt"
        )
        input_ids = encoded['input_ids'].squeeze(0)
        attention_mask = encoded['attention_mask'].squeeze(0)
        return input_ids, attention_mask


In [None]:
!pip install --upgrade jupyterlab jupyterlab_widgets

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.optim import Adam
from torch.utils.data import DataLoader
import tqdm
import torch

# Training function
def train(chatData, model, tokenizer, optim, device):
    epochs = 20
    for epoch in tqdm.tqdm(range(epochs), desc="Training"):
        model.train()  # Ensure model is in training mode
        total_loss = 0
        for X, a in chatData:
            X = X.to(device)
            a = a.to(device)
            
            optim.zero_grad()
            outputs = model(X, attention_mask=a, labels=X)
            loss = outputs.loss
            loss.backward()
            optim.step()
            total_loss += loss.item()
        
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(chatData):.4f}")

        # Save tokenizer and model configurations
        tokenizer.save_pretrained("tokenizer_configs")
        model.save_pretrained("model_configs")
        # torch.save(model.state_dict(), f"model_state_epoch_{epoch + 1}.pt")
        torch.save(model.state_dict(), "model_state.pt")
        
        # Run inference for debugging
        print("Example output:", infer('Damn,u look fine!', tokenizer, model, device))

# Inference function
def infer(inp, tokenizer, model, device):
    inp = f"<|startoftext|>\n[Informal]: {inp}\n [Formal]: "
    encoded = tokenizer(
        inp, 
        return_tensors="pt", 
        max_length=30, 
        truncation=True, 
        padding="max_length"
    )
    X = encoded["input_ids"].to(device)
    a = encoded["attention_mask"].to(device)
    
    output = model.generate(
        X, attention_mask=a, max_new_tokens=10)
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return output_text

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer setup
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
tokenizer.add_special_tokens({
    "pad_token": "<pad>",
    "bos_token": "<|startoftext|>",
    "eos_token": "<|endoftext|>"
})
tokenizer.add_tokens(["[Informal]:", "[Formal]:"])

# Model setup
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
model.resize_token_embeddings(len(tokenizer))  # Adjust for new tokens
model = model.to(device)

# Dataset and DataLoader
chatData = ChatData(tokenizer, X_train, y_train, subset_range=(0, len(X_train)), max_length=30)
chatDataLoader = DataLoader(chatData, batch_size=16, shuffle=True)

# Optimizer
optim = Adam(model.parameters(), lr=1e-4)

# Training
print("Starting training...")
train(chatDataLoader, model, tokenizer, optim, device)

In [4]:
import os
import shutil

# Ensure the destination directory exists
kaggle_config_dir = os.path.expanduser('~/.config/kaggle/')
os.makedirs(kaggle_config_dir, exist_ok=True)

# Move the kaggle.json file
shutil.copy('/kaggle/input/kaggles/kaggle.json', os.path.join(kaggle_config_dir, 'kaggle.json'))

print("kaggle.json file moved successfully!")


kaggle.json file moved successfully!


In [5]:
!kaggle kernels output subram/notebook6583ea48bc -p /kaggle/working

Output file downloaded to /kaggle/working/model_configs/config.json
Output file downloaded to /kaggle/working/model_configs/generation_config.json
Output file downloaded to /kaggle/working/model_configs/model.safetensors
Output file downloaded to /kaggle/working/model_state.pt
Output file downloaded to /kaggle/working/tokenizer_configs/added_tokens.json
Output file downloaded to /kaggle/working/tokenizer_configs/merges.txt
Output file downloaded to /kaggle/working/tokenizer_configs/special_tokens_map.json
Output file downloaded to /kaggle/working/tokenizer_configs/tokenizer_config.json
Output file downloaded to /kaggle/working/tokenizer_configs/vocab.json
Kernel log downloaded to /kaggle/working/notebook6583ea48bc.log 


In [6]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("/kaggle/working/tokenizer_configs")
model = GPT2LMHeadModel.from_pretrained("/kaggle/working/model_configs")

In [7]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.3.1


In [8]:
import torch

def infer(inp, tokenizer, model, device):
    inp = f"<|startoftext|>\n[Informal]: {inp}\n [Formal]: "
    encoded = tokenizer(
        inp, 
        return_tensors="pt", 
        max_length=30, 
        truncation=True, 
        padding="max_length"
    )
    X = encoded["input_ids"].to(device)
    a = encoded["attention_mask"].to(device)
    
    output = model.generate(
        X, attention_mask=a, max_new_tokens=10)
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return output_text

# Ensure proper device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)  # Move model to the correct device


In [17]:
inp = input()
print(infer(inp, tokenizer, model, device))
# print(infer(inp))

 U are rude dude


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



 [Informal]:  U are rude dude
  [Formal]:   you are rude, my friend. 
