In [1]:
from google.colab import drive

In [2]:
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
pip install transformers datasets torch fastapi uvicorn



In [4]:
pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.32.1


In [5]:
#Data Preprocesssing:
import os
import re
from transformers import GPT2Tokenizer

def preprocess_data(input_file, output_file, tokenizer_name="gpt2"):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    with open(output_file, 'w', encoding='utf-8') as f:
        for line in lines:
            # Encodes the input text to tokenId
            token_ids = tokenizer.encode(line, add_special_tokens=False)
            # Convert token IDs back to tokens
            tokenized_line = tokenizer.convert_ids_to_tokens(token_ids)
            # Convert tokens to text and remove special tokens
            processed_line = " ".join(tokenized_line).replace('Ġ', '').replace('Ċ', '').replace('�', '').strip()

            processed_line = re.sub(r'[^a-zA-Z0-9\s,():%&]', '', processed_line)
            processed_line = re.sub(r'\bgu vi\b', 'guvi', processed_line)
            # Remove extra spaces
            processed_line = re.sub(r'\s+', ' ', processed_line)
            # Write the processed line to the output file
            f.write(processed_line + "\n")


# Make sure this path is correct
input_file = "/content/drive/MyDrive/guvi_data.txt"
output_file = "processed_guvidata.txt"
preprocess_data(input_file, output_file)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
#Fine tuning:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

# Load pre-trained model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Create dataset
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

train_dataset = load_dataset(output_file, tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=8,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Finetune the model
trainer.train()

# Save the finetuned model and tokenizer
model.save_pretrained("/content/drive/My Drive/finetuned_model271230")
tokenizer.save_pretrained("/content/drive/My Drive/finetuned_model271230")



Step,Training Loss
500,3.0308


('/content/drive/My Drive/finetuned_model271230/tokenizer_config.json',
 '/content/drive/My Drive/finetuned_model271230/special_tokens_map.json',
 '/content/drive/My Drive/finetuned_model271230/vocab.json',
 '/content/drive/My Drive/finetuned_model271230/merges.txt',
 '/content/drive/My Drive/finetuned_model271230/added_tokens.json')

In [11]:
#Testing:

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the finetuned model and tokenizer
model_name_or_path =  "/content/drive/My Drive/finetuned_model271230"
model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
token_name_or_path = "/content/drive/My Drive/finetuned_model271230"
tokenizer = GPT2Tokenizer.from_pretrained(token_name_or_path)


# Set the pad_token to eos_token if it's not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


# Define the text generation function
def generate_text(model, tokenizer, seed_text, max_length=100, temperature=1.0, num_return_sequences=1):
    # Tokenize the input text with padding
    inputs = tokenizer(seed_text, return_tensors='pt', padding=True, truncation=True)

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Generate text
    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            temperature=temperature,
            num_return_sequences=num_return_sequences,
            do_sample=True,
            top_k=50,
            top_p=0.50,
            pad_token_id=tokenizer.eos_token_id  # Ensure padding token is set to eos_token_id
        )

    # Decode the generated text
    generated_texts = []
    for i in range(num_return_sequences):
        generated_text = tokenizer.decode(output[i], skip_special_tokens=True)
        generated_texts.append(generated_text)

    return generated_texts

# Test the model
seed_text = input("Enter seed text: ")
generated_texts = generate_text(model, tokenizer, seed_text, max_length=100, temperature=1.0, num_return_sequences=2)

for i, text in enumerate(generated_texts):
    print(f"Generated Text {i + 1}:\n{text}\n")


Enter seed text: Zen class of GUVI
Generated Text 1:
Zen class of GUVI is a leading edge platform that revolution ized the way computers work and lives, providing high quality education to a growing segment of the population 

GU VI offers a comprehensive range of courses, including Python, Java, C, C, JavaScript, and more, providing a comprehensive understanding of programming concepts, concepts, and concepts 

Gu vi offers a comprehensive library of software development tools, including Python, Java, C, JavaScript, and more, ensuring that learners can

Generated Text 2:
Zen class of GUVI is a premium course offered by GU VI that is aligned with industry expectations and is aligned with industry trends 

GU VI is a leading edge online learning platform that provides comprehensive, affordable, and accessible learning options for all ages, regardless of their educational background or skill level 

GU VI is an I IT M & II M A incub ated company that focuses on providing quality, afforda

In [1]:
!pip install transformers

