In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; later cells read the raw corpus from
# this mount and save the fine-tuned model under it.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install transformers datasets torch fastapi uvicorn



In [None]:
# Uninstall conflicting packages
# (-y answers pip's confirmation prompt so the cell can run unattended.)
!pip uninstall pyarrow -y
!pip uninstall requests -y

# Install specific versions
# NOTE(review): the recorded output of this cell shows pip reporting that
# datasets 2.20.0 requires pyarrow>=15.0.0 and requests>=2.32.2, so both pins
# below conflict with the `datasets` package installed earlier -- confirm that
# these exact versions are still required.
!pip install pyarrow==14.0.1
!pip install requests==2.31.0

# Reinstall dependencies
# presumably these are the packages the pins above are meant to satisfy -- TODO confirm
!pip install cudf-cu12
!pip install ibis-framework

# Verify installation
# NOTE(review): if either module was already imported in this session, the
# printed version may be the previously loaded one; a runtime restart may be
# needed before the new versions are reported -- verify.
import pyarrow
import requests

print(f"pyarrow version: {pyarrow.__version__}")
print(f"requests version: {requests.__version__}")


Found existing installation: pyarrow 16.1.0
Uninstalling pyarrow-16.1.0:
  Successfully uninstalled pyarrow-16.1.0
Found existing installation: requests 2.32.3
Uninstalling requests-2.32.3:
  Successfully uninstalled requests-2.32.3
Collecting pyarrow==14.0.1
  Downloading pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl (38.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.0/38.0 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 1.9.0 requires requests>=2.27.1, which is not installed.
datasets 2.20.0 requires requests>=2.32.2, which is not installed.
tensorflow-datasets 4.9.6 requires requests>=2.19.0, which is not installed.
datasets 2.20.0 requires pyarrow>=15.0.0, but you have pyarrow 14.0.1 which is incompatible.[0m[31m
[0mSucces

In [None]:
pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/309.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m174.1/309.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.31.0


In [None]:
import os
import re
from transformers import GPT2Tokenizer

def preprocess_data(input_file, output_file, tokenizer_name="gpt2"):
    """Normalize a text corpus line by line and write the result to a file.

    Every input line is stripped, lower-cased, has each non-word character
    replaced by a space, is passed through the tokenizer (encode followed by
    decode), and has runs of whitespace collapsed to a single space. Exactly
    one output line is written per input line, so blank lines stay blank.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to create or overwrite.
        tokenizer_name: Name or path handed to ``GPT2Tokenizer.from_pretrained``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)

    # Read everything before opening the output so that passing the same path
    # for input and output keeps working.
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    with open(output_file, 'w', encoding='utf-8') as f:
        for line in lines:
            # Strip surrounding whitespace and lower-case the text.
            text = line.strip().lower()

            # Replace every non-word character (punctuation, symbols) with a space.
            text = re.sub(r'\W', ' ', text)

            # Tokenize, then let the tokenizer rebuild the text. The previous
            # implementation joined the individual tokens with spaces, which
            # put a space between the sub-word pieces of a single word (the
            # generated sample in this notebook shows "vern acular", "tam il").
            # decode() concatenates the pieces back into whole words.
            token_ids = tokenizer.encode(text, add_special_tokens=False)
            text = tokenizer.decode(token_ids, clean_up_tokenization_spaces=False)

            # Kept from the original: write the brand name as one word when the
            # text spells it "gu vi".
            text = re.sub(r'\bgu vi\b', 'guvi', text)

            # Collapse whitespace runs and drop any leading/trailing space left
            # behind by the character replacement above.
            text = re.sub(r'\s+', ' ', text).strip()

            f.write(text + "\n")

# Run the preprocessing: the raw corpus is read from the mounted Drive and the
# cleaned copy is written to a relative path in the working directory.
input_file = "/content/drive/MyDrive/guvicompany_data.txt"  # must point at an existing file
output_file = "processed_company_data.txt"
preprocess_data(input_file=input_file, output_file=output_file)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

# Checkpoint to start from; the weights and the tokenizer are both loaded
# from this same name.
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Create dataset
def load_dataset(file_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` for the text file at ``file_path``.

    Args:
        file_path: Path of the text file, forwarded as ``file_path``.
        tokenizer: Tokenizer forwarded to ``TextDataset``.
        block_size: Forwarded to ``TextDataset`` as ``block_size`` (default 128).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_kwargs)

# Training examples built from the preprocessed corpus file.
train_dataset = load_dataset(output_file, tokenizer)

# Batch collator; mlm=False turns masked-language-modelling off.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Settings for the run: epochs, batch size, checkpointing and logging cadence.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=200,
)

# Wire model, settings, data and collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Run the fine-tuning.
trainer.train()

# Write the fine-tuned model and its tokenizer into the same Drive directory.
fine_tuned_dir = "/content/drive/My Drive/fine_tuned_model12345"
model.save_pretrained(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)



Step,Training Loss
200,3.5493
400,2.4149
600,1.794
800,1.4021
1000,1.0782
1200,0.9006
1400,0.7689
1600,0.6679
1800,0.6084


('/content/drive/My Drive/fine_tuned_model12345/tokenizer_config.json',
 '/content/drive/My Drive/fine_tuned_model12345/special_tokens_map.json',
 '/content/drive/My Drive/fine_tuned_model12345/vocab.json',
 '/content/drive/My Drive/fine_tuned_model12345/merges.txt',
 '/content/drive/My Drive/fine_tuned_model12345/added_tokens.json')

In [None]:
#!pip install transformers

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Directory the fine-tuned model and tokenizer were saved to.
model_name_or_path = "/content/drive/My Drive/fine_tuned_model12345"
token_name_or_path = "/content/drive/My Drive/fine_tuned_model12345"

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer first; reuse the end-of-sequence token for padding when the
# tokenizer has no pad token of its own.
tokenizer = GPT2Tokenizer.from_pretrained(token_name_or_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the fine-tuned weights and move them to the selected device.
model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
model.to(device)

# Define the text generation function
def generate_text(model, tokenizer, seed_text, max_length=100, temperature=1.0,
                  num_return_sequences=1, top_k=50, top_p=0.01):
    """Sample continuations of ``seed_text`` from ``model``.

    Args:
        model: Model providing ``generate`` and ``parameters``.
        tokenizer: Tokenizer used to encode the prompt and decode the output;
            its ``eos_token_id`` is used as the padding id during generation.
        seed_text: Prompt string, or a list of prompt strings.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        temperature: Forwarded to ``model.generate`` as ``temperature``.
        num_return_sequences: Number of sequences requested per prompt.
        top_k: Forwarded to ``model.generate`` as ``top_k``. Defaults to 50,
            the value that used to be hard-coded.
        top_p: Forwarded to ``model.generate`` as ``top_p``. Defaults to 0.01,
            the value that used to be hard-coded.

    Returns:
        A list with one decoded string (special tokens skipped) for every
        sequence returned by ``model.generate``.
    """
    # Place the inputs on the device of the model that was passed in, instead
    # of depending on a notebook-level ``device`` variable that may not match.
    model_device = next(model.parameters()).device

    # Tokenize the prompt(s) with padding and truncation enabled.
    inputs = tokenizer(seed_text, return_tensors='pt', padding=True, truncation=True)
    input_ids = inputs['input_ids'].to(model_device)
    attention_mask = inputs['attention_mask'].to(model_device)

    # Gradients are not needed for generation.
    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            temperature=temperature,
            num_return_sequences=num_return_sequences,
            do_sample=True,
            top_k=top_k,
            top_p=top_p,
            pad_token_id=tokenizer.eos_token_id,  # pad with the end-of-sequence id
        )

    # Decode every returned row. Iterating over range(num_return_sequences)
    # dropped results when more than one prompt was supplied.
    return [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in output]

# Ask for a prompt, generate one continuation of at most 50 tokens in total,
# and print each result under a 1-based heading.
seed_text = input("Enter seed text: ")
generated_texts = generate_text(
    model,
    tokenizer,
    seed_text,
    max_length=50,
    temperature=1e-6,
    num_return_sequences=1,
)

for i, text in enumerate(generated_texts):
    print(f"Generated Text {i + 1}:\n{text}\n")


Enter seed text: what are the courses available in guvi?
Generated Text 1:
what are the courses available in guvi?

guvi offers a range of courses in various fields including programming data science machine learning data science and more guvi offers courses in several different vern acular languages including tam il tel ugu hind

