In [1]:
!pip install transformers datasets torch fastapi uvicorn



# data preprocessing

In [10]:
import os
from transformers import GPT2Tokenizer

def preprocess_data(input_file, output_file, tokenizer_name="gpt2"):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    with open(output_file, 'w', encoding='utf-8') as f:
        for line in lines:
            tokenized_line = tokenizer.tokenize(line)
            f.write(" ".join(tokenized_line) + "\n")

input_file = "guvi_dataset.txt"  # Your company-specific data file
output_file = "processed_guvi_dataset.txt"
preprocess_data(input_file, output_file)

In [3]:
!pip install accelerate -U



**hugging face login**

In [11]:
# Install the transformers library if not already installed
!pip install transformers

# Import the necessary library
from huggingface_hub import login

# Login using the token
login(token="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")



**GPT 2**

In [12]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# Specify the model name or identifier
model_name = "gpt2"  # You can use other models like "gpt2-medium", "gpt2-large", "gpt2-xl", "gpt-neo-125M", etc.

# Load the pre-trained model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create a text generation pipeline
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generate text
prompt = "morning"
generated_text = text_generator(prompt, max_length=100, do_sample=True, temperature=0.7)

print(generated_text[0]['generated_text'])

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


morning, and have to sit down, especially after the rain has forced the roof to drop down to the ground. And we don't want to be the only ones having to move.

"I'm just worried about the house and the children, and the house that's right outside the house," she said. "I don't know what to do. I just want to be there."

Dennis said he's worried about the kids, too.

"I think I


In [6]:
!pip install accelerate -U



**finetuning the pretrained model**

In [13]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

# Load pre-trained model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Create dataset
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

train_dataset = load_dataset(output_file, tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss
500,1.7626


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json')

In [9]:
#5fe099e1bcce438b223c4af5ebfd775052022ed8

In [None]:
!pip show accelerate

**TESTING THE MODEL**

In [23]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the fine-tuned model and tokenizer
model_name_or_path = "/content/fine_tuned_model"
model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)

# Set the padding token to the EOS token
tokenizer.pad_token = tokenizer.eos_token

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the text generation function
def generate_text(model, tokenizer, seed_text, max_length=100, temperature=1.0, num_return_sequences=1):
    # Tokenize the input text
    inputs = tokenizer.encode_plus(seed_text, return_tensors='pt', padding=True, truncation=True)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Generate text
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            temperature=temperature,
            num_return_sequences=num_return_sequences,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
            attention_mask=attention_mask
        )

    # Decode the generated text
    generated_texts = []
    for i in range(num_return_sequences):
        generated_text = tokenizer.decode(output[i], skip_special_tokens=True)
        generated_texts.append(generated_text)

    return generated_texts


# Test the model
seed_text = input()
generated_texts = generate_text(model, tokenizer, seed_text, max_length=50, temperature=0.8, num_return_sequences=3)

for i, text in enumerate(generated_texts):
    print(f"Generated Text : \n{text}\n")


what syllabus does guvi is using for datan scince
Generated Text : 
what syllabus does guvi is using for datan scince Ġand Ġother Ġtechnical Ġskills Ġis Ġthe ĠC ĠCurriculum ĠModule Ġof ĠC �

Generated Text : 
what syllabus does guvi is using for datan scince Ġcourse Ġin Ġa Ġlocal Ġlanguage Ġlike ĠTamil ? Ċ
Ċ
GU VI Ġoffers Ġ

Generated Text : 
what syllabus does guvi is using for datan scince / ĠGU VI Ġcourses ? Ċ
I . ĠGu vi Ġcourses Ġare Ġa Ġchoice Ġgap Ġ



**STEAMLIT APP ON THE LLM**

# ngrok

In [None]:
!pip install transformers fastapi uvicorn nest-asyncio pyngrok

In [28]:
from fastapi import FastAPI, Request
from pydantic import BaseModel
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import nest_asyncio
from pyngrok import ngrok
import uvicorn

app = FastAPI()

# Load the fine-tuned model and tokenizer
model_name_or_path = "/content/fine_tuned_model"
model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

class Query(BaseModel):
    text: str

def generate_text(model, tokenizer, text, max_length=100, num_return_sequences=1, do_sample=True, top_k=50, top_p=0.95):
    input_ids = tokenizer.encode(text, return_tensors="pt").to(device)
    # Create attention mask
    attention_mask = torch.ones_like(input_ids)
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        do_sample=True,
        top_k=top_k,
        top_p=0.95
    )
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

@app.post("/generate/")
async def generate_text_post(query: Query):
    generated_text = generate_text(model, tokenizer, query.text)
    return {"generated_text": generated_text}

@app.get("/generatee/")
async def generate_text_get(text: str):
    generated_text = generate_text(model, tokenizer, text)
    return {"generated_text": generated_text}

# Authenticate ngrok
from pyngrok import conf, ngrok
conf.get_default().auth_token = "2rfBPMkPqKOeLDWUMSBxZG3GmyV_4WEGB5UsmRz2j2xtL8Rhi"

# Start ngrok
ngrok_tunnel = ngrok.connect(8000)
print("Public URL:", ngrok_tunnel.public_url)

# Allow nested asyncio
nest_asyncio.apply()

# Run the FastAPI app
uvicorn.run(app, host="0.0.0.0", port=8000)

Public URL: https://1882-34-135-58-159.ngrok-free.app


INFO:     Started server process [4539]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     2401:4900:2348:b596:48db:16d4:48f0:7a24:0 - "GET / HTTP/1.1" 404 Not Found
INFO:     2401:4900:2348:b596:48db:16d4:48f0:7a24:0 - "GET /favicon.ico HTTP/1.1" 404 Not Found


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


INFO:     2401:4900:2348:b596:48db:16d4:48f0:7a24:0 - "GET /generatee/?text=Guvi%20courses%20that%20have%20in%20the%20institue HTTP/1.1" 200 OK


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


INFO:     2401:4900:2348:b596:48db:16d4:48f0:7a24:0 - "GET /generatee/?text=Guvi%20offers%20a%20diverse%20range%20of%20courses HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [4539]


In [None]:
import requests

# Define the public URL and the endpoint
public_url = "https://1882-34-135-58-159.ngrok-free.app/"
endpoint = "/generate/"

# Combine the URL and endpoint
url = public_url + endpoint

# Define the data to be sent in the POST request
data = {
    "text": "Guvi offers a diverse range of courses"
}

# Send the POST request
response = requests.post(url, json=data)

# Print the response
print(response.json())

# STEAMLIT APP ON THE LLM


In [29]:
!pip install streamlit transformers torch

Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.41.1-py2.py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[

In [35]:
%%writefile app.py
import streamlit as st
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the fine-tuned model and tokenizer
#model_name_or_path = "./fine_tuned_model"
model_name_or_path = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the text generation function
def generate_text(model, tokenizer, seed_text, max_length=100, temperature=1.0, num_return_sequences=1):
    input_ids = tokenizer.encode(seed_text, return_tensors='pt').to(device)
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            temperature=temperature,
            num_return_sequences=num_return_sequences,
            do_sample=True,
            top_k=50,
            top_p=0.95,
        )
    generated_texts = [tokenizer.decode(output[i], skip_special_tokens=True) for i in range(num_return_sequences)]
    return generated_texts

# Streamlit app
st.title("Text Generation with GPT-2")
st.write("This app generates text using a fine-tuned GPT-2 model. Enter a prompt and the model will generate a continuation.")

seed_text = st.text_input("Enter your prompt:", "Guvi offers a diverse range of courses")
max_length = st.slider("Max Length:", min_value=50, max_value=500, value=100)
temperature = st.slider("Temperature:", min_value=0.1, max_value=2.0, value=1.0)

if st.button("Generate"):
    with st.spinner("Generating text..."):
        generated_texts = generate_text(model, tokenizer, seed_text, max_length, temperature)
        for i, generated_text in enumerate(generated_texts):
            st.subheader(f"Generated Text {i + 3}")
            st.write(generated_text)

Overwriting app.py


In [32]:
!pip install pyngrok



# here the below code is for seeing app in streamlit

In [36]:
from pyngrok import conf, ngrok
import subprocess
import time

# Authenticate ngrok
conf.get_default().auth_token = "2rfBPMkPqKOeLDWUMSBxZG3GmyV_4WEGB5UsmRz2j2xtL8Rhi"

# Run the Streamlit app in the background
process = subprocess.Popen(['streamlit', 'run', 'app.py'])

# Give the Streamlit app a few seconds to start
time.sleep(5)

# Expose the Streamlit app to the web using ngrok
public_url = ngrok.connect(addr="8501")
print(f"Public URL: {public_url}")

# Keep the Colab cell running
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    print("Stopping Streamlit app...")
    process.terminate()
    ngrok.disconnect(public_url)
    ngrok.kill()

Public URL: NgrokTunnel: "https://d3c1-34-135-58-159.ngrok-free.app" -> "http://localhost:8501"
Stopping Streamlit app...
