## **Data Preprocessing**

In [1]:
!pip install transformers datasets torch fastapi uvicorn

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting fastapi
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn
  Downloading uvicorn-0.30.1-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3

In [4]:
import os
import re
from transformers import GPT2Tokenizer

def preprocess_data(input_file, output_file, tokenizer_name="gpt2"):
    """Clean a raw text corpus line by line and write the result to ``output_file``.

    Every input line is stripped, lower-cased, has each non-word character
    replaced by a space and has runs of whitespace collapsed to one space.
    The cleaned line is then passed through the tokenizer (encode followed by
    decode) and written out, one output line per input line.

    The text is rebuilt with ``tokenizer.decode`` rather than by joining the
    raw token strings with spaces: joining split every multi-token word into
    fragments (e.g. "tokenization" -> "token ization") and wrote the
    tokenizer's byte-level stand-in characters for non-ASCII letters into
    the corpus.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to write (overwritten).
        tokenizer_name: Name or path passed to ``GPT2Tokenizer.from_pretrained``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    # Read everything first so the function still works if both paths are equal.
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    with open(output_file, 'w', encoding='utf-8') as f:
        for line in lines:
            line = line.strip().lower()
            line = re.sub(r'\W', ' ', line)  # Replace non-word characters with space
            # Collapse whitespace *after* the substitution above, which can
            # itself introduce runs of spaces.
            line = re.sub(r'\s+', ' ', line).strip()
            token_ids = tokenizer.encode(line, add_special_tokens=False)
            # decode() reverses the byte-level encoding, so words stay intact
            # and non-ASCII characters are restored.
            processed_line = tokenizer.decode(token_ids, clean_up_tokenization_spaces=False)
            processed_line = re.sub(r'\s+', ' ', processed_line).strip()
            f.write(processed_line + "\n")

# Raw corpus path (presumably a Colab upload location — adjust when running elsewhere)
# and the cleaned output path. `output_file` is reused by the fine-tuning cell
# as the training file.
input_file = "/content/guvi.txt"
output_file = "processed_guvi.txt"
preprocess_data(input_file, output_file)

In [3]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.31.0


In [5]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

# Load the pre-trained checkpoint named by `model_name` together with its
# matching tokenizer; both objects are used for training and saved at the end
# of this cell.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Create dataset
def load_dataset(file_path, tokenizer, block_size=128):
    """Wrap the text file at ``file_path`` in a ``TextDataset``.

    Args:
        file_path: Path of the text file to load.
        tokenizer: Tokenizer instance handed to ``TextDataset``.
        block_size: Passed through to ``TextDataset`` unchanged (default 128).

    Returns:
        The constructed ``TextDataset``.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_kwargs)

# Build the training set from the preprocessed corpus written by preprocess_data.
train_dataset = load_dataset(output_file, tokenizer)

# Define training arguments
# Checkpoints are written under ./results (replacing the contents of an
# existing directory), one every 10,000 steps, keeping at most two.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize data collator
# mlm=False selects causal (next-token) language modelling instead of masked LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
# Both go to the same directory so later cells can reload them from one path.
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")



Step,Training Loss


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json')

## **Text Generation with the Fine-tuned GPT Model**

In [6]:
#!pip install transformers

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the fine-tuned model and tokenizer

# Reload the weights from disk; this rebinds the `model` name used during training.
model_name_or_path = "./fine_tuned_model"  # Use the directory where you saved the model
model = GPT2LMHeadModel.from_pretrained(model_name_or_path)

# The tokenizer was saved to the same directory as the model.
token_name_or_path = "./fine_tuned_model"  # Use the directory where you saved the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(token_name_or_path)


# Move the model to GPU if available
# Falls back to the CPU when CUDA is not available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the text generation function
def generate_text(model, tokenizer, seed_text, max_length=100, temperature=1.0, num_return_sequences=1):
    """Sample one or more continuations of ``seed_text`` from ``model``.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model`` (``encode``/``decode``/``eos_token_id``).
        seed_text: Prompt to continue; must not be empty.
        max_length: Passed to ``model.generate`` as ``max_length``.
        temperature: Sampling temperature passed to ``model.generate``.
        num_return_sequences: Number of samples to draw for the prompt.

    Returns:
        list[str]: One decoded string per generated sequence.

    Raises:
        ValueError: If ``seed_text`` is empty, since there is nothing to condition on.
    """
    if not seed_text:
        raise ValueError("seed_text must be a non-empty prompt")

    # Put the prompt on the device of the model that was actually passed in,
    # rather than relying on a module-level `device` variable.
    input_ids = tokenizer.encode(seed_text, return_tensors='pt').to(model.device)
    # Single, unpadded prompt: every position is a real token.
    attention_mask = torch.ones_like(input_ids)

    # Generate text
    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            temperature=temperature,
            num_return_sequences=num_return_sequences,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            # GPT-2 defines no pad token; name EOS explicitly so generate()
            # does not have to guess (and warn) on every call.
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode every returned sequence.
    return [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in output]

In [None]:
# Read a prompt from the user, draw three samples and print them numbered from 1.
seed_text = input()
generated_texts = generate_text(
    model,
    tokenizer,
    seed_text,
    max_length=100,
    temperature=1.0,
    num_return_sequences=3,
)
for number, sample in enumerate(generated_texts, start=1):
    print(f"Generated Text {number}:\n{sample}\n")

In [7]:
!pip install mysql-connector-python

Collecting mysql-connector-python
  Downloading mysql_connector_python-9.0.0-cp310-cp310-manylinux_2_17_x86_64.whl (19.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.3/19.3 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mysql-connector-python
Successfully installed mysql-connector-python-9.0.0


In [8]:
import os
from getpass import getpass

import mysql.connector

# Connection settings are taken from the environment so that no secret is
# stored in the notebook. The user name and password are prompted for when the
# corresponding variable is unset; host and port fall back to the project's
# TiDB Cloud gateway.
# NOTE(review): credentials that were previously committed in this cell should
# be rotated.
_db_host = os.environ.get("TIDB_HOST", "gateway01.ap-southeast-1.prod.aws.tidbcloud.com")
_db_port = int(os.environ.get("TIDB_PORT", "4000"))
_db_user = os.environ.get("TIDB_USER") or input("TiDB user: ")
_db_password = os.environ.get("TIDB_PASSWORD") or getpass("TiDB password: ")

connection = mysql.connector.connect(
    host=_db_host,
    port=_db_port,
    user=_db_user,
    password=_db_password,
    database='guvidb',
)
# A buffered cursor fetches each result set eagerly.
mycursor = connection.cursor(buffered=True)

In [None]:
# Ensure the application database exists; IF NOT EXISTS makes this a no-op on re-runs.
mycursor.execute('Create database IF NOT EXISTS guvidb')

In [None]:
# Create the credentials table (Username is the primary key) unless it already exists.
# NOTE(review): app.py writes the submitted password into the Password column
# unchanged — consider storing a salted hash instead.
mycursor.execute('create table IF NOT EXISTS guvidb.guvi_table(Username VARCHAR(255)PRIMARY KEY,Password VARCHAR(255))')

In [9]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.36.0-py2.py3-none-any.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.0/83.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4

In [10]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.1.6-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.1.6


## **Streamlit on the GPT Model**

In [15]:
%%writefile app.py
import streamlit as st
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import mysql.connector

import os

# Database connection
# Credentials are read from the environment instead of being written into the
# source file. Export TIDB_USER and TIDB_PASSWORD (optionally TIDB_HOST and
# TIDB_PORT) in the process that launches Streamlit.
_missing_settings = [name for name in ("TIDB_USER", "TIDB_PASSWORD") if not os.environ.get(name)]
if _missing_settings:
    st.error("Database credentials are not configured. Set: " + ", ".join(_missing_settings))
    st.stop()

connection = mysql.connector.connect(
    host=os.environ.get("TIDB_HOST", "gateway01.ap-southeast-1.prod.aws.tidbcloud.com"),
    port=int(os.environ.get("TIDB_PORT", "4000")),
    user=os.environ["TIDB_USER"],
    password=os.environ["TIDB_PASSWORD"],
    database="guvidb"
)
mycursor = connection.cursor(buffered=True)

model_name_or_path = "./fine_tuned_model"  # Use the directory where you saved the model
token_name_or_path = "./fine_tuned_model"  # Use the directory where you saved the tokenizer

# Use the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


@st.cache_resource
def _load_model_and_tokenizer(model_path, tokenizer_path, device_name):
    """Load the fine-tuned model (moved to ``device_name``) and its tokenizer.

    Streamlit re-executes this script on every widget interaction; caching the
    loaded objects avoids reading the checkpoint from disk on each rerun.
    """
    loaded_model = GPT2LMHeadModel.from_pretrained(model_path)
    loaded_model.to(torch.device(device_name))
    loaded_tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return loaded_model, loaded_tokenizer


# The device is passed as a plain string so it can serve as a cache key.
model, tokenizer = _load_model_and_tokenizer(model_name_or_path, token_name_or_path, str(device))

# Define the text generation function
def generate_text(seed_text, max_length=100, temperature=1.0, num_return_sequences=1):
    """Sample continuations of ``seed_text`` from the module-level model.

    Uses the module-level ``model``, ``tokenizer`` and ``device``.

    Args:
        seed_text: Prompt to continue.
        max_length: Passed to ``model.generate`` as ``max_length``.
        temperature: Sampling temperature passed to ``model.generate``.
        num_return_sequences: Number of samples to draw.

    Returns:
        list[str]: One decoded string per generated sequence.
    """
    input_ids = tokenizer.encode(seed_text, return_tensors='pt').to(device)
    with torch.no_grad():
        output = model.generate(
            input_ids,
            # Single, unpadded prompt: every position is a real token.
            attention_mask=torch.ones_like(input_ids),
            max_length=max_length,
            temperature=temperature,
            num_return_sequences=num_return_sequences,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            # GPT-2 defines no pad token; name EOS explicitly so generate()
            # does not have to guess (and warn) on every call.
            pad_token_id=tokenizer.eos_token_id,
        )
    return [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in output]

def _rerun():
    """Restart the script run so a page change takes effect immediately.

    ``st.experimental_rerun`` is deprecated in favour of ``st.rerun``; prefer
    the new name and fall back to the old one on Streamlit versions that do
    not provide it yet.
    """
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()


def insert_credentials(username, password):
    """Create a new account row unless ``username`` is already taken.

    Returns:
        bool: True when the row was inserted and committed. False when the
        username already exists or the database raised an error; in both
        failure cases an ``st.error`` message has been shown.
    """
    try:
        mycursor.execute('SELECT * FROM guvidb.guvi_table WHERE Username = %s', (username,))
        if mycursor.fetchone():
            st.error("Username already exists. Please choose a different one.")
            return False
        # NOTE(review): the password is stored exactly as submitted; consider
        # storing a salted hash instead.
        mycursor.execute('INSERT INTO guvidb.guvi_table (Username, Password) VALUES (%s, %s)', (username, password))
        connection.commit()
        return True
    except mysql.connector.Error as err:
        st.error(f"Error inserting credentials: {err}")
        return False


def authenticate(username, password):
    """Return True when a row with this exact username/password pair exists."""
    mycursor.execute('SELECT * FROM guvidb.guvi_table WHERE Username = %s AND Password = %s', (username, password))
    user = mycursor.fetchone()
    return user is not None


def forgot_password(username, new_password):
    """Overwrite the stored password of an existing user.

    Returns:
        bool: True when the password was updated and committed. False when the
        username is unknown or the database raised an error; in both failure
        cases an ``st.error`` message has been shown.
    """
    # NOTE(review): no proof of identity is requested before the update —
    # anyone who knows a username can change its password.
    try:
        mycursor.execute('SELECT * FROM guvidb.guvi_table WHERE Username = %s', (username,))
        if not mycursor.fetchone():
            st.error("Username not found.")
            return False
        mycursor.execute('UPDATE guvidb.guvi_table SET Password = %s WHERE Username = %s', (new_password, username))
        connection.commit()
        st.success("Password updated successfully!")
        return True
    except mysql.connector.Error as err:
        st.error(f"Error updating password: {err}")
        return False


# Initialize session state variables
if 'page' not in st.session_state:
    st.session_state.page = 'login'
if 'authenticated' not in st.session_state:
    st.session_state.authenticated = False

scrolling_text = """
<h1 style='color:Indigo; font-weight: bold;'>
    <marquee behavior="scroll" direction="left" scrollamount="10">WELCOME TO GUVI GPT</marquee>
</h1>
"""
st.markdown(scrolling_text, unsafe_allow_html=True)

# Login Page
if st.session_state.page == 'login':
    st.markdown("<h2 style='color: violet; text-align: center;'>Login</h2>", unsafe_allow_html=True)
    st.markdown("<h3 style='color: red;'>Username</h3>", unsafe_allow_html=True)
    username = st.text_input("Enter Your Username", key="login_username")

    st.markdown("<h3 style='color: red;'>Password</h3>", unsafe_allow_html=True)
    password = st.text_input("Enter Your Password", type="password", key="login_password")

    if st.button("Login"):
        if authenticate(username, password):
            st.session_state.authenticated = True
            st.success("Logged in successfully!")
        else:
            st.error("Incorrect login credentials")

    # If authenticated, show text generation
    if st.session_state.authenticated:
        st.markdown("<h2 style='color: red; text-align: center;'>Text Generation</h2>", unsafe_allow_html=True)
        st.info("Disclaimer: GUVIGPT can make mistakes. The content generated by the model may not always be accurate or appropriate. Please use it responsibly.")
        seed_text = st.text_input("Enter your prompt:")
        max_length = st.slider("Max Length:", min_value=50, max_value=500, value=100)
        temperature = st.slider("Temperature:", min_value=0.1, max_value=2.0, value=1.0)

        if st.button("Generate"):
            if seed_text.strip():
                with st.spinner("Generating text..."):
                    generated_texts = generate_text(seed_text, max_length, temperature)
                    for i, generated_text in enumerate(generated_texts):
                        st.subheader(f"Generated Text {i + 1}")
                        st.write(generated_text)
            else:
                st.warning("Please enter a prompt to generate text.")

    st.markdown("<br>", unsafe_allow_html=True)
    col1, col2 = st.columns(2)

    with col1:
        st.markdown("<h4 style='color: blue;'>Forgot Password?</h4>", unsafe_allow_html=True)
        if st.button("Reset Password"):
            st.session_state.page = 'forgot_password'
            _rerun()
    with col2:
        st.markdown("<h4 style='color: blue;'>New user?</h4>", unsafe_allow_html=True)
        if st.button("Go to Sign Up"):
            st.session_state.page = 'signup'
            _rerun()

# Forgot Password Page
elif st.session_state.page == 'forgot_password':
    st.markdown("<h2 style='color: yellow;'>Forgot Password</h2>", unsafe_allow_html=True)
    st.markdown("<h3 style='color: blue;'>Forgot Password</h3>", unsafe_allow_html=True)
    with st.form(key='forgot_password_form'):
        forgot_username = st.text_input("Enter your username for password reset")
        new_password = st.text_input("Enter new password", type="password")
        forgot_password_button = st.form_submit_button("Reset Password")

    if forgot_password_button:
        if forgot_username and new_password:
            # forgot_password() reports its own failures; only confirm when it
            # actually changed the password.
            if forgot_password(forgot_username, new_password):
                st.success("Password reset successfully!")
        else:
            st.error("Please provide both username and new password.")

    if st.button("Back to Login"):
        st.session_state.page = 'login'
        st.session_state.authenticated = False
        _rerun()

# Sign Up Page
elif st.session_state.page == 'signup':
    st.markdown("<h2 style='color: red;'>Sign Up</h2>", unsafe_allow_html=True)
    st.markdown("<h3 style='color: blue;'>Create a New Account</h3>", unsafe_allow_html=True)
    with st.form(key='create_account_form'):
        new_username = st.text_input("New Username")
        new_password = st.text_input("New Password", type="password")
        create_account_button = st.form_submit_button("Create Account")

    if create_account_button:
        if new_username and new_password:
            # insert_credentials() reports duplicates and DB errors itself;
            # only confirm when the account row was really created.
            if insert_credentials(new_username, new_password):
                st.success("Account created successfully!")
        else:
            st.error("Please provide both username and password.")

    if st.button("Back to Login"):
        st.session_state.page = 'login'
        st.session_state.authenticated = False
        _rerun()

Writing app.py


In [12]:
import os
import subprocess
import time
from getpass import getpass

from pyngrok import conf, ngrok

# Authenticate ngrok
# The token is a secret: read it from the environment, or prompt for it, rather
# than storing it in the notebook.
# NOTE(review): a token that was previously committed in this cell should be revoked.
conf.get_default().auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok auth token: ")

# Run the Streamlit app in the background
process = subprocess.Popen(['streamlit', 'run', 'app.py'])

public_url = None
try:
    # Give the Streamlit app a few seconds to start
    time.sleep(5)

    # Expose the Streamlit app to the web using ngrok
    public_url = ngrok.connect(addr="8501")
    print(f"Public URL: {public_url}")

    # Keep the Colab cell running
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    print("Stopping Streamlit app...")
finally:
    # Clean up on every exit path (interrupt or error) so the Streamlit
    # subprocess and the ngrok agent are not left running.
    process.terminate()
    if public_url is not None:
        # ngrok.connect returns a tunnel object; disconnect expects its URL string.
        ngrok.disconnect(public_url.public_url)
    ngrok.kill()



KeyboardInterrupt: 