In [None]:
!pip install datasets
!pip install transformers
!pip install fitz
!pip install sentence_transformers
!pip install nltk
!pip install PyMuPDF
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from datasets import load_dataset, Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import fitz
from sentence_transformers import SentenceTransformer, util
import torch
import string
import re
from google.colab import files
# Colab-only: opens the browser upload dialog so the files this notebook reads
# (the PDF and talks.json) are available in the working directory.
uploaded = files.upload()

# Resources needed by nltk.sent_tokenize and WordNetLemmatizer.
# Newer NLTK releases look up 'punkt_tab' instead of the pickled 'punkt'
# models, so both are fetched to keep the notebook working across versions.
for _nltk_resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(_nltk_resource)

# Base GPT-2 tokenizer/model (fine-tuned in place later) and the sentence
# encoder used to rank PDF chunks against a question.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
# Despite its name this is a lemmatizer, not a stemmer.
stemmer = WordNetLemmatizer()

# Canned prompt/response pairs; each record is read via its 'prompt' and
# 'response' fields further down.
dataset = load_dataset('json', data_files={'train': 'talks.json'})

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    Pages are read in order and their text is joined without a separator.
    Any exception raised while opening or reading the document is caught,
    reported on stdout, and an empty string is returned instead.
    """
    try:
        with fitz.open(pdf_path) as doc:
            page_texts = [
                doc.load_page(index).get_text()
                for index in range(len(doc))
            ]
        return "".join(page_texts)
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return ""


def prepare_finetuning_data(text, chunk_size=1000):
    """Build prompt/response training pairs from consecutive sentences.

    ``text`` is split with ``nltk.sent_tokenize``. Sentences 0 and 1 form the
    first pair, 2 and 3 the second, and so on; the even-indexed sentence is the
    prompt and the following one is the response. A trailing sentence with no
    partner, or a pair whose response is empty, is dropped.

    ``chunk_size`` is accepted for interface compatibility but is not used.

    Returns:
        list[dict]: items shaped ``{'prompt': str, 'response': str}``.
    """
    sentences = nltk.sent_tokenize(text)
    openers = sentences[0::2]
    replies = sentences[1::2]
    # zip() stops at the shorter list, which discards an unpaired last sentence.
    return [
        {'prompt': opener, 'response': reply}
        for opener, reply in zip(openers, replies)
        if reply
    ]

def tokenize_function(examples):
    """Tokenize prompt/response pairs into fixed-length causal-LM features.

    Each training text is ``"<prompt> <response>"`` followed by the tokenizer's
    EOS token, so the model is actually trained on the response and learns
    where an answer ends. Sequences are truncated/padded to 512 tokens.

    Args:
        examples: a mapping with a ``'prompt'`` entry and, optionally, a
            ``'response'`` entry. Values may be lists (``Dataset.map`` with
            ``batched=True``) or single strings (unbatched use).

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) plus
        ``labels``. Labels equal ``input_ids`` except at padded positions,
        which are set to -100 so the loss ignores them.
    """
    prompts = examples['prompt']
    responses = examples['response'] if 'response' in examples else None

    unbatched = isinstance(prompts, str)
    if unbatched:
        prompts = [prompts]
        responses = None if responses is None else [responses]
    if responses is None:
        responses = [''] * len(prompts)

    # GPT-2 ships without a padding token; padding='max_length' raises unless
    # one is defined, so fall back to EOS when nothing has been configured.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    texts = [
        f"{prompt} {response}".strip() + tokenizer.eos_token
        for prompt, response in zip(prompts, responses)
    ]
    inputs = tokenizer(texts, truncation=True, padding='max_length', max_length=512)

    # Pad and EOS share an id, so use the attention mask (not the token id) to
    # decide which positions are padding and must be excluded from the loss.
    inputs['labels'] = [
        [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(inputs['input_ids'], inputs['attention_mask'])
    ]

    if unbatched:
        return {key: values[0] for key, values in inputs.items()}
    return inputs


def fine_tune_model(pdf_path):
    """Fine-tune the module-level GPT-2 ``model`` on text extracted from a PDF.

    Steps: extract the PDF text, turn it into prompt/response pairs, tokenize
    them, train with ``Trainer``, then save the model and tokenizer to
    ``./fine_tuned_gpt2_pdf``. Intermediate checkpoints go to
    ``./fine_tuned_gpt2_pdf_model``.

    The function prints a message and returns early when no text or no
    training pairs could be produced. Training/saving errors are caught and
    printed rather than raised.

    Args:
        pdf_path: path of the PDF to learn from.

    Returns:
        None. The global ``model`` is updated in place by training.
    """
    pdf_text = extract_text_from_pdf(pdf_path)
    if not pdf_text.strip():
        print("No text extracted from the PDF for fine-tuning.")
        return

    pdf_data = prepare_finetuning_data(pdf_text)
    if not pdf_data:
        print("No data prepared for fine-tuning.")
        return

    # GPT-2 has no padding token by default. It must be defined BEFORE the
    # dataset is tokenized, because tokenization pads to a fixed length and
    # raises if no pad token exists.
    tokenizer.pad_token = tokenizer.eos_token

    pdf_dataset = Dataset.from_list(pdf_data)
    tokenized_dataset = pdf_dataset.map(tokenize_function, batched=True)

    training_args = TrainingArguments(
        output_dir='./fine_tuned_gpt2_pdf_model',
        overwrite_output_dir=True,
        num_train_epochs=5,
        per_device_train_batch_size=2,
        save_steps=500,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )

    try:
        trainer.train()
        model.save_pretrained('./fine_tuned_gpt2_pdf')
        tokenizer.save_pretrained('./fine_tuned_gpt2_pdf')
        print("Model fine-tuning complete and saved.")
    except Exception as e:
        # Best-effort: report the failure and keep the notebook session usable.
        print(f"Error during fine-tuning: {e}")




Saving HumanResource2.pdf to HumanResource2 (1).pdf
Saving talks.json to talks (1).json


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
import os

def load_fine_tuned_model(pdf_path='HumanResource2.pdf'):
    """Make the fine-tuned GPT-2 model and tokenizer the active global pair.

    If ``./fine_tuned_gpt2_pdf`` exists, the model and tokenizer are loaded
    from it and bound to the module-level ``model`` / ``tokenizer`` names that
    the rest of the notebook uses. If the directory is missing, or loading
    fails, ``fine_tune_model(pdf_path)`` is run instead, which trains the
    existing global ``model`` in place.

    Args:
        pdf_path: PDF used for fine-tuning when no saved model can be loaded.

    Returns:
        tuple: the ``(model, tokenizer)`` pair that is active afterwards.
    """
    # Without this declaration the loaded objects would be bound to locals and
    # thrown away when the function returns.
    global model, tokenizer

    model_path = './fine_tuned_gpt2_pdf'
    if not os.path.exists(model_path):
        print("Model not found. Fine-tuning required.")
        fine_tune_model(pdf_path)
        return model, tokenizer

    try:
        # Load both pieces before touching the globals so a failure halfway
        # through cannot leave a mismatched model/tokenizer pair behind.
        loaded_model = GPT2LMHeadModel.from_pretrained(model_path)
        loaded_tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    except Exception as e:
        print(f"Error loading fine-tuned model/tokenizer: {e}")
        fine_tune_model(pdf_path)
    else:
        model, tokenizer = loaded_model, loaded_tokenizer
        print("Fine-tuned model and tokenizer loaded.")

    return model, tokenizer


In [None]:

def find_relevant_chunk(question, chunks):
    """Return the entry of ``chunks`` most similar to ``question``.

    The question and all chunks are embedded with the module-level
    ``sentence_model``; the chunk with the highest cosine similarity to the
    question embedding is returned.
    """
    query_vector = sentence_model.encode(question, convert_to_tensor=True)
    chunk_vectors = sentence_model.encode(chunks, convert_to_tensor=True)

    # Row 0 holds the scores of the single query against every chunk.
    scores = util.pytorch_cos_sim(query_vector, chunk_vectors)[0]
    best_position = scores.argmax().item()
    return chunks[best_position]


def generate_answer(question, relevant_chunk, max_length=1024, reserved_answer_tokens=128):
    """Generate an answer to ``question`` grounded in ``relevant_chunk``.

    Args:
        question: the user's question.
        relevant_chunk: the passage the answer should be based on.
        max_length: maximum total sequence length (prompt + answer) passed to
            ``model.generate``.
        reserved_answer_tokens: number of tokens kept free for the answer; an
            over-long prompt is trimmed from the left to respect this budget.

    Returns:
        str: only the newly generated text (the prompt is not echoed back),
        with whitespace collapsed; a fallback message if the model or
        tokenizer is missing or nothing was generated.
    """
    if model is None or tokenizer is None:
        return "Model not loaded. Please try again later."

    prompt = (f"Based on the following text, answer the question clearly and concisely.\n\n"
              f"Text: {relevant_chunk}\n\nQuestion: {question}\nAnswer: ")

    encoded = tokenizer(prompt, return_tensors='pt')
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']

    # Keep room for the answer. Trimming from the left drops the start of the
    # passage but preserves the trailing "Question: ... Answer:" part.
    max_prompt_tokens = max(1, max_length - reserved_answer_tokens)
    if input_ids.shape[-1] > max_prompt_tokens:
        input_ids = input_ids[:, -max_prompt_tokens:]
        attention_mask = attention_mask[:, -max_prompt_tokens:]

    # The model may have been moved to an accelerator (e.g. by Trainer);
    # inputs must live on the same device.
    input_ids = input_ids.to(model.device)
    attention_mask = attention_mask.to(model.device)

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # For a decoder-only model the returned sequence starts with the prompt;
    # keep only the tokens generated after it.
    generated_ids = outputs[0][input_ids.shape[-1]:]
    answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    answer = re.sub(r'\s+', ' ', answer)
    return answer if answer else "Sorry, I couldn't find an answer based on the provided text."


In [None]:
import difflib

def preprocess_text(text):
    """Normalize free text into lowercase, single-space-separated tokens.

    Steps:
      1. Replace every character that is not an ASCII letter, digit or
         whitespace with a space.
      2. Insert a space at each digit/non-digit boundary ("abc123" -> "abc 123").
      3. Split camelCase ("helloWorld" -> "hello World").
      4. Collapse whitespace runs to one space and trim the ends.
      5. Lowercase.

    Whitespace is collapsed last because step 2 also fires between an existing
    space and a digit and would otherwise leave double spaces behind.

    Args:
        text: the raw string.

    Returns:
        str: the normalized string (possibly empty).
    """
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    text = re.sub(r'(?<=\D)(?=\d)|(?<=\d)(?=\D)', ' ', text)
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

def get_closest_match(question, json_responses):
    """Look up the response whose key is the closest fuzzy match to ``question``.

    Keys of ``json_responses`` are compared with ``difflib.get_close_matches``
    using a similarity cutoff of 0.6; only the single best key is considered.

    Returns:
        The value stored under the best-matching key, or ``None`` when no key
        reaches the cutoff.
    """
    candidates = difflib.get_close_matches(
        question,
        json_responses.keys(),
        n=1,
        cutoff=0.6,
    )
    if not candidates:
        return None
    best_key = candidates[0]
    return json_responses[best_key]

# Lookup table of canned replies: normalized prompt -> response, built once
# from the 'train' split so incoming questions can be fuzzy-matched on the keys.
# (A leftover two-line `get_response` stub that used to sit here was removed:
# it returned None and was silently shadowed by the real definition below.)
json_responses = {preprocess_text(item['prompt']): item['response'] for item in dataset['train']}

print("Sample json_responses keys:", list(json_responses.keys())[:5])

def _split_into_chunks(text, chunk_size=500):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words."""
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]


def get_response(question, pdf_path='HumanResource2.pdf'):
    """Answer ``question`` from the canned replies or, failing that, the PDF.

    The question is normalized and fuzzy-matched against ``json_responses``.
    If a non-empty canned reply is found it is returned. Otherwise the PDF is
    read, split into 500-word chunks, the chunk most similar to the question
    is selected, and the language model generates an answer from it.

    Args:
        question: the raw user input.
        pdf_path: PDF to fall back on when no canned reply matches.

    Returns:
        str: the reply, or an apology when the PDF yields no usable text.
    """
    question = preprocess_text(question)

    json_response = get_closest_match(question, json_responses)
    if json_response:
        return json_response

    text = extract_text_from_pdf(pdf_path)
    chunks = _split_into_chunks(text)
    # Covers both an empty extraction and whitespace-only text, which would
    # otherwise hand an empty chunk list to the similarity search.
    if not chunks:
        return "Sorry, I couldn't extract any relevant information from the PDF."

    relevant_chunk = find_relevant_chunk(question, chunks)
    return generate_answer(question, relevant_chunk)

def chat():
    """Run a console chat loop until the user types 'quit' (any letter case).

    Each line read from ``input`` is passed to ``get_response`` and the reply
    is printed prefixed with ``Bot: ``.
    """
    print("Start chatting with the bot (type 'quit' to stop)!")
    user_text = input("You: ")
    while user_text.lower() != "quit":
        reply = get_response(user_text)
        print(f"Bot: {reply}")
        user_text = input("You: ")

# Start the interactive session; this cell keeps prompting until 'quit' is typed.
chat()


Sample json_responses keys: ['hi', 'how are you', 'hi how are you', 'hello', 'good morning']
Start chatting with the bot (type 'quit' to stop)!
You: hi
Bot: Hello! How can I assist you today?
You: quit
