<a href="https://colab.research.google.com/github/PK-Arivumathi/tictactoe/blob/main/tictactoebot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Install pdfplumber with additional dependencies
!pip install pdfplumber --upgrade
!apt update
!apt install -y poppler-utils

# Install the remaining packages
!pip install transformers sentence-transformers torch scikit-learn


Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Install necessary packages in Google Colab
!pip install pdfplumber transformers sentence-transformers torch scikit-learn

import pdfplumber
from transformers import T5ForConditionalGeneration, T5Tokenizer
from google.colab import files
import torch
import re
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Checkpoint name shared by the T5 tokenizer and model.
# Swap in 't5-base' or 't5-large' if you want better accuracy.
model_name = 't5-small'
qa_tokenizer = T5Tokenizer.from_pretrained(model_name)
qa_model = T5ForConditionalGeneration.from_pretrained(model_name)

# Sentence Transformer model that produces the embeddings used for retrieval
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Function to upload PDF
def upload_pdf():
    """Open the Colab upload dialog and return the name of the first uploaded file.

    Returns:
        The first key of the mapping returned by ``files.upload()``, or
        ``None`` when that mapping is empty (nothing was uploaded).
    """
    uploaded_files = files.upload()
    if not uploaded_files:
        return None
    # Only the first uploaded entry is used; any further uploads are ignored.
    return next(iter(uploaded_files.keys()))

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as a single string.

    Page texts are joined with a newline so the last line of one page is not
    fused with the first line of the next; downstream helpers split the text
    on newlines.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        The extracted text. On any failure (including a PDF that yields no
        text) a message starting with "Error extracting text from PDF:" is
        returned instead of raising.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # `or ""` keeps the join safe when a page yields no text.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        text = "\n".join(page_texts)
        if not text.strip():
            raise ValueError("No text found in the PDF.")
        return text
    except Exception as e:
        # Failures are reported through the return value; callers check the
        # message prefix rather than catching exceptions.
        return f"Error extracting text from PDF: {str(e)}"

# Function to extract title from document
def extract_title(pdf_text):
    """Return the first non-blank line among the first five lines of the text.

    Args:
        pdf_text: Full document text, with lines separated by newlines.

    Returns:
        The stripped candidate title, "Title not found" when the first five
        lines are all blank, or a message starting with
        "Error extracting title:" if the text could not be processed.
    """
    try:
        # Only the top of the document is inspected for a title.
        leading_lines = pdf_text.split('\n')[:5]
        stripped_lines = (candidate.strip() for candidate in leading_lines)
        return next((candidate for candidate in stripped_lines if candidate), "Title not found")
    except Exception as e:
        return f"Error extracting title: {str(e)}"

# Function to chunk the document text into smaller pieces for efficient searching
def extract_chunks(pdf_text, chunk_size=500):
    """Group the non-blank lines of a document into word-bounded chunks.

    Lines are stripped and accumulated; as soon as the accumulated word count
    exceeds ``chunk_size`` the lines are joined with single spaces and emitted
    as one chunk. A chunk may therefore overshoot ``chunk_size`` by up to the
    length of its final line, and a single long line is never split.

    Args:
        pdf_text: Full document text, with lines separated by newlines.
        chunk_size: Word count a chunk must exceed before it is closed.

    Returns:
        A list of chunk strings in document order; empty when the text has no
        non-blank lines. Chunks are never empty strings.
    """
    chunks = []
    current_lines = []
    # A running total avoids re-joining and re-splitting the whole chunk on
    # every line, which made the original quadratic in chunk length.
    word_count = 0

    for raw_line in pdf_text.split('\n'):
        line = raw_line.strip()
        if not line:
            # Blank lines add no words, so they can never close a chunk.
            continue
        current_lines.append(line)
        word_count += len(line.split())
        if word_count > chunk_size:
            chunks.append(" ".join(current_lines))
            current_lines = []
            word_count = 0

    if current_lines:
        chunks.append(" ".join(current_lines))

    return chunks

# Function to generate an answer using T5 model
def get_t5_answer(query, context):
    """Generate an answer to ``query`` from ``context`` with the T5 model.

    Args:
        query: The user's question.
        context: Document text the answer should be drawn from.

    Returns:
        The decoded first generated sequence, or a message starting with
        "Error generating response:" if anything fails along the way.
    """
    try:
        prompt = f"question: {query} context: {context}"
        # The prompt is truncated to 512 tokens before generation.
        encoded = qa_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        generation_options = {"max_length": 100, "num_beams": 4, "early_stopping": True}
        generated = qa_model.generate(encoded['input_ids'], **generation_options)
        first_sequence = generated[0]
        return qa_tokenizer.decode(first_sequence, skip_special_tokens=True)
    except Exception as e:
        return f"Error generating response: {str(e)}"

# Function to extract references from the document
# Preferred start of the references section: the keyword alone on its own
# line, optionally preceded by a section number and followed by a colon.
_REFERENCES_HEADING_RE = re.compile(
    r"^[ \t]*(?:\d+(?:\.\d+)*\.?[ \t]+)?(?:References|Citations|Bibliography)[ \t]*:?[ \t]*$",
    re.IGNORECASE | re.MULTILINE,
)

# Fallback start: the keyword anywhere in the text (the historical behaviour),
# used only when no stand-alone heading line exists.
_REFERENCES_KEYWORD_RE = re.compile(r"References|Citations|Bibliography", re.IGNORECASE)

# Section headings that end the references section. They must begin a line
# (optionally numbered) so that words inside a citation, such as
# "Lecture Notes in Computer Science", do not truncate the list.
_REFERENCES_STOP_RE = re.compile(
    r"^[ \t]*(?:\d+(?:\.\d+)*\.?[ \t]+)?"
    r"(?:Appendix|Appendices|Conclusions?|Acknowledge?ments?|Notes|About the Authors?)\b",
    re.IGNORECASE | re.MULTILINE,
)


def extract_references(pdf_text):
    """Return the references section of a document as plain text.

    The section starts at the first line consisting only of a "References",
    "Citations" or "Bibliography" heading; if no such line exists, it starts
    at the first occurrence of one of those words anywhere in the text. It
    ends at the earliest following line that begins with a stop heading
    (Appendix, Conclusion, Acknowledgements, Notes, About the Author), or at
    the end of the text.

    Args:
        pdf_text: Full document text, with lines separated by newlines.

    Returns:
        The stripped references section (heading included),
        "No references found in the document." when no keyword is present, or
        a message starting with "Error extracting references:" if the text
        could not be processed.
    """
    try:
        heading = _REFERENCES_HEADING_RE.search(pdf_text) or _REFERENCES_KEYWORD_RE.search(pdf_text)
        if heading is None:
            return "No references found in the document."

        references_text = pdf_text[heading.start():]

        # Look for a stop heading only after the references heading itself.
        # A single search yields the earliest one in the text, whatever its
        # position in the alternation.
        heading_length = heading.end() - heading.start()
        stop = _REFERENCES_STOP_RE.search(references_text, heading_length)
        if stop is not None:
            references_text = references_text[:stop.start()]

        return references_text.strip() or "No references found."
    except Exception as e:
        return f"Error extracting references: {str(e)}"

# Function to find the most relevant chunk for a given query using embeddings
def find_relevant_chunk(query, chunks):
    """Return the chunk whose embedding is most similar to the query's.

    Args:
        query: The user's question.
        chunks: Non-empty sequence of text chunks to search.

    Returns:
        The element of ``chunks`` with the highest cosine similarity to
        ``query``.

    Raises:
        ValueError: If ``chunks`` is empty, since there is nothing to rank.
    """
    if len(chunks) == 0:
        # Fail early with a clear message instead of an opaque error from the
        # embedding or similarity call.
        raise ValueError("Cannot find a relevant chunk: no chunks were provided.")

    query_embedding = embedder.encode([query])
    chunk_embeddings = embedder.encode(chunks)
    similarities = cosine_similarity(query_embedding, chunk_embeddings)
    # One query gives a single-row similarity matrix, so the flattened argmax
    # is the chunk index. int() turns the numpy integer into a plain index.
    best_chunk_index = int(np.argmax(similarities))
    return chunks[best_chunk_index]

# Main chatbot function
def chatbot():
    """Run an interactive question-answering session over an uploaded PDF.

    Uploads a PDF, extracts its text, prints the detected title, then loops
    reading questions from standard input:

    * ``exit`` (any case) ends the session;
    * a question containing "references" prints the references section;
    * an empty question prompts the user again;
    * anything else is answered from the most relevant chunk of the document.

    The session also ends cleanly when input is closed or interrupted.
    """
    pdf_path = upload_pdf()
    if not pdf_path:
        print("No PDF uploaded. Exiting...")
        return

    pdf_text = extract_text_from_pdf(pdf_path)
    # Match the extractor's full failure prefix: a document whose own text
    # merely begins with the word "Error" is a valid document, not a failure.
    if pdf_text.startswith("Error extracting text from PDF:"):
        print(pdf_text)
        return

    print("Chatbot is ready! Ask your questions. Type 'exit' to quit.")

    title = extract_title(pdf_text)
    print(f"Document Title: {title}")

    pdf_chunks = extract_chunks(pdf_text)

    while True:
        try:
            query = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed or interrupted input ends the session without a traceback.
            print("\nExiting... Bye!")
            break

        if not query:
            print("Bot: Please enter a valid question.")
            continue

        normalized_query = query.lower()

        if normalized_query == 'exit':
            print("Exiting... Bye!")
            break

        if "references" in normalized_query:
            references = extract_references(pdf_text)
            print(f"Bot: {references}")
            continue

        # Find the most relevant chunk and get T5's answer
        relevant_chunk = find_relevant_chunk(query, pdf_chunks)
        response = get_t5_answer(query, relevant_chunk)

        # Error messages and answers are shown the same way.
        print(f"Bot: {response}")

# Start the interactive chatbot session as soon as this cell/module is executed.
chatbot()




Saving TICTACTOEAI.pdf to TICTACTOEAI (1).pdf
Chatbot is ready! Ask your questions. Type 'exit' to quit.
Document Title: Tic-Tac-Toe AI: Survey Paper
You: what are the limitations faced
Bot: AI approaches in Tic-Tac-Toe and their broader implications in the field of AI for game development
You: explain the performance eveluation
Bot: Minimax algorithm is the most widely used AI strategy for Tic-Tac-Toe
You: list the metrics in performance eve=aluation
Bot: Win Rate: The frequency with which the AI wins or avoids losing
You: list the topics under ai approaches for tic ta toe
Bot: This survey paper explores various AI strategies used to implement an intelligent agent for playing Tic-Tac-Toe
You: tic-tac-toe is also known as what
Bot: "Noughts and Crosses," is a two-player game that is widely recognized for its simplicity
You: how many positions does this game have
Bot: The game has 9 positions and only two possible symbols: 'X' and 'O.' The objective is to align three symbols in a row
Yo