Final model using Falcon 7B

In [None]:
# Step 1: Install required libraries
!pip install PyPDF2 langchain transformers sentence-transformers faiss-cpu accelerate pymupdf sympy pdfplumber pandas

# Install Poppler for handling PDFs
!apt-get update
!apt-get install -y poppler-utils

# Step 2: Import libraries
import fitz  # PyMuPDF
import faiss
import json
import os
import re
import pdfplumber
import pandas as pd
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from google.colab import files
from sympy import symbols, Eq, solve

# Step 3: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page in a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` output. No separator
        is inserted between pages.
    """
    # The context manager closes the document (and its underlying file
    # handle) even if extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding the accumulated string once per page.
        return "".join(page.get_text() for page in doc)

# Step 4: Extract tables from PDF using pdfplumber
def extract_tables_from_pdf(pdf_path):
    """Extract every table pdfplumber finds in a PDF as pandas DataFrames.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to
            ``pdfplumber.open``.

    Returns:
        A list of ``pandas.DataFrame`` objects in page order. For each table
        the first extracted row is used as the column header and the
        remaining rows as data. The list is empty when no table is found.
    """
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_tables() (plural) yields all tables on the page;
            # extract_table() (singular) would return only one of them.
            for rows in page.extract_tables():
                if not rows:
                    continue  # nothing usable, not even a header row
                header, *body = rows
                tables.append(pd.DataFrame(body, columns=header))
    return tables

# Step 5: Split text into chunks
def split_text_into_chunks(text, chunk_size=1000, chunk_overlap=100):
    """Split text into overlapping chunks for embedding.

    Args:
        text: The full text to split.
        chunk_size: Value forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 1000.
        chunk_overlap: Value forwarded to ``RecursiveCharacterTextSplitter``
            as ``chunk_overlap``. Defaults to 100.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        ``text`` (used by the rest of this file as a list of strings).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)

# Step 6: Create a FAISS VectorStore
def create_vectorstore(chunks):
    """Embed text chunks and index them in a flat L2 FAISS index.

    Args:
        chunks: Non-empty sequence of text chunks to embed.

    Returns:
        A tuple ``(index, chunks, embedder)``: the populated
        ``faiss.IndexFlatL2``, the same ``chunks`` object that was passed in,
        and the ``SentenceTransformer`` used to produce the embeddings (so
        queries can be encoded with the same model).

    Raises:
        ValueError: If ``chunks`` is empty. Without this check the failure
            would surface later as an opaque ``IndexError`` when reading the
            embedding dimension.
    """
    # Checked before loading the model so an empty document fails fast.
    if len(chunks) == 0:
        raise ValueError(
            "Cannot build a vector store: no text chunks were provided."
        )

    embedder = SentenceTransformer('all-mpnet-base-v2')
    embeddings = embedder.encode(chunks)
    # The index dimension is taken from the embedding matrix's second axis.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks, embedder

# Step 7: Query VectorStore for relevant content
def query_vectorstore(query, index, chunks, embedder, top_k=3):
    """Return the chunks the index reports as nearest to the query.

    Args:
        query: The user's query text.
        index: Object exposing ``search(vectors, k=...)`` that returns
            ``(distances, indices)``, such as the FAISS index built by
            ``create_vectorstore``.
        chunks: The chunks that were indexed, in the same order.
        embedder: Object exposing ``encode(list_of_texts)``.
        top_k: Maximum number of chunks to return. Defaults to 3.

    Returns:
        A list of at most ``top_k`` chunks, in the order the index returned
        them. Empty when there are no chunks or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than there are chunks.
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    query_vector = embedder.encode([query])
    _distances, indices = index.search(query_vector, k=k)
    # An index may pad missing results with -1; used as a list index that
    # would silently return the *last* chunk, so drop out-of-range ids.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

# Step 8: Load lightweight and large models
def load_lightweight_model():
    """Build a text-generation pipeline around the ``distilgpt2`` checkpoint.

    Returns:
        The object returned by ``pipeline("text-generation", ...)`` for the
        ``distilgpt2`` tokenizer and causal language model.
    """
    checkpoint = "distilgpt2"
    return pipeline(
        "text-generation",
        # Keyword arguments are evaluated left to right, so the tokenizer is
        # still loaded before the model.
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForCausalLM.from_pretrained(checkpoint),
    )

def load_gpt_model():
    """Build a text-generation pipeline around ``tiiuae/falcon-7b-instruct``.

    The model is loaded with ``device_map="auto"`` and ``torch_dtype="auto"``
    passed to ``from_pretrained``.

    Returns:
        The object returned by ``pipeline("text-generation", ...)`` for the
        Falcon tokenizer and causal language model.
    """
    checkpoint = "tiiuae/falcon-7b-instruct"
    return pipeline(
        "text-generation",
        # Keyword arguments are evaluated left to right, so the tokenizer is
        # still loaded before the model.
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForCausalLM.from_pretrained(
            checkpoint,
            device_map="auto",
            torch_dtype="auto",
        ),
    )

# Step 9: Solve mathematical equations
def solve_math_equation(equation):
    """Solve a single-variable equation in ``x`` and describe the result.

    The equation is written as ``<lhs> = <rhs>``. ``^`` is accepted as the
    power operator, and a number directly followed by ``x`` or ``(`` is
    treated as a multiplication (``3x`` -> ``3*x``).

    Args:
        equation: The equation text, e.g. ``"3x + 7 = 22"`` or
            ``"x^2 - 5*x + 6 = 0"``.

    Returns:
        ``"Solution: x = [...]"`` with the list returned by ``sympy.solve``,
        or ``"Error solving equation: <reason>"`` when the text is rejected
        or cannot be parsed or solved. This function never raises.
    """
    try:
        equation = equation.replace("^", "**")  # Convert ^ to ** for Python syntax

        # SECURITY: the text comes straight from the user and is eval()'d
        # below. Only digits, x, whitespace, arithmetic operators, dots,
        # parentheses and "=" are let through, so no names other than x,
        # attribute access or calls on named objects can reach eval().
        if not re.fullmatch(r"[0-9x\s+\-*/().=]*", equation):
            raise ValueError("unsupported characters in equation")

        # Make implicit multiplication explicit: "3x" -> "3*x", "2(x+1)" -> "2*(x+1)".
        equation = re.sub(r"(\d)\s*(?=[x(])", r"\1*", equation)

        sides = equation.split("=")
        x = symbols("x")
        # NOTE(review): eval() is still used to build the expression. It runs
        # with no builtins and only "x" in scope, but expressions such as
        # huge powers can still burn CPU; consider a dedicated parser.
        scope = {"x": x}
        lhs = eval(sides[0], {"__builtins__": {}}, scope)
        rhs = eval(sides[1], {"__builtins__": {}}, scope)
        solutions = solve(Eq(lhs, rhs), x)
        return f"Solution: x = {solutions}"
    except Exception as e:
        # Best-effort helper for an interactive loop: report, don't crash.
        return f"Error solving equation: {str(e)}"

# Step 10: Evaluate Boolean expressions
def evaluate_boolean_expression(expression):
    """Evaluate a Boolean expression and describe the result.

    Accepts ``True``/``False``, numeric literals, ``and``/``or``/``not``
    (also written in upper case as ``AND``/``OR``/``NOT``), comparisons,
    parentheses and ``+ - * /`` arithmetic. Anything else (names, calls,
    attribute access, strings, ...) is rejected.

    Args:
        expression: The expression text, e.g. ``"True AND NOT False"``.

    Returns:
        ``"Result: <value>"`` on success, or
        ``"Error evaluating Boolean expression: <reason>"`` when the text is
        rejected or fails to evaluate. This function never raises.
    """
    import ast  # local import: only this function needs it

    # Every AST node type the expression may contain.
    allowed_nodes = (
        ast.Expression, ast.Constant,
        ast.BoolOp, ast.And, ast.Or,
        ast.UnaryOp, ast.Not, ast.USub, ast.UAdd,
        ast.Compare, ast.Eq, ast.NotEq, ast.Lt, ast.LtE, ast.Gt, ast.GtE,
        ast.BinOp, ast.Add, ast.Sub, ast.Mult, ast.Div,
    )
    try:
        # Upper-case operators are not Python; map whole words only so that
        # e.g. "ORANGE" is left alone (and later rejected as a name).
        normalized = re.sub(
            r"\b(AND|OR|NOT)\b",
            lambda match: match.group(1).lower(),
            expression,
        )
        tree = ast.parse(normalized.strip(), mode="eval")

        # SECURITY: the text comes straight from the user. It is only
        # evaluated after every node has been checked against the whitelist,
        # so names, calls and attribute access never reach eval().
        for node in ast.walk(tree):
            if not isinstance(node, allowed_nodes):
                raise ValueError(
                    f"unsupported syntax: {type(node).__name__}"
                )
            if isinstance(node, ast.Constant) and not isinstance(
                node.value, (bool, int, float)
            ):
                raise ValueError(
                    f"unsupported literal: {node.value!r}"
                )

        code = compile(tree, "<boolean-expression>", "eval")
        result = eval(code, {"__builtins__": {}}, {})
        return f"Result: {result}"
    except Exception as e:
        # Best-effort helper for an interactive loop: report, don't crash.
        return f"Error evaluating Boolean expression: {str(e)}"

# Step 11: Generate responses using GPT
def generate_answer(context, query, gpt_pipeline):
    """Generate an answer to ``query`` grounded in ``context``.

    Args:
        context: Retrieved text placed in the prompt ahead of the question.
        query: The user's question.
        gpt_pipeline: Callable invoked as ``gpt_pipeline(prompt, **kwargs)``
            that returns a list whose first item is a dict with a
            ``'generated_text'`` entry (the shape this function indexes).

    Returns:
        The generated answer only, with surrounding whitespace removed. The
        prompt (context and question) is not echoed back.
    """
    input_prompt = f"Context: {context}\n\nAnswer the following question:\n{query}\n\nAnswer:"
    response = gpt_pipeline(
        input_prompt,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        # Ask for the continuation only; by default the generated text
        # starts with the full prompt.
        return_full_text=False,
    )
    answer = response[0]['generated_text']
    # Belt and braces: if the prompt was echoed anyway, cut it off here.
    if answer.startswith(input_prompt):
        answer = answer[len(input_prompt):]
    return answer.strip()

# Step 12: Save learner progress
def save_progress(data, filename="learner_progress.json"):
    """Append one progress record to a JSON file holding a list of records.

    Args:
        data: JSON-serialisable record to append.
        filename: Path of the JSON file. If it exists, its content is loaded
            and ``data`` is appended to it; otherwise a new list is started.

    The whole list is then written back to ``filename`` with 4-space
    indentation.
    """
    history = []
    if os.path.exists(filename):
        with open(filename, "r") as source:
            history = json.load(source)

    history.append(data)

    with open(filename, "w") as target:
        json.dump(history, target, indent=4)

# Step 13: Main function
def main_pipeline(pdf_path):
    """Prepare everything the chatbot needs from one PDF.

    Extracts the PDF's text and tables, chunks and indexes the text, and
    loads the generation model.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A tuple ``(index, chunks, embedder, gpt_pipeline, tables)``.
    """
    raw_text = extract_text_from_pdf(pdf_path)
    pdf_tables = extract_tables_from_pdf(pdf_path)
    index, chunks, embedder = create_vectorstore(
        split_text_into_chunks(raw_text)
    )
    # The model is loaded last, after the vector store has been built.
    return index, chunks, embedder, load_gpt_model(), pdf_tables

# Step 14: Upload PDF
# Prompt for a file upload (google.colab helper). The result is used as a
# mapping here; presumably it is keyed by uploaded file name - confirm
# against the google.colab documentation.
uploaded_file = files.upload()
# Use the first key as the PDF path.
# NOTE(review): if the runtime renames the file on a name collision, verify
# that this key is the path the file was actually saved under.
pdf_path = list(uploaded_file.keys())[0]
# These module-level names are read as globals by interactive_learning().
index, chunks, embedder, gpt_pipeline, tables = main_pipeline(pdf_path)

# Step 15: Interactive Learning Chatbot
def interactive_learning():
    """Run a console chat loop that routes each query to a handler.

    Reads queries with ``input()`` until the user types ``exit`` or ``quit``
    (case-insensitive). Each query goes to the first handler that applies:

    1. Equation solver - the whole input consists of equation characters
       (digits, ``x``, whitespace, ``+ - * / ^ = ( ) .``) and contains ``=``.
    2. Boolean evaluator - every alphabetic word in the input is a Boolean
       keyword and at least one is ``AND``, ``OR``, ``NOT``, ``True`` or
       ``False``.
    3. Table printer - the input mentions "table" and tables were extracted.
    4. Retrieval + generation - everything else.

    Reads the module-level names ``index``, ``chunks``, ``embedder``,
    ``gpt_pipeline`` and ``tables``.
    """
    # Matched against the WHOLE input: a search for "any equation character"
    # would accept nearly every sentence, sending any query that merely
    # contains "=" to the solver.
    equation_pattern = re.compile(r'[x0-9\s+\-*/^=().]+')
    word_pattern = re.compile(r'[A-Za-z_]+')
    boolean_triggers = {"AND", "OR", "NOT", "True", "False"}
    boolean_words = boolean_triggers | {"and", "or", "not"}

    print("\nPDF successfully loaded. Type any query below:")

    while True:
        user_input = input("\nYou: ").strip()

        if user_input.lower() in ("exit", "quit"):
            print("Exiting interactive mode. Goodbye!")
            break

        # Math Equation Handling
        if "=" in user_input and equation_pattern.fullmatch(user_input):
            print(solve_math_equation(user_input))
            continue

        # Boolean Expression Handling
        # Compare whole words, not substrings, so ordinary text such as
        # "HISTORY" (contains "OR") is not mistaken for an expression.
        words = word_pattern.findall(user_input)
        if boolean_triggers.intersection(words) and boolean_words.issuperset(words):
            print(evaluate_boolean_expression(user_input))
            continue

        # Table Query Handling
        if "table" in user_input.lower() and tables:
            print("Extracted Table Data:")
            for table in tables:
                print(table)
            continue

        # Vector Store Search (For Text Queries)
        results = query_vectorstore(user_input, index, chunks, embedder)
        context = " ".join(results)

        if context:
            response = generate_answer(context, user_input, gpt_pipeline)
            print(f"Assistant: {response}")
        else:
            print("Assistant: Sorry, I couldn't find relevant content. Try rephrasing.")

# Start interactive learning session
# Blocks on input() until the user types "exit" or "quit".
interactive_learning()


Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdfium2-4.30.1-py3-non

Saving testing doc.pdf to testing doc (1).pdf


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0



PDF successfully loaded. Type any query below:

You: Summarize Chapter 2 of the document


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Assistant: Context: milestones: 
 
- **Ancient Civilizations:**   
  Mesopotamia, Egypt, and the Indus Valley developed early forms of writing, architecture, and 
governance. 
   
- **Classical Period:**   
  Greek and Roman societies laid the foundations for modern law, philosophy, and arts. 
   
- **Medieval Times:**   
  The Middle Ages witnessed the rise of feudal systems, religious institutions, and the preservation of 
classical knowledge. 
   
- **The Renaissance and Enlightenment:**   
  These periods were marked by cultural rebirth and scientific inquiry, leading to breakthroughs in art, 
science, and human rights. 
   
- **Modern Era:**   
  The industrial revolution, world conflicts, and the digital age have reshaped societal structures and 
global interactions. 
 
This chapter not only serves as an educational resource but also as a test for text extraction and 
chunking across varied narrative styles. 
 
Chapter 3: Mathematical Concepts Comprehensive Sample Document 
 
Cha

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Assistant: Context: chunking across varied narrative styles. 
 
Chapter 3: Mathematical Concepts 
 
Mathematics is the language of the universe, underlying theories and applications in science and 
technology. In this chapter, we explore several mathematical ideas and problems: 
 
Example 1: Simple Arithmetic   
Evaluate the expression:   
2 + 3 * 4 - 5 
 
Example 2: Solving a Linear Equation   
Solve for x in:   
3x + 7 = 22 
 
Example 3: Quadratic Equation   
Consider the quadratic equation:   
x² - 5x + 6 = 0   
Find the values of x. 
 
Example 4: Calculus Problem   
Evaluate the integral:   
∫ (x² + 2x) dx from 0 to 1 
 
Each example is designed to test the pipeline’s ability to extract, process, and evaluate mathematical 
expressions. 
 
Chapter 4: Data Tables 
 
Tables are essential for presenting structured data clearly. Below is an example of a data table in CSV 
format: 
 
Name, Age, Department, Salary   
Alice, 28, Engineering, 85000   
Bob, 35, Marketing, 95000 computing and

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Assistant: Context: Comprehensive Sample Document 
 
Chapter 1: Introduction 
 
This document is created as a comprehensive sample for testing various features of a PDF processing 
pipeline. The purpose of this document is to provide sufficient content in multiple sections, chapters, 
and formats to enable thorough testing of text extraction, text chunking, vector search, 
mathematical problem solving, table processing, and interactive learning functionalities. Throughout 
the document, you will encounter diverse topics, including historical overviews, mathematical 
reasoning, data analysis, and logical evaluation. This document is designed to simulate a real-world 
educational resource and serve as a robust testing ground. 
 
Chapter 2: Historical Overview 
 
History has always been a crucial aspect of human civilization. Over the centuries, societies have 
evolved, bringing forth innovations that have shaped our modern world. Consider the following 
milestones: 
 
- **Ancient Civiliz

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Assistant: Context: strategy accordingly. This section is intended to provide a testing environment for vector search and 
content segmentation features. 
 
Chapter 7: Advanced Topics in Computer Science 
 
This chapter explores more advanced concepts that have a direct impact on the development of 
intelligent systems: 
 
- **Artificial Intelligence (AI):**   
  AI technologies, such as machine learning and deep learning, are transforming industries by 
enabling systems to learn from data and make autonomous decisions. 
   
- **Natural Language Processing (NLP):**   
  NLP enables computers to understand and interpret human language. This technology is vital for 
tasks like text summarization, sentiment analysis, and question-answering. 
   
- **Big Data and Cloud Computing:**   
  Handling and processing large datasets efficiently requires scalable solutions, often leveraging cloud 
computing and distributed processing systems. 
   
- **Cybersecurity:** Comprehensive Sample Document 