In [2]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [3]:
import os
import re
import sqlite3
from pathlib import Path
import PyPDF2
from typing import List, Dict, Optional

class DatabaseError(Exception):
    """Error type used to report failed database operations."""

class PDFError(Exception):
    """Error type used to report failed PDF operations."""

class QuestionExtractor:
    """Extract chapter-grouped multiple-choice questions from a PDF file.

    The expected text layout (after PDF text extraction) is::

        Chapter <n>: <chapter title>
        <k>. <question text>
        A) <option>
        B) <option>
        C) <option>
        D) <option>
        Answer: <letter>) <answer text>

    Question numbers are recognised only at the start of a line, so decimal
    numbers inside options or answers (e.g. "6.022 x 10^23") are not mistaken
    for the start of the next question.
    """

    # Chapter heading; group 1 is the chapter title.
    _CHAPTER_RE = re.compile(r"Chapter \d+: (.+?)\n")

    # One question with its four options and answer line.
    # MULTILINE lets ``^`` match at each line start; DOTALL lets fields span lines.
    # The trailing lookahead ends the answer text at the next line that begins
    # with "<digits>. " or at the end of the chapter text.
    _QUESTION_RE = re.compile(
        r"^[ \t]*(\d+)\.\s+(.*?)\nA\)(.*?)\nB\)(.*?)\nC\)(.*?)\nD\)(.*?)"
        r"\nAnswer:\s+([A-D])\)(.*?)(?=^[ \t]*\d+\.\s|\Z)",
        re.DOTALL | re.MULTILINE,
    )

    def __init__(self, pdf_path: str):
        """Remember the path of the PDF to read.

        Args:
            pdf_path: Filesystem path of the PDF file.
        """
        self.pdf_path = pdf_path
        self.current_chapter = ""

    def extract_chapters_and_questions(self) -> List[Dict]:
        """Read the PDF and return every question found in it.

        Returns:
            A list of question dicts as produced by ``_parse_content``.

        Raises:
            PDFError: If the file does not exist, cannot be read as a PDF,
                or any other error occurs while processing it.
        """
        # Checked outside the try block so the "not found" message is not
        # re-wrapped as an "Unexpected error".
        if not os.path.exists(self.pdf_path):
            raise PDFError(f"PDF file not found at {self.pdf_path}")

        try:
            with open(self.pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # Guard against a page yielding no text, and join once
                # instead of growing a string page by page.
                all_text = "".join(
                    (page.extract_text() or "") + "\n"
                    for page in pdf_reader.pages
                )

            return self._parse_content(all_text)

        except PDFError:
            raise
        # PdfReadError is defined in PyPDF2.errors.
        except PyPDF2.errors.PdfReadError as e:
            raise PDFError(f"Error reading PDF: {str(e)}") from e
        except Exception as e:
            raise PDFError(f"Unexpected error processing PDF: {str(e)}") from e

    def _parse_content(self, content: str) -> List[Dict]:
        """Split extracted text into chapters and parse each chapter's questions.

        Args:
            content: Full text of the document.

        Returns:
            A list of dicts with the keys ``chapter_name``, ``question_number``
            (string of digits), ``question_text``, ``option_a`` .. ``option_d``,
            ``correct_answer`` (one of "A".."D") and ``answer_text``. Text that
            appears before the first chapter heading is ignored.
        """
        questions = []
        chapter_matches = list(self._CHAPTER_RE.finditer(content))

        for index, chapter_match in enumerate(chapter_matches):
            chapter_name = chapter_match.group(1).strip()
            chapter_start = chapter_match.end()

            # A chapter runs until the next chapter heading or the end of text.
            if index + 1 < len(chapter_matches):
                chapter_end = chapter_matches[index + 1].start()
            else:
                chapter_end = len(content)

            chapter_content = content[chapter_start:chapter_end]

            for match in self._QUESTION_RE.finditer(chapter_content):
                questions.append({
                    'chapter_name': chapter_name,
                    'question_number': match.group(1),
                    'question_text': match.group(2).strip(),
                    'option_a': match.group(3).strip(),
                    'option_b': match.group(4).strip(),
                    'option_c': match.group(5).strip(),
                    'option_d': match.group(6).strip(),
                    'correct_answer': match.group(7),
                    'answer_text': match.group(8).strip()
                })

        return questions

class DatabaseManager:
    """Thin wrapper around a SQLite database of subjects, chapters and questions.

    Typical use is ``connect()`` -> ``create_tables()`` -> ``insert_*`` ->
    ``close()``. The manager can also be used as a context manager, which
    connects on entry and closes on exit::

        with DatabaseManager(path) as db:
            db.create_tables()

    Every method that touches the database raises ``DatabaseError`` on failure,
    including when it is called before ``connect()``.
    """

    def __init__(self, db_path: str = "chemistry_questions.db"):
        """Store the database path; no connection is opened yet.

        Args:
            db_path: Path of the SQLite database file.
        """
        self.db_path = db_path
        self.conn = None
        self.cursor = None

    def __enter__(self):
        """Open the connection and return the manager itself."""
        self.connect()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection; exceptions from the ``with`` body propagate."""
        self.close()
        return False

    def _require_connection(self):
        """Raise DatabaseError if ``connect()`` has not been called."""
        if self.conn is None or self.cursor is None:
            raise DatabaseError("Database connection is not open; call connect() first")

    def _rollback(self):
        """Discard the pending transaction after a failed write."""
        try:
            self.conn.rollback()
        except sqlite3.Error:
            # Best effort only: the original error is what the caller needs to see.
            pass

    def connect(self):
        """Establish the database connection and create a cursor.

        An already-open connection is closed first so it is not leaked.

        Raises:
            DatabaseError: If SQLite cannot open the database.
        """
        self.close()
        try:
            self.conn = sqlite3.connect(self.db_path)
            self.cursor = self.conn.cursor()
        except sqlite3.Error as e:
            raise DatabaseError(f"Failed to connect to database: {str(e)}") from e

    def close(self):
        """Close the database connection if one is open; safe to call repeatedly."""
        if self.conn:
            self.conn.close()
        # Drop the stale handles so later calls fail with a clear DatabaseError.
        self.conn = None
        self.cursor = None

    def create_tables(self):
        """Create the subjects, chapters and questions tables if they don't exist.

        Raises:
            DatabaseError: If not connected or if table creation fails.
        """
        self._require_connection()
        try:
            self.cursor.execute('''
                CREATE TABLE IF NOT EXISTS subjects (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT NOT NULL UNIQUE
                )
            ''')

            self.cursor.execute('''
                CREATE TABLE IF NOT EXISTS chapters (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    subject_id INTEGER,
                    name TEXT NOT NULL,
                    FOREIGN KEY (subject_id) REFERENCES subjects(id),
                    UNIQUE(subject_id, name)
                )
            ''')

            self.cursor.execute('''
                CREATE TABLE IF NOT EXISTS questions (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    chapter_id INTEGER,
                    question_number INTEGER,
                    question_text TEXT NOT NULL,
                    option_a TEXT NOT NULL,
                    option_b TEXT NOT NULL,
                    option_c TEXT NOT NULL,
                    option_d TEXT NOT NULL,
                    correct_answer CHAR(1) NOT NULL,
                    answer_text TEXT NOT NULL,
                    FOREIGN KEY (chapter_id) REFERENCES chapters(id)
                )
            ''')

            self.conn.commit()
        except sqlite3.Error as e:
            self._rollback()
            raise DatabaseError(f"Failed to create tables: {str(e)}") from e

    def insert_subject(self, subject_name: str) -> int:
        """Insert a subject if it is not already present and return its ID.

        Args:
            subject_name: Name of the subject (unique in the subjects table).

        Returns:
            The ``id`` of the new or pre-existing subject row.

        Raises:
            DatabaseError: If not connected or if the insert/lookup fails.
        """
        self._require_connection()
        try:
            # OR IGNORE makes the insert a no-op when the name already exists.
            self.cursor.execute(
                "INSERT OR IGNORE INTO subjects (name) VALUES (?)",
                (subject_name,)
            )
            self.conn.commit()

            self.cursor.execute(
                "SELECT id FROM subjects WHERE name = ?",
                (subject_name,)
            )
            return self.cursor.fetchone()[0]
        except sqlite3.Error as e:
            self._rollback()
            raise DatabaseError(f"Failed to insert subject: {str(e)}") from e

    def insert_chapter(self, subject_id: int, chapter_name: str) -> int:
        """Insert a chapter if it is not already present and return its ID.

        Args:
            subject_id: ``id`` of the owning subject row.
            chapter_name: Chapter name (unique per subject).

        Returns:
            The ``id`` of the new or pre-existing chapter row.

        Raises:
            DatabaseError: If not connected or if the insert/lookup fails.
        """
        self._require_connection()
        try:
            self.cursor.execute(
                "INSERT OR IGNORE INTO chapters (subject_id, name) VALUES (?, ?)",
                (subject_id, chapter_name)
            )
            self.conn.commit()

            self.cursor.execute(
                "SELECT id FROM chapters WHERE subject_id = ? AND name = ?",
                (subject_id, chapter_name)
            )
            return self.cursor.fetchone()[0]
        except sqlite3.Error as e:
            self._rollback()
            raise DatabaseError(f"Failed to insert chapter: {str(e)}") from e

    def insert_question(self, question_data: Dict, chapter_id: int):
        """Insert one question row and commit it.

        Args:
            question_data: Mapping with the keys ``question_number``,
                ``question_text``, ``option_a`` .. ``option_d``,
                ``correct_answer`` and ``answer_text``.
            chapter_id: ``id`` of the chapter the question belongs to.

        Raises:
            DatabaseError: If not connected or if the insert fails.
        """
        self._require_connection()
        try:
            self.cursor.execute('''
                INSERT INTO questions (
                    chapter_id, question_number, question_text,
                    option_a, option_b, option_c, option_d,
                    correct_answer, answer_text
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                chapter_id,
                question_data['question_number'],
                question_data['question_text'],
                question_data['option_a'],
                question_data['option_b'],
                question_data['option_c'],
                question_data['option_d'],
                question_data['correct_answer'],
                question_data['answer_text']
            ))
            self.conn.commit()
        except sqlite3.Error as e:
            self._rollback()
            raise DatabaseError(f"Failed to insert question: {str(e)}") from e

def main():
    """Extract questions from the chemistry PDF and store them in SQLite.

    Reads "/content/Chemistry Questions.pdf", writes to
    "/content/chemistry_questions.db", and prints a one-line status message.
    PDF, database and unexpected errors are reported via ``print`` rather than
    raised. The database connection is always closed before returning.

    Note: questions are inserted unconditionally, so running this more than
    once against the same database stores the questions again.
    """
    content_dir = "/content"
    pdf_path = os.path.join(content_dir, "Chemistry Questions.pdf")
    db_path = os.path.join(content_dir, "chemistry_questions.db")

    # Defined up front so the finally block can test it without inspecting locals().
    db_manager = None

    try:
        # Create content directory if it doesn't exist
        Path(content_dir).mkdir(parents=True, exist_ok=True)

        # Extract all questions before touching the database
        extractor = QuestionExtractor(pdf_path)
        questions = extractor.extract_chapters_and_questions()

        # Initialize database manager
        db_manager = DatabaseManager(db_path)
        db_manager.connect()
        db_manager.create_tables()

        # Insert subject
        subject_id = db_manager.insert_subject("Chemistry")

        # Map each chapter name to its row id. Looking the id up per question
        # (instead of reusing the most recently inserted chapter's id) keeps
        # questions attached to the right chapter even if a chapter name
        # reappears after a different chapter.
        chapter_ids = {}

        for question in questions:
            chapter_name = question['chapter_name']

            if chapter_name not in chapter_ids:
                chapter_ids[chapter_name] = db_manager.insert_chapter(subject_id, chapter_name)

            db_manager.insert_question(question, chapter_ids[chapter_name])

        print("Successfully processed PDF and stored questions in database")

    except PDFError as e:
        print(f"PDF Error: {str(e)}")
    except DatabaseError as e:
        print(f"Database Error: {str(e)}")
    except Exception as e:
        print(f"Unexpected error: {str(e)}")
    finally:
        if db_manager is not None:
            db_manager.close()

# Run the extraction pipeline only when this code is executed as the main
# module, not when it is imported.
if __name__ == "__main__":
    main()

Successfully processed PDF and stored questions in database


In [26]:
import sqlite3

# Connect to the database written by the extraction step
conn = sqlite3.connect('/content/chemistry_questions.db')
try:
    cursor = conn.cursor()

    # Example query: get all questions for a specific chapter.
    # questions.chapter_id references chapters.id, so the join must use c.id
    # (joining on c.subject_id matches questions to the wrong rows).
    cursor.execute("""
        SELECT q.question_text, q.correct_answer
        FROM questions q
        JOIN chapters c ON q.chapter_id = c.id
        WHERE c.name = ?
    """, ('Basic concepts of chemistry',))

    # Each row has exactly the two selected columns
    for question, answer in cursor.fetchall():
        print(f"Q: {question}")
        print(f"A: {answer}\n")
finally:
    # Release the database file even if the query fails
    conn.close()


