In [17]:
from dotenv import load_dotenv
from docx import Document
from PIL import Image
import pytesseract
import easyocr
import os
import json
import time
import google.generativeai as genai

load_dotenv() 

True

In [18]:
def check_api_key():
    """
    Check if Gemini API key is valid and working.
    
    Returns:
        dict: Status information about the API key
    """
    print("üîç Checking Gemini API key...")
    
    # Get API key - try ANTHROPIC_API_KEY first (from .env), then GEMINI_API_KEY
    api_key = os.getenv('GEMINI_API_KEY')
    
    if not api_key or api_key == '':
        return {
            "status": "‚ùå ERROR",
            "message": "No API key found. Please set ANTHROPIC_API_KEY or GEMINI_API_KEY in your .env file.",
            "valid": False
        }
    
    # Mask API key for display (show first 7 and last 4 characters)
    masked_key = f"{api_key[:7]}...{api_key[-4:]}" if len(api_key) > 11 else "***"
    print(f"üìù API Key: {masked_key}")
    
    try:
        genai.configure(api_key=api_key)
        
        # Make a simple test call
        print("üß™ Testing API connection...")
        model = genai.GenerativeModel('gemini-pro')
        response = model.generate_content("Say 'OK'")
        
        result_text = response.text.strip()
        
        print(f"‚úÖ API Key is VALID!")
        print(f"üì§ Response: {result_text}")
        
        return {
            "status": "‚úÖ VALID",
            "message": "API key is working correctly",
            "valid": True,
            "model_tested": "gemini-pro",
            "response": result_text
        }
        
    except Exception as e:
        error_msg = str(e)
        print(f"‚ùå API Key check FAILED: {error_msg}")
        
        return {
            "status": "‚ùå ERROR",
            "message": error_msg,
            "valid": False
        }

# Run the check
api_status = check_api_key()
print("\n" + "=" * 60)
print("API KEY STATUS SUMMARY:")
print("=" * 60)
print(json.dumps(api_status, indent=2))

üîç Checking Gemini API key...
üìù API Key: AIzaSyD...Tjuc
üß™ Testing API connection...
‚ùå API Key check FAILED: 404 models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.

API KEY STATUS SUMMARY:
{
  "status": "\u274c ERROR",
  "message": "404 models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.",
  "valid": false
}


In [19]:

# List available models
api_key = os.getenv('ANTHROPIC_API_KEY') or os.getenv('GEMINI_API_KEY')
genai.configure(api_key=api_key)

print("üîç Available Gemini Models:")
print("=" * 60)
for model in genai.list_models():
    print(f"  ‚Ä¢ {model.name}")
print("=" * 60)

üîç Available Gemini Models:
  ‚Ä¢ models/gemini-2.5-flash
  ‚Ä¢ models/gemini-2.5-pro
  ‚Ä¢ models/gemini-2.0-flash
  ‚Ä¢ models/gemini-2.0-flash-001
  ‚Ä¢ models/gemini-2.0-flash-exp-image-generation
  ‚Ä¢ models/gemini-2.0-flash-lite-001
  ‚Ä¢ models/gemini-2.0-flash-lite
  ‚Ä¢ models/gemini-exp-1206
  ‚Ä¢ models/gemini-2.5-flash-preview-tts
  ‚Ä¢ models/gemini-2.5-pro-preview-tts
  ‚Ä¢ models/gemma-3-1b-it
  ‚Ä¢ models/gemma-3-4b-it
  ‚Ä¢ models/gemma-3-12b-it
  ‚Ä¢ models/gemma-3-27b-it
  ‚Ä¢ models/gemma-3n-e4b-it
  ‚Ä¢ models/gemma-3n-e2b-it
  ‚Ä¢ models/gemini-flash-latest
  ‚Ä¢ models/gemini-flash-lite-latest
  ‚Ä¢ models/gemini-pro-latest
  ‚Ä¢ models/gemini-2.5-flash-lite
  ‚Ä¢ models/gemini-2.5-flash-image
  ‚Ä¢ models/gemini-2.5-flash-preview-09-2025
  ‚Ä¢ models/gemini-2.5-flash-lite-preview-09-2025
  ‚Ä¢ models/gemini-3-pro-preview
  ‚Ä¢ models/gemini-3-flash-preview
  ‚Ä¢ models/gemini-3-pro-image-preview
  ‚Ä¢ models/nano-banana-pro-preview
  ‚Ä¢ models/gemini-robotic

In [20]:
def extract_text(filepath, ocr_method='easyocr', languages=['en']):
    """
    Extract text from any document type (PDF, DOCX, or Image).
    Works as standard for all document formats.
    
    Args:
        filepath (str): Path to document file
        ocr_method (str): 'easyocr' for images (default)
        languages (list): Languages for OCR (default: English)
        
    Returns:
        str: Extracted text from document
    """
    file_ext = os.path.splitext(filepath)[1].lower()
    
    try:
        # Handle PDF files
        if file_ext == '.pdf':
            import fitz
            text = ""
            doc = fitz.open(filepath)
            for page_num, page in enumerate(doc):
                text += f"\n--- Page {page_num + 1} ---\n"
                text += page.get_text()
            doc.close()
            return text
        
        # Handle Word documents
        elif file_ext == '.docx':
            from docx import Document
            text = ""
            doc = Document(filepath)
            for para in doc.paragraphs:
                if para.text.strip():
                    text += para.text + "\n"
            for table in doc.tables:
                for row in table.rows:
                    row_text = [cell.text for cell in row.cells]
                    text += " | ".join(row_text) + "\n"
            return text
        
        # Handle image files
        elif file_ext in ['.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff']:
            reader = easyocr.Reader(languages)
            results = reader.readtext(filepath)
            text = " ".join([detection[1] for detection in results])
            return text
        
        else:
            print(f"‚ùå Error: Unsupported file type '{file_ext}'")
            return ""
    
    except Exception as e:
        print(f"‚ùå Error extracting text: {str(e)}")
        return ""

In [21]:
def extract_information_ai(text, extraction_instructions=None, max_retries=3, retry_delay=5):
    """
    Use Google Gemini AI to intelligently extract relevant information from document.
    Automatically filters out irrelevant content, boilerplate, and noise.
    Includes automatic retry logic for transient API errors.
    
    Args:
        text (str): Full extracted document text
        extraction_instructions (str): Specific instructions on what to extract
        max_retries (int): Maximum number of retry attempts (default: 3)
        retry_delay (int): Base delay in seconds between retries (default: 5)
        
    Returns:
        dict: Extracted information as structured JSON
    """
    # Get API key from environment variable - try ANTHROPIC_API_KEY first, then GEMINI_API_KEY
    api_key = os.getenv('ANTHROPIC_API_KEY') or os.getenv('GEMINI_API_KEY')
    if not api_key:
        return {
            "error": "No API key found. Please set ANTHROPIC_API_KEY or GEMINI_API_KEY in your .env file.",
            "retries_attempted": 0,
            "note": "Set API key environment variable before running.",
            "model_used": None
        }
    genai.configure(api_key=api_key)
    
    if extraction_instructions is None:
        extraction_instructions = """Extract all important and relevant information from this document.
        Ignore filler, boilerplate, headers, footers, and irrelevant content.
        Return comprehensive structured data with all key details."""
    
    # Truncate text if too long (Gemini has 32k context, but we'll be conservative)
    MAX_TEXT_LENGTH = 50000  # Leave room for prompt
    original_text_length = len(text)
    if len(text) > MAX_TEXT_LENGTH:
        print(f"‚ö†Ô∏è  Text is very long ({len(text)} chars). Truncating to {MAX_TEXT_LENGTH} chars...")
        text = text[:MAX_TEXT_LENGTH] + "\n\n[Document truncated due to length...]"
    
    prompt = f"""You are an expert information extraction specialist.

Your task: {extraction_instructions}

From the document below, extract ONLY the most important and relevant information.
Ignore any irrelevant, redundant, boilerplate, footer, or noise content.

IMPORTANT JSON FORMAT REQUIREMENT FOR TOPICS ONLY:
- Extract all other information normally in their respective fields (course details, faculty, assessments, etc.)
- For topics: Collect ALL topics from the document (from all units, weeks, sections) and combine them into a SINGLE "topics" array
- If topics appear in multiple places (Unit 1 topics, Unit 2 topics, weekly topics, etc.), merge them all into one "topics" array
- Each topic should be a string in the array
- Example: {{"course_code": "...", "faculty": {{...}}, "topics": ["Topic from Unit 1", "Topic from Unit 2", "Topic from Week 1", ...], "assessments": {{...}}}}

Return the extracted information as valid JSON. Be comprehensive but concise.

DOCUMENT:
---
{text}
---

Return ONLY valid JSON format. No additional text or explanation. Keep all other fields separate, but combine ALL topics into one "topics" array."""
    
    models_to_try = ["gemini-2.5-flash", "gemini-2.5-pro"]
    current_model_idx = 0
    last_error = None
    successful_model = None
    
    for attempt in range(1, max_retries + 1):
        try:
            # Try current model
            model_name = models_to_try[current_model_idx]
            
            if attempt > 1:
                print(f"üîÑ Retrying with {model_name} (attempt {attempt}/{max_retries})...")
            
            # Initialize Gemini model
            model = genai.GenerativeModel(model_name)
            response = model.generate_content(prompt)
            
            response_text = response.text.strip()
            successful_model = model_name
            
            try:
                result = json.loads(response_text)
                return {
                    "extracted_information": result,
                    "model_used": successful_model
                }
            except json.JSONDecodeError:
                return {
                    "extracted_information": {"extracted_content": response_text},
                    "model_used": successful_model
                }
        
        except Exception as e:
            # Handle any API errors
            last_error = str(e)
            error_str_lower = last_error.lower()
            
            print(f"‚ùå Error with {models_to_try[current_model_idx]} (attempt {attempt}/{max_retries}): {last_error}")
            
            # Determine if we should retry
            should_retry = False
            
            if "quota" in error_str_lower or "rate_limit" in error_str_lower:
                print(f"‚è≥ Rate limit or quota hit. Waiting before retry...")
                should_retry = attempt < max_retries
                if should_retry:
                    wait_time = retry_delay * (2 ** (attempt - 1))
                    time.sleep(wait_time)
            elif "500" in last_error or "internal" in error_str_lower:
                print(f"üîÑ Server error. Retrying...")
                should_retry = attempt < max_retries
                if should_retry:
                    time.sleep(retry_delay * attempt)
            elif attempt < max_retries:
                # Try next model
                if current_model_idx < len(models_to_try) - 1:
                    current_model_idx += 1
                    print(f"üîÑ Switching to {models_to_try[current_model_idx]}...")
                    should_retry = True
                else:
                    # Retry with same model after waiting
                    should_retry = True
                    wait_time = retry_delay * attempt
                    print(f"‚è≥ Waiting {wait_time} seconds before retry...")
                    time.sleep(wait_time)
            
            if not should_retry:
                break
            continue
    
    # Final error return
    return {
        "error": last_error or "Unknown error during AI extraction",
        "retries_attempted": max_retries,
        "note": "Failed to extract information. Check API key and try again.",
        "model_used": None
    }

In [22]:
# file_path = "document.pdf"  # Change this to your actual file
# # extraction_task = "Extract all important information"  # Change to your needs

# # Example 1: Extract from PDF
# # print("=" * 60)
# # print("Extracting from PDF...")
# # print("=" * 60)
# # pdf_text = extract_text("document.pdf")
# # print(f"Extracted {len(pdf_text)} characters\n")

# # # Extract information with custom instructions
# # result = extract_information_ai(
# #     pdf_text,
# #     extraction_instructions="Extract all important facts, dates, names, and numbers"
# # )
# # print(json.dumps(result, indent=2))

# # Example 2: Extract from Word document
# print("\n" + "=" * 60)
# print("Extracting from Word document...")
# print("=" * 60)
# docx_text = extract_text("document.docx")
# print(f"Extracted {len(docx_text)} characters\n")

# result = extract_information_ai(
#     docx_text,
#     extraction_instructions="Extract key information and details"
# )
# print(json.dumps(result, indent=2))

# # # Example 3: Extract from Image
# # print("\n" + "=" * 60)
# # print("Extracting from Image...")
# # print("=" * 60)
# # img_text = extract_text("image.jpg")
# # print(f"Extracted {len(img_text)} characters\n")

# # result = extract_information_ai(
# #     img_text,
# #     extraction_instructions="Extract all text and important information"
# # )
# # print(json.dumps(result, indent=2))

In [23]:
file_path = "./document.docx"

print(f"Extracting text from: {file_path}")
extracted_text = extract_text(file_path)
print(f"Extracted {len(extracted_text)} characters\n")
print("=" * 60)
print("EXTRACTED TEXT:")
print("=" * 60)
print(extracted_text)
print("=" * 60)

Extracting text from: ./document.docx
Extracted 21096 characters

EXTRACTED TEXT:
BCA 301-4 DOT NET COURSE PLAN
SECTION I
LAB EXERCISES
SECTION III
Mapping: 
 Mapping of needs with the syllabus:
Please map the COs and Unit/Topic details to the following wherever applicable. If not applicable, please enter that.
Mapping:  A template to map the Learning Outcomes of the course against the components of assessment is given below:
Assessment outline:
 SECTION IV
Assignment Component:
A template to map the Course Outcomes against the components of the  assessment is given below:
Assessment outline:
Assessment Description: CIA ‚Äì I
Evaluation Rubrics:
CIA ‚Äì II (Mid Semester Exam)- 50 Marks-Centralized
Assessment Description: ESE - I 
 Evaluation rubrics:
 Mapping the Learning Outcomes with components of the evaluation rubrics:
Assessment Description: ESE ‚Äì II 
Evaluation Rubrics:
Assessment Description: ESE ‚Äì III
 Evaluation Rubrics for project:
* * * * * * * * * 
Semester | IV | Progr

In [24]:
print("\nExtracting information with AI...")
result = extract_information_ai(extracted_text)
print("\n" + "=" * 60)
print("EXTRACTED INFORMATION:")
print("=" * 60)

# Display model used separately (outside JSON)
if "model_used" in result and result["model_used"]:
    print(f"ü§ñ Model Used: {result['model_used']}")
    print("=" * 60)
    print("\nüìÑ Extracted Data (JSON):")
    print("=" * 60)

# Display the extracted information (JSON)
if "extracted_information" in result:
    print(json.dumps(result["extracted_information"], indent=2))
elif "error" in result:
    print(json.dumps(result, indent=2))
else:
    print(json.dumps(result, indent=2))


Extracting information with AI...
‚ùå Error with gemini-2.5-flash (attempt 1/3): 403 Your API key was reported as leaked. Please use another API key.
üîÑ Switching to gemini-2.5-pro...
üîÑ Retrying with gemini-2.5-pro (attempt 2/3)...
‚ùå Error with gemini-2.5-pro (attempt 2/3): 403 Your API key was reported as leaked. Please use another API key.
‚è≥ Waiting 10 seconds before retry...
üîÑ Retrying with gemini-2.5-pro (attempt 3/3)...
‚ùå Error with gemini-2.5-pro (attempt 3/3): 403 Your API key was reported as leaked. Please use another API key.

EXTRACTED INFORMATION:
{
  "error": "403 Your API key was reported as leaked. Please use another API key.",
  "retries_attempted": 3,
  "note": "Failed to extract information. Check API key and try again.",
  "model_used": null
}
