In [8]:
from dotenv import load_dotenv
from docx import Document
from PIL import Image
import pytesseract
import easyocr
import os
import json
import time
from openai import OpenAI, APIError, APIConnectionError, RateLimitError

load_dotenv() 

True

In [9]:
def check_api_key():
    """
    Check if OpenAI API key is valid and working.
    
    Returns:
        dict: Status information about the API key
    """
    print("üîç Checking OpenAI API key...")
    
    # Get API key
    api_key = os.getenv('OPENAI_API_KEY')
    
    if not api_key or api_key == '':
        return {
            "status": "‚ùå ERROR",
            "message": "No API key found. Please set OPENAI_API_KEY in your .env file.",
            "valid": False
        }
    
    # Mask API key for display (show first 7 and last 4 characters)
    masked_key = f"{api_key[:7]}...{api_key[-4:]}" if len(api_key) > 11 else "***"
    print(f"üìù API Key: {masked_key}")
    
    try:
        client = OpenAI(api_key=api_key)
        
        # Make a simple test call
        print("üß™ Testing API connection...")
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            max_tokens=10,
            messages=[{"role": "user", "content": "Say 'OK'"}]
        )
        
        result_text = response.choices[0].message.content.strip()
        
        print(f"‚úÖ API Key is VALID!")
        print(f"üì§ Response: {result_text}")
        
        return {
            "status": "‚úÖ VALID",
            "message": "API key is working correctly",
            "valid": True,
            "model_tested": "gpt-3.5-turbo",
            "response": result_text
        }
        
    except APIError as e:
        status_code = getattr(e, 'status_code', None)
        error_type = getattr(e, 'type', None)
        
        if status_code == 401:
            error_msg = "Invalid API key. Please check your API key."
        elif status_code == 429:
            error_msg = "Rate limit exceeded. API key is valid but you've hit rate limits."
        elif status_code == 500:
            error_msg = "OpenAI server error. API key may be valid but server is having issues."
        else:
            error_msg = f"API Error: {error_type or str(e)}"
        
        print(f"‚ùå API Key check FAILED: {error_msg}")
        
        return {
            "status": "‚ùå ERROR",
            "message": error_msg,
            "valid": False,
            "status_code": status_code,
            "error_type": error_type
        }
        
    except APIConnectionError as e:
        error_msg = f"Connection error: {str(e)}"
        print(f"‚ùå Connection FAILED: {error_msg}")
        
        return {
            "status": "‚ùå CONNECTION ERROR",
            "message": error_msg,
            "valid": False
        }
        
    except Exception as e:
        error_msg = f"Unexpected error: {str(e)}"
        print(f"‚ùå Unexpected error: {error_msg}")
        
        return {
            "status": "‚ùå ERROR",
            "message": error_msg,
            "valid": False
        }

# Run the check
api_status = check_api_key()
print("\n" + "=" * 60)
print("API KEY STATUS SUMMARY:")
print("=" * 60)
print(json.dumps(api_status, indent=2))

üîç Checking OpenAI API key...
üìù API Key: sk-proj...seMA
üß™ Testing API connection...
‚ùå API Key check FAILED: Rate limit exceeded. API key is valid but you've hit rate limits.

API KEY STATUS SUMMARY:
{
  "status": "\u274c ERROR",
  "message": "Rate limit exceeded. API key is valid but you've hit rate limits.",
  "valid": false,
  "status_code": 429,
  "error_type": "insufficient_quota"
}


In [10]:
def extract_text(filepath, ocr_method='easyocr', languages=['en']):
    """
    Extract text from any document type (PDF, DOCX, or Image).
    Works as standard for all document formats.
    
    Args:
        filepath (str): Path to document file
        ocr_method (str): 'easyocr' for images (default)
        languages (list): Languages for OCR (default: English)
        
    Returns:
        str: Extracted text from document
    """
    file_ext = os.path.splitext(filepath)[1].lower()
    
    try:
        # Handle PDF files
        if file_ext == '.pdf':
            import fitz
            text = ""
            doc = fitz.open(filepath)
            for page_num, page in enumerate(doc):
                text += f"\n--- Page {page_num + 1} ---\n"
                text += page.get_text()
            doc.close()
            return text
        
        # Handle Word documents
        elif file_ext == '.docx':
            from docx import Document
            text = ""
            doc = Document(filepath)
            for para in doc.paragraphs:
                if para.text.strip():
                    text += para.text + "\n"
            for table in doc.tables:
                for row in table.rows:
                    row_text = [cell.text for cell in row.cells]
                    text += " | ".join(row_text) + "\n"
            return text
        
        # Handle image files
        elif file_ext in ['.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff']:
            reader = easyocr.Reader(languages)
            results = reader.readtext(filepath)
            text = " ".join([detection[1] for detection in results])
            return text
        
        else:
            print(f"‚ùå Error: Unsupported file type '{file_ext}'")
            return ""
    
    except Exception as e:
        print(f"‚ùå Error extracting text: {str(e)}")
        return ""

In [11]:
def extract_information_ai(text, extraction_instructions=None, max_retries=3, retry_delay=5):
    """
    Use OpenAI GPT to intelligently extract relevant information from document.
    Automatically filters out irrelevant content, boilerplate, and noise.
    Includes automatic retry logic for transient API errors.
    
    Args:
        text (str): Full extracted document text
        extraction_instructions (str): Specific instructions on what to extract
        max_retries (int): Maximum number of retry attempts (default: 3)
        retry_delay (int): Base delay in seconds between retries (default: 5)
        
    Returns:
        dict: Extracted information as structured JSON
    """
    # Get API key from environment variable
    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        return {
            "error": "No API key found. Please set OPENAI_API_KEY in your .env file.",
            "retries_attempted": 0,
            "note": "Set OPENAI_API_KEY environment variable before running.",
            "model_used": None
        }
    client = OpenAI(api_key=api_key)
    
    if extraction_instructions is None:
        extraction_instructions = """Extract all important and relevant information from this document.
        Ignore filler, boilerplate, headers, footers, and irrelevant content.
        Return comprehensive structured data with all key details."""
    
    # Truncate text if too long (GPT-4o-mini has 128k context, but we'll be conservative)
    # Rough estimate: 1 token ‚âà 4 characters, so 100k chars ‚âà 25k tokens
    MAX_TEXT_LENGTH = 100000  # ~25k tokens, leaving room for prompt
    original_text_length = len(text)
    if len(text) > MAX_TEXT_LENGTH:
        print(f"‚ö†Ô∏è  Text is very long ({len(text)} chars). Truncating to {MAX_TEXT_LENGTH} chars...")
        text = text[:MAX_TEXT_LENGTH] + "\n\n[Document truncated due to length...]"
    
    prompt = f"""You are an expert information extraction specialist.

Your task: {extraction_instructions}

From the document below, extract ONLY the most important and relevant information.
Ignore any irrelevant, redundant, boilerplate, footer, or noise content.

IMPORTANT JSON FORMAT REQUIREMENT FOR TOPICS ONLY:
- Extract all other information normally in their respective fields (course details, faculty, assessments, etc.)
- For topics: Collect ALL topics from the document (from all units, weeks, sections) and combine them into a SINGLE "topics" array
- If topics appear in multiple places (Unit 1 topics, Unit 2 topics, weekly topics, etc.), merge them all into one "topics" array
- Each topic should be a string in the array
- Example: {{"course_code": "...", "faculty": {{...}}, "topics": ["Topic from Unit 1", "Topic from Unit 2", "Topic from Week 1", ...], "assessments": {{...}}}}

Return the extracted information as valid JSON. Be comprehensive but concise.

DOCUMENT:
---
{text}
---

Return ONLY valid JSON format. No additional text or explanation. Keep all other fields separate, but combine ALL topics into one "topics" array."""
    
    models_to_try = ["gpt-4o-mini"]
    current_model_idx = 0
    last_error = None
    max_tokens_options = [4096, 2048, 1024]  # Try reducing tokens if needed
    current_max_tokens_idx = 0
    successful_model = None  # Track which model succeeded
    
    for attempt in range(1, max_retries + 1):
        try:
            # Try current model with current max_tokens setting
            model = models_to_try[current_model_idx]
            max_tokens = max_tokens_options[current_max_tokens_idx]
            
            if attempt > 1:
                print(f"üîÑ Retrying with {model} (attempt {attempt}/{max_retries}, max_tokens={max_tokens})...")
            
            message = client.chat.completions.create(
                model=model,
                max_tokens=max_tokens,
                messages=[{"role": "user", "content": prompt}]
            )
            
            response_text = message.choices[0].message.content.strip()
            successful_model = model  # Track successful model
            
            try:
                result = json.loads(response_text)
                # Return with model info outside JSON
                return {
                    "extracted_information": result,
                    "model_used": successful_model
                }
            except json.JSONDecodeError:
                return {
                    "extracted_information": {"extracted_content": response_text},
                    "model_used": successful_model
                }
        
        except RateLimitError as e:
            # Check if it's a quota issue vs rate limit
            error_str = str(e).lower()
            error_type = getattr(e, 'type', None)
            error_code = getattr(e, 'code', None)
            
            is_quota_issue = (
                "insufficient_quota" in error_str or 
                "quota" in error_str or
                (error_type and "insufficient_quota" in str(error_type).lower()) or
                (error_code and "insufficient_quota" in str(error_code).lower())
            )
            
            if is_quota_issue:
                last_error = f"Insufficient quota/credits: {str(e)}"
                print(f"üí≥ QUOTA ISSUE (attempt {attempt}/{max_retries}): Your API account has insufficient credits/quota.")
                print(f"   ‚ö†Ô∏è  Retrying won't help. Please add credits to your OpenAI account.")
                print(f"   üìù Visit: https://platform.openai.com/account/billing")
                # Still try a couple times with long waits in case credits were just added
                if attempt < max_retries:
                    wait_time = 30 * attempt  # Wait 30, 60, 90 seconds
                    print(f"   ‚è≥ Waiting {wait_time} seconds before retry (in case credits were added)...")
                    time.sleep(wait_time)
                    continue
            else:
                last_error = f"Rate limit exceeded: {str(e)}"
                print(f"‚è≥ Rate limit hit (attempt {attempt}/{max_retries}). Waiting longer...")
                if attempt < max_retries:
                    wait_time = retry_delay * (2 ** attempt)  # Exponential backoff: 10, 20, 40 seconds
                    print(f"   ‚è≥ Waiting {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue
        except APIConnectionError as e:
            last_error = f"Connection error: {str(e)}"
            print(f"üîå Connection error (attempt {attempt}/{max_retries}): {last_error}")
            if attempt < max_retries:
                time.sleep(retry_delay * attempt)
                continue
        except APIError as e:
            # Handle specific API errors
            last_error = str(e)
            status_code = getattr(e, 'status_code', None)
            error_type = getattr(e, 'type', None)
            error_code = getattr(e, 'code', None)
            
            # Also check error response if available
            if hasattr(e, 'response') and hasattr(e.response, 'status_code'):
                status_code = e.response.status_code
            
            if status_code:
                print(f"‚ùå API Error (attempt {attempt}/{max_retries}): HTTP {status_code} - {error_type or error_code or 'Unknown'}")
            else:
                print(f"‚ùå API Error (attempt {attempt}/{max_retries}): {last_error}")
            
            # Determine if we should retry based on status code or error content
            should_retry = False
            error_str_lower = last_error.lower()
            
            if status_code in [500, 502, 503, 504]:  # Server errors
                should_retry = True
            elif status_code == 429:  # Rate limit or quota
                # Check if it's quota vs rate limit
                is_quota = (
                    error_type and "insufficient_quota" in str(error_type).lower()
                ) or (
                    error_code and "insufficient_quota" in str(error_code).lower()
                ) or "quota" in last_error.lower()
                
                if is_quota:
                    print(f"   üí≥ This appears to be a QUOTA issue, not just rate limiting.")
                    print(f"   ‚ö†Ô∏è  Please check your OpenAI account billing: https://platform.openai.com/account/billing")
                    # Still retry but with longer waits
                    should_retry = attempt < 2  # Only retry once for quota issues
                    retry_delay = 60  # Wait 60 seconds
                else:
                    should_retry = True
                    retry_delay = 10  # Shorter wait for rate limits
            elif error_type and "internal_error" in str(error_type).lower():
                should_retry = True
            elif error_code and "internal_error" in str(error_code).lower():
                should_retry = True
            elif "500" in last_error or "internal_error" in error_str_lower:
                should_retry = True
            
            if should_retry and attempt < max_retries:
                # Strategy: Try different models first, then reduce max_tokens, then wait longer
                if current_model_idx < len(models_to_try) - 1:
                    # Try next model
                    current_model_idx += 1
                    print(f"üîÑ Switching to {models_to_try[current_model_idx]}...")
                elif current_max_tokens_idx < len(max_tokens_options) - 1:
                    # Try reducing max_tokens
                    current_max_tokens_idx += 1
                    current_model_idx = 0  # Reset to first model
                    print(f"üîÑ Reducing max_tokens to {max_tokens_options[current_max_tokens_idx]} and retrying...")
                else:
                    # Exponential backoff: wait longer each time
                    wait_time = retry_delay * (2 ** (attempt - 1))  # 5, 10, 20 seconds
                    print(f"‚è≥ Waiting {wait_time} seconds before retry...")
                    time.sleep(wait_time)
                continue
            else:
                break
        except Exception as e:
            # Catch any other unexpected errors (including string-based 500 errors)
            last_error = str(e)
            error_str_lower = last_error.lower()
            print(f"‚ùå Error (attempt {attempt}/{max_retries}): {last_error}")
            
            # Retry on internal server errors (check for 500 in error string)
            should_retry = False
            if "500" in last_error or "internal_error" in error_str_lower:
                should_retry = True
            elif "server error" in error_str_lower:
                should_retry = True
            
            if should_retry and attempt < max_retries:
                # Strategy: Try different models first, then reduce max_tokens, then wait longer
                if current_model_idx < len(models_to_try) - 1:
                    current_model_idx += 1
                    print(f"üîÑ Switching to {models_to_try[current_model_idx]}...")
                elif current_max_tokens_idx < len(max_tokens_options) - 1:
                    current_max_tokens_idx += 1
                    current_model_idx = 0  # Reset to first model
                    print(f"üîÑ Reducing max_tokens to {max_tokens_options[current_max_tokens_idx]} and retrying...")
                else:
                    wait_time = retry_delay * (2 ** (attempt - 1))  # Exponential backoff
                    print(f"‚è≥ Waiting {wait_time} seconds before retry...")
                    time.sleep(wait_time)
                continue
            break
    
    # If we reach here, all retries failed
    # Try one final fallback: chunk the text if it's long
    if original_text_length > 50000 and "500" in str(last_error):
        print(f"\n‚ö†Ô∏è  All retries failed. Attempting fallback: processing text in chunks...")
        try:
            # Split text into chunks and process separately
            chunk_size = 40000
            chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
            results = []
            
            for i, chunk in enumerate(chunks):
                print(f"  Processing chunk {i+1}/{len(chunks)}...")
                try:
                    message = client.chat.completions.create(
                        model="gpt-3.5-turbo",
                        max_tokens=1024,
                        messages=[{"role": "user", "content": f"Extract key information from this document chunk. Return as JSON:\n\n{chunk}"}]
                    )
                    chunk_result = json.loads(message.choices[0].message.content.strip())
                    results.append(chunk_result)
                    time.sleep(2)  # Small delay between chunks
                except:
                    pass
            
            if results:
                return {
                    "extracted_information": {"extracted_content": results, "note": "Extracted from chunks due to API errors"},
                    "model_used": "gpt-3.5-turbo"
                }
        except Exception as e:
            print(f"  Fallback also failed: {e}")
    
    # Final error return with specific guidance
    error_msg = last_error or "Unknown error during AI extraction"
    error_str_lower = str(error_msg).lower()
    
    # Provide specific guidance based on error type
    if "quota" in error_str_lower or "insufficient" in error_str_lower:
        note = "QUOTA/CREDITS ISSUE: Your OpenAI account has insufficient credits. Please add credits at https://platform.openai.com/account/billing"
    elif "429" in error_msg or "rate limit" in error_str_lower:
        note = "Rate limit exceeded. Wait a few minutes and try again, or check your usage limits."
    elif "500" in error_msg or "internal_error" in error_str_lower:
        note = "OpenAI server error. This may be temporary. Wait a few minutes and try again."
    else:
        note = "All retry attempts failed. Check your API key/account status or try again later."
    
    return {
        "error": error_msg,
        "retries_attempted": max_retries,
        "note": note,
        "model_used": None  # No model succeeded
    }

In [12]:
# file_path = "document.pdf"  # Change this to your actual file
# # extraction_task = "Extract all important information"  # Change to your needs

# # Example 1: Extract from PDF
# # print("=" * 60)
# # print("Extracting from PDF...")
# # print("=" * 60)
# # pdf_text = extract_text("document.pdf")
# # print(f"Extracted {len(pdf_text)} characters\n")

# # # Extract information with custom instructions
# # result = extract_information_ai(
# #     pdf_text,
# #     extraction_instructions="Extract all important facts, dates, names, and numbers"
# # )
# # print(json.dumps(result, indent=2))

# # Example 2: Extract from Word document
# print("\n" + "=" * 60)
# print("Extracting from Word document...")
# print("=" * 60)
# docx_text = extract_text("document.docx")
# print(f"Extracted {len(docx_text)} characters\n")

# result = extract_information_ai(
#     docx_text,
#     extraction_instructions="Extract key information and details"
# )
# print(json.dumps(result, indent=2))

# # # Example 3: Extract from Image
# # print("\n" + "=" * 60)
# # print("Extracting from Image...")
# # print("=" * 60)
# # img_text = extract_text("image.jpg")
# # print(f"Extracted {len(img_text)} characters\n")

# # result = extract_information_ai(
# #     img_text,
# #     extraction_instructions="Extract all text and important information"
# # )
# # print(json.dumps(result, indent=2))

In [13]:
file_path = "./document.docx"

print(f"Extracting text from: {file_path}")
extracted_text = extract_text(file_path)
print(f"Extracted {len(extracted_text)} characters\n")
print("=" * 60)
print("EXTRACTED TEXT:")
print("=" * 60)
print(extracted_text)
print("=" * 60)

Extracting text from: ./document.docx
Extracted 21096 characters

EXTRACTED TEXT:
BCA 301-4 DOT NET COURSE PLAN
SECTION I
LAB EXERCISES
SECTION III
Mapping: 
 Mapping of needs with the syllabus:
Please map the COs and Unit/Topic details to the following wherever applicable. If not applicable, please enter that.
Mapping:  A template to map the Learning Outcomes of the course against the components of assessment is given below:
Assessment outline:
 SECTION IV
Assignment Component:
A template to map the Course Outcomes against the components of the  assessment is given below:
Assessment outline:
Assessment Description: CIA ‚Äì I
Evaluation Rubrics:
CIA ‚Äì II (Mid Semester Exam)- 50 Marks-Centralized
Assessment Description: ESE - I 
 Evaluation rubrics:
 Mapping the Learning Outcomes with components of the evaluation rubrics:
Assessment Description: ESE ‚Äì II 
Evaluation Rubrics:
Assessment Description: ESE ‚Äì III
 Evaluation Rubrics for project:
* * * * * * * * * 
Semester | IV | Progr

In [None]:
print("\nExtracting information with AI...")
result = extract_information_ai(extracted_text)
print("\n" + "=" * 60)
print("EXTRACTED INFORMATION:")
print("=" * 60)

# Display model used separately (outside JSON)
if "model_used" in result and result["model_used"]:
    print(f"ü§ñ Model Used: {result['model_used']}")
    print("=" * 60)
    print("\nüìÑ Extracted Data (JSON):")
    print("=" * 60)

# Display the extracted information (JSON)
if "extracted_information" in result:
    print(json.dumps(result["extracted_information"], indent=2))
elif "error" in result:
    print(json.dumps(result, indent=2))
else:
    print(json.dumps(result, indent=2))



Extracting information with AI...

EXTRACTED INFORMATION:
ü§ñ Model Used: gpt-4o-mini

üìÑ Extracted Data (JSON):
{
  "extracted_content": "```json\n{\n  \"course_code\": \"BCA 301-4\",\n  \"faculty\": {\n    \"name\": \"Dr. Madan Singh\",\n    \"contact\": {\n      \"mobile\": \"8375912880\",\n      \"email\": \"madan.singh@christuniversity.in\"\n    }\n  },\n  \"course_title\": \"Dot Net\",\n  \"course_description\": \"This course provides an in-depth understanding of .NET technologies, focusing on C# and VB.NET for Windows Forms applications. Students will explore .NET Framework, CLR, OOPS concepts, Windows-based application development, database integration using ADO.NET, file handling, security, and deployment. The course emphasizes hands-on learning through practical lab exercises and culminates in a real-world mini-project aligned with SDG\u2019s and social initiatives.\",\n  \"course_objectives\": \"This course equips students with .NET development skills, focusing on C# and