<a href="https://colab.research.google.com/github/ReneeB-prog/DrleeWebSLM/blob/main/LangExtract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
# Make sure to add your GEMINI_API_KEY in Colab secrets first!

# Step 1: Install all dependencies
!pip install -q google-generativeai gradio pandas requests

In [33]:
# Step 2: Import all libraries
import os
import json
import pandas as pd
import gradio as gr
import requests
from pathlib import Path
from google.colab import userdata
from google import generativeai as genai
from typing import List, Optional
from dataclasses import dataclass, field
import io

In [53]:
# Step 3: Configure Gemini
print("üîÑ Configuring Gemini...")
api_key = userdata.get('GEMINI_API_KEY')
genai.configure(api_key=api_key)
model = genai.GenerativeModel('gemini-2.5-flash')
print("‚úÖ Gemini configured successfully!")

üîÑ Configuring Gemini...
‚úÖ Gemini configured successfully!


In [44]:
# Step 4: Download sample books from Project Gutenberg
print("\nüìö Downloading sample books...")
book_urls = [
    "https://www.gutenberg.org/files/1342/1342-0.txt",  # Pride and Prejudice
    "https://www.gutenberg.org/files/11/11-0.txt",      # Alice in Wonderland
    "https://www.gutenberg.org/files/84/84-0.txt",       # Frankenstein
    "https://www.gutenberg.org/files/1661/1661-0.txt",   # Sherlock Holmes
    "https://www.gutenberg.org/files/98/98-0.txt",       # Tale of Two Cities
    "https://www.gutenberg.org/files/844/844-0.txt",     # Dorian Gray
    "https://www.gutenberg.org/files/2701/2701-0.txt",   # Moby Dick
    "https://www.gutenberg.org/files/345/345-0.txt",     # Dracula
    "https://www.gutenberg.org/files/1232/1232-0.txt",   # The Prince
    "https://www.gutenberg.org/files/2591/2591-0.txt"    # Grimm's Fairy Tales
]

books_content = {}
for url in book_urls:
    try:
        response = requests.get(url)
        book_name = url.split('/')[-1].replace('-0.txt', '')
        books_content[book_name] = response.text[:50000]  # First 50k chars
        print(f"  ‚úì Downloaded book ID: {book_name}")
    except:
        print(f"  ‚úó Failed to download: {url}")

print(f"\n‚úÖ Downloaded {len(books_content)} books successfully!")


üìö Downloading sample books...
  ‚úì Downloaded book ID: 1342
  ‚úì Downloaded book ID: 11
  ‚úì Downloaded book ID: 84
  ‚úì Downloaded book ID: 1661
  ‚úì Downloaded book ID: 98
  ‚úì Downloaded book ID: 844
  ‚úì Downloaded book ID: 2701
  ‚úì Downloaded book ID: 345
  ‚úì Downloaded book ID: 1232
  ‚úì Downloaded book ID: 2591

‚úÖ Downloaded 10 books successfully!


In [45]:
# Step 5: Define extraction schema
@dataclass
class BookMetadata:
    """Schema for extracting structured data from books"""
    title: str
    author: str
    publication_year: Optional[int]
    main_characters: List[str]
    setting_location: str
    genre: str
    plot_summary: str
    major_themes: List[str]
    opening_line: str
    target_audience: str

def create_extraction_prompt(text_sample):
    """Create a structured prompt for Gemini to extract book metadata"""

    prompt = f"""
    Analyze this book excerpt and extract the following information.
    Return ONLY a valid JSON object with these exact fields:

    {{
        "title": "exact title of the book",
        "author": "full author name",
        "publication_year": year as integer or null,
        "main_characters": ["character1", "character2", ...] (max 5),
        "setting_location": "primary geographic setting",
        "genre": "primary genre classification",
        "plot_summary": "concise plot summary in exactly 100 words",
        "major_themes": ["theme1", "theme2", ...] (max 3),
        "opening_line": "the memorable opening line",
        "target_audience": "intended reader demographic"
    }}

    Book excerpt:
    {text_sample}

    JSON Output:
    """
    return prompt

In [61]:
# Step 6: Create extraction functions
def extract_with_langextract(text, progress_callback=None):
    """Extract structured data using LangExtract pattern with Gemini"""

    try:
        # Generate extraction prompt
        prompt = create_extraction_prompt(text)

        # Call Gemini for extraction
        response = model.generate_content(
            prompt,
            generation_config={
                "temperature": 0.1,
                "top_p": 0.95,
                "max_output_tokens": 2048, # Increased max_output_tokens
            }
        )

        # Check if response is valid and not blocked
        if not response.candidates or not response.candidates[0].content.parts:
             if response.prompt_feedback and response.prompt_feedback.block_reason:
                print(f"  ‚úó Extraction error: Content blocked due to: {response.prompt_feedback.block_reason}")
             else:
                print(f"  ‚úó Extraction error: No valid content in response. Full response: {response}") # Added full response print
             return None

        # Parse JSON response
        json_text = response.candidates[0].content.parts[0].text
        if "```json" in json_text:
            json_text = json_text.split("```json")[1].split("```")[0]
        elif "```" in json_text:
            json_text = json_text.split("```")[1].split("```")[0]
        else:
            # If no code block is found, assume the entire response is JSON
            pass


        extracted_data = json.loads(json_text.strip())

        if progress_callback:
            progress_callback()

        return extracted_data

    except Exception as e:
        print(f"  ‚úó Extraction error: {e}")
        return None

def batch_extract_books(books_dict):
    """Process multiple books and return structured DataFrame"""

    results = []
    total_books = len(books_dict)
    # Define the expected keys based on the BookMetadata dataclass
    expected_keys = [f.name for f in BookMetadata.__dataclass_fields__.values()]


    for idx, (book_id, content) in enumerate(books_dict.items(), 1):
        print(f"  üîÑ Processing book {idx}/{total_books}: {book_id}")

        extracted = extract_with_langextract(content)
        if extracted:
            # Ensure all expected keys are present, fill missing with None
            processed_data = {key: extracted.get(key) for key in expected_keys}
            processed_data['book_id'] = book_id
            results.append(processed_data)
            print(f"    ‚úì Successfully extracted: {extracted.get('title', 'Unknown')}")
            print(f"    Raw extracted data for {book_id}: {extracted}") # Added print statement
        else:
            print(f"    ‚úó Extraction failed for book: {book_id}")


    df = pd.DataFrame(results)
    return df

In [47]:
def process_uploaded_file(file):
    """Process uploaded text file through LangExtract"""

    if file is None:
        return None, "Please upload a file"

    # Read file content
    content = file.read().decode('utf-8')[:50000]  # Limit to 50k chars

    # Extract data
    result = extract_with_langextract(content)

    if result:
        # Convert to DataFrame for display
        df = pd.DataFrame([result])

        # Create downloadable CSV
        csv_buffer = io.StringIO()
        df.to_csv(csv_buffer, index=False)
        csv_content = csv_buffer.getvalue()

        return df, csv_content
    else:
        return None, "Extraction failed. Please check your document."
def create_gradio_app():
    """Create the Gradio interface for LangExtract"""

    with gr.Blocks(title="LangExtract Data Extractor", theme=gr.themes.Soft()) as app:

        gr.Markdown("""
        # üìö LangExtract Document Intelligence Platform
        ### Transform any document into structured, sellable data in seconds

        Upload any text document (books, reports, articles) and watch as AI extracts
        valuable structured information instantly.
        """)

        with gr.Tab("Single Document Extraction"):
            with gr.Row():
                with gr.Column(scale=1):
                    file_input = gr.File(
                        label="Upload Document (.txt)",
                        file_types=[".txt"]
                    )
                    extract_btn = gr.Button("üöÄ Extract Data", variant="primary")

                with gr.Column(scale=2):
                    output_df = gr.Dataframe(
                        label="Extracted Structured Data",
                        headers=["Field", "Value"],
                    )
                    download_csv = gr.File(label="üì• Download CSV")

            extract_btn.click(
                fn=process_uploaded_file,
                inputs=file_input,
                outputs=[output_df, download_csv]
            )

        with gr.Tab("Batch Processing"):
            gr.Markdown("### Process Multiple Documents")

            batch_upload = gr.File(
                label="Upload Multiple Documents",
                file_count="multiple",
                file_types=[".txt"]
            )

            batch_btn = gr.Button("üîÑ Process All Documents", variant="primary")
            batch_output = gr.Dataframe(label="Batch Results")
            batch_download = gr.File(label="üì• Download All Results")

        with gr.Tab("Live Demo"):
            gr.Markdown("### Try with Sample Books")

            sample_btn = gr.Button("üìö Load & Process Sample Books", variant="secondary")
            demo_output = gr.Dataframe(
                label="Sample Extraction Results",
                value=extracted_df if 'extracted_df' in globals() else None,
            )

            sample_btn.click(
                fn=lambda: extracted_df,
                outputs=demo_output
            )

        gr.Markdown("""
        ---
        üí° **Business Tip**: Each extracted record can be worth $5-50 when properly
        structured and validated. Process 1,000 documents = $5,000-50,000 in data value.

        üîó **API Integration**: Export to JSON/CSV for direct integration with your
        data products or client systems.
        """)

    return app
# Launch the app
app = create_gradio_app()
app.launch(share=True, height=800)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8787db1913d1864a33.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [75]:
# Step 7: Process the downloaded books
print("\nüöÄ Starting extraction process...")
extracted_df = batch_extract_books(books_content)
print(f"\nüìä Successfully extracted data from {len(extracted_df)} books")

# Display sample results
if len(extracted_df) > 0:
    print("\nüìã Sample extracted data:")
    print(extracted_df[['title', 'author', 'genre']].head())
else:
    print("\n‚ùå No data extracted. Please check the extraction process.")


üöÄ Starting extraction process...
  üîÑ Processing book 1/10: 1342
  ‚úó Extraction error: No valid content in response. Full response: response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "role": "model"
          },
          "finish_reason": "MAX_TOKENS",
          "index": 0
        }
      ],
      "usage_metadata": {
        "prompt_token_count": 12492,
        "total_token_count": 14539,
        "cached_content_token_count": 11412
      },
      "model_version": "gemini-2.5-flash"
    }),
)
    ‚úó Extraction failed for book: 1342
  üîÑ Processing book 2/10: 11
    ‚úì Successfully extracted: Alice‚Äôs Adventures in Wonderland
    Raw extracted data for 11: {'title': 'Alice‚Äôs Adventures in Wonderland', 'author': 'Lewis Carroll', 'publication_year': None, 'main_characters': ['Alice', 'White Rabbit', 'Mouse', 'Dodo', 'Caterpillar'], 'setting_locatio

In [76]:
# List available models to find a suitable one
print("üîÑ Listing available Gemini models...")
for m in genai.list_models():
    if 'generateContent' in m.supported_generation_methods:
        print(f"- {m.name}")
print("‚úÖ Finished listing models.")

üîÑ Listing available Gemini models...
- models/gemini-2.5-pro-preview-03-25
- models/gemini-2.5-flash-preview-05-20
- models/gemini-2.5-flash
- models/gemini-2.5-flash-lite-preview-06-17
- models/gemini-2.5-pro-preview-05-06
- models/gemini-2.5-pro-preview-06-05
- models/gemini-2.5-pro
- models/gemini-2.0-flash-exp
- models/gemini-2.0-flash
- models/gemini-2.0-flash-001
- models/gemini-2.0-flash-exp-image-generation
- models/gemini-2.0-flash-lite-001
- models/gemini-2.0-flash-lite
- models/gemini-2.0-flash-preview-image-generation
- models/gemini-2.0-flash-lite-preview-02-05
- models/gemini-2.0-flash-lite-preview
- models/gemini-2.0-pro-exp
- models/gemini-2.0-pro-exp-02-05
- models/gemini-exp-1206
- models/gemini-2.0-flash-thinking-exp-01-21
- models/gemini-2.0-flash-thinking-exp
- models/gemini-2.0-flash-thinking-exp-1219
- models/gemini-2.5-flash-preview-tts
- models/gemini-2.5-pro-preview-tts
- models/learnlm-2.0-flash-experimental
- models/gemma-3-1b-it
- models/gemma-3-4b-it
- 

In [82]:
# Generate multiple export formats
def create_data_products(df):
    """Generate various formats for different buyer needs"""

    # JSON for APIs
    json_product = df.to_json(orient='records', indent=2)

    # CSV for spreadsheet users
    csv_product = df.to_csv(index=False)

    # SQL for database integration
    sql_statements = []
    if not df.empty:
        for _, row in df.iterrows():
            # Convert all values to string and handle potential NaNs
            values = []
            for v in row.values:
                if isinstance(v, list):
                    # Convert list to string representation
                    str_v = str(v)
                elif pd.notna(v): # Check if not NaN for non-list values
                     str_v = str(v)
                else:
                    str_v = '' # Append empty string for NaN non-list values

                values.append(str_v)

            sql = f"INSERT INTO book_metadata VALUES ('{', '.join(values)}');"
            sql_statements.append(sql)
    sql_product = '\n'.join(sql_statements)

    # Analytics summary
    summary = {
        'total_records': len(df),
        'genres': df['genre'].value_counts().to_dict() if 'genre' in df and 'genre' in df.columns and not df['genre'].dropna().empty else {},
        'avg_themes_per_book': df['major_themes'].apply(len).mean() if 'major_themes' in df and 'major_themes' in df.columns and not df['major_themes'].dropna().empty and df['major_themes'].apply(len).sum() > 0 else 0,
        'publication_range': f"{df['publication_year'].min()}-{df['publication_year'].max()}" if 'publication_year' in df and 'publication_year' in df.columns and not df['publication_year'].dropna().empty else "N/A",
        'authors': df['author'].nunique() if 'author' in df and 'author' in df.columns and not df['author'].dropna().empty else 0,
        'records_value_estimate': f"${len(df) * 10}-${len(df) * 50}"
    }


    return {
        'json': json_product,
        'csv': csv_product,
        'sql': sql_product,
        'summary': summary
    }
# Generate all formats
products = create_data_products(extracted_df)
print("üì¶ Data Products Generated:")
print(f"  ‚Ä¢ JSON API Feed: {len(products['json'])} bytes")
print(f"  ‚Ä¢ CSV Database: {len(products['csv'])} bytes")
print(f"  ‚Ä¢ SQL Import: {len(products['sql'])} bytes")
print(f"  ‚Ä¢ Analytics Summary: {products['summary']}")

üì¶ Data Products Generated:
  ‚Ä¢ JSON API Feed: 11684 bytes
  ‚Ä¢ CSV Database: 9506 bytes
  ‚Ä¢ SQL Import: 9665 bytes
  ‚Ä¢ Analytics Summary: {'total_records': 8, 'genres': {"Children's Fantasy": 1, 'Gothic fiction': 1, 'Detective Fiction': 1, 'Comedy': 1, 'Adventure': 1, 'Gothic Horror': 1, 'Political Philosophy': 1, 'Fairy Tales': 1}, 'avg_themes_per_book': np.float64(3.0), 'publication_range': '1513.0-2002.0', 'authors': 8, 'records_value_estimate': '$80-$400'}


In [83]:
from google.colab import drive
drive.mount('/content/drive')
output_dir = '/content/drive/MyDrive/LangExtract_Products'
os.makedirs(output_dir, exist_ok=True)
# Save all formats
with open(f'{output_dir}/books_data.json', 'w') as f:
    f.write(products['json'])
with open(f'{output_dir}/books_data.csv', 'w') as f:
    f.write(products['csv'])
with open(f'{output_dir}/books_import.sql', 'w') as f:
    f.write(products['sql'])
print(f"‚úÖ Data products saved to Google Drive: {output_dir}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ Data products saved to Google Drive: /content/drive/MyDrive/LangExtract_Products


In [84]:
# Step 10: Build complete Gradio interface
print("\nüé® Building Gradio interface...")

def process_batch_files(files):
    """Placeholder function for batch processing"""
    # This function will be implemented later
    return pd.DataFrame(), None, "Batch processing is not yet fully implemented."


with gr.Blocks(title="LangExtract Data Extractor", theme=gr.themes.Soft()) as app:

    gr.Markdown("""
    # üìö LangExtract Document Intelligence Platform
    ### Transform any document into structured, sellable data in seconds

    Upload any text document (books, reports, articles) and watch as AI extracts
    valuable structured information instantly.

    **üí∞ Business Value**: Each extracted record can be worth $10-50 when properly
    structured. Process 100 documents = $1,000-5,000 in data value!
    """)

    with gr.Tab("üéØ Single Document Extraction"):
        with gr.Row():
            with gr.Column(scale=1):
                file_input = gr.File(
                    label="Upload Document (.txt)",
                    file_types=[".txt"]
                )
                extract_btn = gr.Button("üöÄ Extract Data", variant="primary", size="lg")
                status_single = gr.Textbox(label="Status", interactive=False)

            with gr.Column(scale=2):
                output_df = gr.Dataframe(
                    label="Extracted Structured Data"
                )
                download_csv = gr.File(label="üì• Download CSV")

        extract_btn.click(
            fn=process_uploaded_file,
            inputs=file_input,
            outputs=[output_df, download_csv, status_single]
        )

    with gr.Tab("üì¶ Batch Processing"):
        gr.Markdown("### Process Multiple Documents at Once")

        with gr.Row():
            with gr.Column(scale=1):
                batch_upload = gr.File(
                    label="Upload Multiple Documents",
                    file_count="multiple",
                    file_types=[".txt"]
                )
                batch_btn = gr.Button("üîÑ Process All Documents", variant="primary", size="lg")
                status_batch = gr.Textbox(label="Status", interactive=False)

            with gr.Column(scale=2):
                batch_output = gr.Dataframe(
                    label="Batch Results"
                )
                batch_download = gr.File(label="üì• Download All Results")

        batch_btn.click(
            fn=process_batch_files,
            inputs=batch_upload,
            outputs=[batch_output, batch_download, status_batch]
        )

    with gr.Tab("üìä Live Demo"):
        gr.Markdown("### Pre-loaded Sample Books from Project Gutenberg")

        demo_output = gr.Dataframe(
            label="Sample Extraction Results",
            value=extracted_df if len(extracted_df) > 0 else None
        )

        if len(extracted_df) > 0:
            # Generate data products
            products = create_data_products(extracted_df)

            gr.Markdown(f"""
            ### üìà Data Analytics Summary
            - **Total Records**: {products['summary']['total_records']}
            - **Unique Authors**: {products['summary']['authors']}
            - **Estimated Value**: {products['summary']['records_value_estimate']}
            - **Genres Found**: {', '.join(list(products['summary']['genres'].keys())[:5])}
            """)

            # Create download links for different formats
            with gr.Row():
                with gr.Column():
                    # Save JSON file
                    json_file = "books_data.json"
                    with open(json_file, 'w') as f:
                        f.write(products['json'])
                    gr.File(value=json_file, label="üìÑ Download JSON")

                with gr.Column():
                    # Save SQL file
                    sql_file = "books_import.sql"
                    with open(sql_file, 'w') as f:
                        f.write(products['sql'])
                    gr.File(value=sql_file, label="üóÑÔ∏è Download SQL")

                with gr.Column():
                    # Save CSV file
                    csv_file = "books_data.csv"
                    with open(csv_file, 'w') as f:
                        f.write(products['csv'])
                    gr.File(value=csv_file, label="üìä Download CSV")

            gr.Markdown("""
            ### üí° Business Applications
            - Sell to libraries for catalog enrichment
            - License to AI training companies
            - Create genre-specific book recommendation APIs
            - Build literary analysis tools for education
            """)

    with gr.Tab("üí∞ Business Opportunities"):
        gr.Markdown("""
        ### 20 Profitable Data Extraction Businesses You Can Start Today

        #### üè• Healthcare ($5,000-50,000/month potential)
        1. **Clinical Trial Database** - Extract from ClinicalTrials.gov
        2. **Medical Device Events** - Process FDA MAUDE database
        3. **Insurance Requirements** - Structure policy PDFs

        #### üèõÔ∏è Government Intelligence ($10,000-100,000/month potential)
        4. **Federal Contracts** - Mine SAM.gov requirements
        5. **Municipal Minutes** - Extract city council decisions
        6. **Grant Matching** - Structure Grants.gov opportunities

        #### ‚öñÔ∏è Legal Services ($15,000-150,000/month potential)
        7. **Case Law Precedents** - Process Google Scholar decisions
        8. **Contract Clauses** - Extract from SEC EDGAR filings
        9. **Patent Claims** - Structure USPTO patents

        #### üéì Education Market ($3,000-30,000/month potential)
        10. **Syllabus Aggregator** - Extract from Open Syllabus
        11. **Research Methods** - Mine arXiv papers
        12. **Job Requirements** - Analyze HigherEdJobs posts

        #### üìà Business Intelligence ($20,000-200,000/month potential)
        13. **Earnings Insights** - Process Seeking Alpha transcripts
        14. **Review Sentiment** - Structure Amazon reviews
        15. **Job Trends** - Analyze LinkedIn postings

        #### üöö Supply Chain ($8,000-80,000/month potential)
        16. **Shipping Data** - Process ImportYeti documents
        17. **Recall Database** - Structure CPSC notices
        18. **ESG Metrics** - Extract sustainability reports

        #### üéØ Niche Markets ($2,000-20,000/month potential)
        19. **Recipe Database** - Extract from food blogs
        20. **Real Estate Intel** - Structure Zillow listings

        ---

        **Start Today**: Pick one niche, extract 100 documents, find 3 buyers.
        Scale from there!
        """)

    gr.Markdown("""
    ---
    üöÄ **Ready to build your data extraction business?** This platform is your complete toolkit.

    üìß Questions? Visit [drlee.io](https://drlee.io) | Built with LangExtract + Gemini
    """)

# Launch the application
print("\nüöÄ Launching Gradio app...")
print("üì± Your app will open in a new tab with a public URL you can share!")
app.launch(share=True)

# Display summary
print("\n" + "="*60)
print("üéâ SYSTEM SUCCESSFULLY DEPLOYED!")
print("="*60)
print(f"‚úÖ Books processed: {len(extracted_df)}")
print(f"üíæ Data ready for export in JSON, CSV, and SQL formats")
print(f"üåê Share your public URL to demonstrate the system")
print(f"üí∞ Potential value: ${len(extracted_df) * 10} - ${len(extracted_df) * 50}")
print("="*60)


üé® Building Gradio interface...

üöÄ Launching Gradio app...
üì± Your app will open in a new tab with a public URL you can share!
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8d910480363fb2fa1b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)



üéâ SYSTEM SUCCESSFULLY DEPLOYED!
‚úÖ Books processed: 8
üíæ Data ready for export in JSON, CSV, and SQL formats
üåê Share your public URL to demonstrate the system
üí∞ Potential value: $80 - $400
