In [None]:
%pip install anthropic
%pip install pandas
%pip install numpy

In [3]:
%pip install python-dotenv

Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [109]:
import os
from dotenv import load_dotenv

# Load settings from a local .env file (if present) and the process environment.
load_dotenv()
ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY')
DEFAULT_MODEL = os.getenv('DEFAULT_MODEL')

# Fail fast: os.getenv returns None for an unset variable, and a missing model
# name would otherwise only surface later as one failed API request per tender.
if not DEFAULT_MODEL:
    raise RuntimeError(
        "DEFAULT_MODEL is not set. Define it in the environment or in a .env file."
    )


In [110]:
import anthropic
# Anthropic API client shared by the notebook; it is created with the
# ANTHROPIC_API_KEY value read from the environment and is the object that
# classify_tenders calls `messages.create` on.
client=anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)



In [103]:
def build_classification_prompt(tender_contents) -> str:
    """
    Build the prompt that asks the model to classify one tender as flood-related or not.

    The prompt embeds the tender between <tender> tags, lists the classification
    guidelines (primary/secondary indicators, temporal factors, key terms), and
    instructs the model to answer with its analysis inside <reasoning> tags followed
    by one label inside <intent> tags. The labels shown in the prompt are
    Flood, Non-Flood and Ambiguous, and three worked examples are included.

    Args:
        tender_contents: The tender details to classify. The value is interpolated
            into an f-string, so its string form is what appears in the prompt.

    Returns:
        The complete prompt text.
    """
    # NOTE(review): in step 2 the list of allowed labels is itself written as
    # <intent> elements (and "Ambiguous " carries a trailing space), and step 3 says
    # to default to non-flood when unclear although an Ambiguous label exists.
    # Confirm both are intended before changing the prompt text.
    return f'''You are a specialized tender classification system for identifying flood-related infrastructure and development projects. Your task is to analyze tender details and determine whether they are flood-related or non-flood-related, providing clear reasoning for your classification.

INPUT TENDER:
<tender>{tender_contents}</tender>

CLASSIFICATION GUIDELINES:

1. Primary Indicators (High confidence markers):
- Direct flood protection/prevention works
- Flood damage restoration
- Flood-related infrastructure (embankments, dykes, etc.)
- Emergency response facilities for floods
- Drainage systems in flood-prone areas

2. Secondary Indicators (Context-dependent):
- Water management projects
- Infrastructure reinforcement in flood zones
- Road/bridge repairs mentioning rain/water damage
- Erosion control measures
- Watershed management

3. Temporal Factors to Consider:
- Pre-monsoon preparation works
- Post-flood restoration
- Seasonal timing of the tender
- Emergency vs. planned works

4. Key Terms Analysis:
Positive indicators:
- Flood protection/control
- Embankment/dyke construction
- Storm water management
- Erosion control
- Drainage systems
- SDRF (State Disaster Response Fund) projects
- Inundation prevention

Negative indicators (likely non-flood):
- Regular construction
- Routine maintenance
- Unrelated infrastructure (unless specifically flood-protection)
- General development works
- Standard civic amenities

CLASSIFICATION TASK:

1. First, analyze the tender details and provide your reasoning within <reasoning> tags. Consider:
   - Primary purpose of the work
   - Presence of flood-related keywords and context
   - Temporal factors (season, urgency)
   - Geographic relevance
   - Project scale and scope
   - Department/agency involved
   - Any ambiguity in classification

2. Then, output EXACTLY ONE classification label within <intent> tags:
   <intents>
   <intent>Flood</intent>
   <intent>Non-Flood</intent>
   <intent>Ambiguous </intent>
   </intents>

3. In case of ambiguity:
   - Prioritize flood classification if there's clear flood-prevention/mitigation aspect
   - Default to non-flood if flood relation is peripheral or unclear
   - Document uncertainty in reasoning

Example Classifications:

1. Clear Flood Case:
<reasoning>
Tender explicitly mentions flood protection works, includes embankment construction, 
and is scheduled pre-monsoon. Department is water resources, indicating flood management focus.
</reasoning>
<intent>Flood</intent>

2. Ambiguous Case:
<reasoning>
While tender includes drainage works, it appears to be part of routine road construction 
rather than specific flood management. No explicit flood prevention purpose mentioned.
</reasoning>
<intent>Ambiguous</intent>

3. Clear Non-Flood Case:
<reasoning>
Standard building construction tender for government office. 
No flood-related components or considerations mentioned.
</reasoning>
<intent>Non-Flood</intent>

YOUR CLASSIFICATION:
Please analyze the provided tender and provide your classification following the above format.
'''

In [104]:
# Classify a single tender: returns the model's reasoning and its intent label.
import re
def classify_tenders(tender_contents):
    """
    Ask the model to classify one tender and parse its tagged reply.

    The prompt produced by ``build_classification_prompt`` is sent as a single
    user message (temperature 0, at most 512 output tokens, no streaming), and
    the text of the first content block of the response is parsed.

    Args:
        tender_contents: Tender details, passed straight through to
            ``build_classification_prompt``.

    Returns:
        A ``(reasoning, intent)`` tuple of whitespace-stripped strings taken
        from the first ``<reasoning>`` and the first ``<intent>`` tag pair in
        the reply. An element is ``""`` when its tag pair is absent.
    """
    def first_tag_body(pattern, text):
        # re.DOTALL lets the captured body span several lines.
        found = re.search(pattern, text, re.DOTALL)
        if found is None:
            return ""
        return found.group(1).strip()

    prompt_text = build_classification_prompt(tender_contents)
    response = client.messages.create(
        model=DEFAULT_MODEL,
        max_tokens=512,
        temperature=0,
        messages=[{"role": "user", "content": prompt_text}],
        stream=False,
    )
    reply_text = response.content[0].text

    reasoning = first_tag_body(r"<reasoning>(.*?)</reasoning>", reply_text)
    intent = first_tag_body(r"<intent>(.*?)</intent>", reply_text)
    return reasoning, intent
    
    

In [52]:
%pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [105]:
import pandas as pd
from typing import Dict, List, Tuple
import time
from concurrent.futures import ThreadPoolExecutor
import logging
from tqdm import tqdm
import os

def process_csv_tenders(
        csv_path: str,
        output_path: str,
        batch_size: int = 10,
        max_retries: int = 1,
        delay_between_batches: float = 1.0) -> None:
    """
    Classify every tender in a CSV file and write the results to a new CSV.

    Rows are processed in batches of ``batch_size``; the tenders of one batch are
    classified concurrently on a thread pool via ``classify_tenders``. After each
    batch its rows are appended to ``output_path`` with four extra columns:
    ``classification_reasoning``, ``classification_intent``,
    ``classification_status`` ('success' or 'failed') and ``classification_error``.
    Progress is logged with the ``logging`` module; ``logging.basicConfig`` is asked
    to write to ``tender_classification.log`` (it has no effect if the root logger
    is already configured).

    A tender counts as successful only when a non-empty intent label comes back.
    A reply without an intent label is retried like any other error and, if no
    attempt yields a label, reported as 'failed' with the reasoning text kept.

    Args:
        csv_path: Path to the CSV file containing tenders
        output_path: Path where to save the output CSV (overwritten at start)
        batch_size: Number of tenders to process in parallel; must be >= 1
        max_retries: Maximum number of attempts per tender; values below 1 are
            treated as 1 so that every tender gets a result
        delay_between_batches: Delay in seconds between processing batches

    Raises:
        ValueError: If ``batch_size`` is less than 1. Checked before any file is
            read or written.
        Exception: Any error raised while reading ``csv_path`` is logged and
            re-raised unchanged.
    """
    # Validate before any I/O so a bad argument cannot truncate an existing output file.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    # Always make at least one attempt: with range(0) the retry loop would never run
    # and the tender would be left without any result.
    attempts = max(1, max_retries)

    # Set up logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        filename='tender_classification.log'
    )

    try:
        df = pd.read_csv(csv_path)
        logging.info(f"Successfully loaded CSV file with {len(df)} rows")
    except Exception as e:
        logging.error(f"Error loading CSV file: {str(e)}")
        raise

    # Create result columns
    df['classification_reasoning'] = ''
    df['classification_intent'] = ''
    df['classification_status'] = 'pending'
    df['classification_error'] = ''

    # Initialize output CSV with headers only; batches are appended below.
    df.head(0).to_csv(output_path, index=False)
    processed_count = 0
    successful_count = 0
    failed_count = 0

    # Output column -> key of the per-tender result dict that fills it.
    result_columns = {
        'classification_reasoning': 'reasoning',
        'classification_intent': 'intent',
        'classification_status': 'status',
        'classification_error': 'error',
    }

    def process_single_tender(row_idx: int) -> Dict:
        """Classify the tender at position ``row_idx``, retrying on failure."""
        row = df.iloc[row_idx]

        tender_contents = {
            'tender_id': row.get('Tender ID', ''),
            'title': row.get('tender_title', ''),
            'description': row.get('Work Description', ''),
            'department': row.get('Department', ''),
            'season': row.get('Season', ''),
            'keywords': row.get('positive_keywords_dict', {})
        }

        last_error = ''
        last_reasoning = ''
        for attempt in range(attempts):
            try:
                reasoning, intent = classify_tenders(tender_contents)
                if intent:
                    return {
                        'idx': row_idx,
                        'reasoning': reasoning,
                        'intent': intent,
                        'status': 'success',
                        'error': ''
                    }
                # A reply without a label is not a usable classification.
                last_reasoning = reasoning
                last_error = "No <intent> label found in model response"
            except Exception as e:
                last_error = str(e)
            # Pause before the next attempt, but not after the final one.
            if attempt < attempts - 1:
                time.sleep(1)

        error_msg = f"Failed after {attempts} attempts: {last_error}"
        logging.error(f"Tender {row_idx} - {error_msg}")
        return {
            'idx': row_idx,
            'reasoning': last_reasoning,
            'intent': '',
            'status': 'failed',
            'error': error_msg
        }

    with ThreadPoolExecutor(max_workers=batch_size) as executor:
        for batch_start in tqdm(range(0, len(df), batch_size)):
            batch_end = min(batch_start + batch_size, len(df))
            batch_indices = range(batch_start, batch_end)

            # Process batch; results come back in the same order as batch_indices.
            futures = [executor.submit(process_single_tender, idx) for idx in batch_indices]
            results = [future.result() for future in futures]

            # Fill the result columns for this batch's rows.
            batch_df = df.iloc[batch_start:batch_end].copy()
            for column, key in result_columns.items():
                batch_df[column] = [result[key] for result in results]

            # Append batch results to output CSV
            batch_df.to_csv(output_path, mode='a', header=False, index=False)

            # Update statistics
            successful = sum(1 for r in results if r['status'] == 'success')
            successful_count += successful
            failed_count += len(results) - successful
            processed_count += len(results)

            # Log batch progress
            logging.info(f"Batch {batch_start//batch_size + 1}: {successful}/{len(results)} successful")
            logging.info(f"Progress: {processed_count}/{len(df)} tenders processed")

            # Delay between batches (skipped after the last one)
            if batch_end < len(df):
                time.sleep(delay_between_batches)

    # Log final summary
    logging.info(f"""
    Classification Summary:
    Total Processed: {processed_count}
    Successful: {successful_count}
    Failed: {failed_count}
    Output saved to: {output_path}
    """)


In [111]:
if __name__ == "__main__":
    # Run configuration. Input and output live in the same test-data directory.
    # NOTE(review): DATA_DIR is an absolute path on one machine; adjust it (or make
    # it relative to the repository) before running elsewhere.
    DATA_DIR = "/home/prajna/civicdatalab/himachal/tender-classifier/data/test_data"
    CSV_PATH = f"{DATA_DIR}/2022_08_tenders_100.csv"
    OUTPUT_PATH = f"{DATA_DIR}/2022_08_classified_tenders_100.csv"
    BATCH_SIZE = 7  # Adjust based on API rate limits
    DELAY = 1.0  # Seconds between batches

    try:
        # Classify the tenders; results are written to OUTPUT_PATH batch by batch,
        # and per-tender failures are recorded in its classification_status column.
        process_csv_tenders(
            csv_path=CSV_PATH,
            output_path=OUTPUT_PATH,
            batch_size=BATCH_SIZE,
            delay_between_batches=DELAY
        )
        # Report the file that was actually written.
        print(f"Classification completed successfully. Results saved to {OUTPUT_PATH}")

    except Exception as e:
        print(f"Error processing tenders: {str(e)}")

100%|██████████| 59/59 [23:29<00:00, 23.89s/it]   

Classification completed successfully. Results saved to classified_tenders.csv



