In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings

# Path of the CSV export to load (relative to the current working directory).
csv_file_path = 'normify_clean.csv'

try:
    # Only the read itself is guarded, so the handlers below can only be
    # triggered by a failure to load the file.
    df = pd.read_csv(
        csv_file_path,
        encoding='utf-8',
        # A literal backslash-N and the empty string are read as missing
        # values, in addition to pandas' default NA markers.
        na_values=['\\N', ''],
        keep_default_na=True,
        delimiter=',',
    )
except FileNotFoundError:
    print("❌ Error: File not found. Please check the file path.")
except Exception as e:
    # Best-effort notebook cell: report the problem instead of raising.
    # NOTE: `df` stays undefined in that case, so later cells will fail.
    print(f"❌ Error loading file: {str(e)}")
else:
    print("✅ CSV file loaded successfully!")
    print(f"📊 Dataset shape: {df.shape}")
    print(f"📋 Columns: {list(df.columns)}")

In [None]:
import json
import os
import time
from datetime import datetime

import pandas as pd
import requests

# Configuration
API_BASE_URL = "https://app.normify.me"
API_ENDPOINT = f"{API_BASE_URL}/research/api/ultimate/"
# The token is taken from the NORMIFY_API_TOKEN environment variable when it is
# set, so the secret does not have to be written into the notebook. Otherwise
# the placeholder below is used and must be replaced by hand.
API_TOKEN = os.environ.get("NORMIFY_API_TOKEN", "YOUR BEARER TOKEN")  # You'll need to get this from your Django admin

# Headers for authentication (sent with every request to API_ENDPOINT)
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_TOKEN}"
}

# Helpers and function to call the API for a single row
def _format_api_date(value):
    """
    Convert a date cell to the 'YYYY-MM-DD' string sent to the API.

    Args:
        value: missing value, datetime-like object, or a string in
            'YYYY-MM-DD HH:MM:SS' form

    Returns:
        str or None: formatted date, or None when the value is missing

    Raises:
        ValueError: if a non-datetime value does not match the expected format
    """
    if pd.isna(value):
        return None
    # pandas Timestamps subclass datetime, so already-parsed columns work too.
    if isinstance(value, datetime):
        return value.strftime('%Y-%m-%d')
    return datetime.strptime(str(value), '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')


def _build_payload(row):
    """Build the JSON request body for one row; raises on malformed cell values."""
    return {
        "id": int(row['id']) if pd.notna(row['id']) else None,
        "identifier": str(row['identifier']) if pd.notna(row['identifier']) else "",
        "type": str(row['type']) if pd.notna(row['type']) else "standard",
        "short_title": str(row['short_title']) if pd.notna(row['short_title']) else "",
        "version_date": _format_api_date(row['version_date']),
        "last_change": _format_api_date(row['last_change'])
    }


def _row_result(row, success, status_code, **details):
    """Assemble the result dict shared by every outcome of a single API call."""
    return {
        "row_id": row['id'],
        "identifier": row['identifier'],
        "success": success,
        **details,
        "status_code": status_code
    }


def call_check_standard_version_api(row):
    """
    Call the CheckStandardVersionAPIView for a single row of data.

    The function never raises for bad cell values or request problems; every
    such failure is reported through the returned dict so that a batch run can
    continue with the next row.

    Args:
        row: pandas Series containing the row data (columns 'id',
            'identifier', 'type', 'short_title', 'version_date', 'last_change')

    Returns:
        dict: always contains 'row_id', 'identifier', 'success' and
            'status_code'; additionally 'api_response' (parsed JSON) on
            success or 'error' (message string) on failure
    """
    # Build the payload in its own guarded step: a malformed date or id must
    # produce a failed result for this row instead of aborting the whole run.
    try:
        payload = _build_payload(row)
    except (ValueError, TypeError, OverflowError) as e:
        return _row_result(row, False, None, error=f"Invalid row data: {str(e)}")

    try:
        # Make the API call
        response = requests.post(API_ENDPOINT, headers=headers, json=payload, timeout=30)

        # Anything other than HTTP 200 is reported as a failure with the body text
        if response.status_code != 200:
            return _row_result(
                row, False, response.status_code,
                error=f"HTTP {response.status_code}: {response.text}",
            )

        return _row_result(row, True, response.status_code, api_response=response.json())

    except requests.exceptions.RequestException as e:
        return _row_result(row, False, None, error=f"Request failed: {str(e)}")
    except Exception as e:
        # e.g. a 200 response whose body is not valid JSON
        return _row_result(row, False, None, error=f"Unexpected error: {str(e)}")

# Main processing loop
def process_dataframe(df, delay_between_requests=1):
    """
    Process all rows in the dataframe by calling the API for each row.

    Rows are counted by their position in `df`, not by their index label, so
    the progress output and the inter-request delay are also correct for
    slices such as `df.iloc[25:]` and for non-integer indexes.

    Args:
        df: pandas DataFrame with the required columns
        delay_between_requests: delay in seconds between API calls to avoid rate limiting

    Returns:
        list: List of results for each row, in row order
    """
    results = []
    total_rows = len(df)

    print(f"Starting to process {total_rows} rows...")
    print(f"API endpoint: {API_ENDPOINT}")
    print(f"Delay between requests: {delay_between_requests} seconds")
    print("-" * 50)

    # iterrows() yields (index label, row); the label is deliberately ignored
    # and a 1-based position is tracked separately.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        print(f"Processing row {position}/{total_rows}: ID={row['id']}, Identifier={row['identifier']}")

        # Call the API for this row
        result = call_check_standard_version_api(row)
        results.append(result)

        # Print the result
        if result['success']:
            print(f"  ✅ Success - Has newer version:{result['api_response'].get('data', {}).get('has_newer_version', 'N/A')}")
        else:
            print(f"  ❌ Failed: {result['error']}")

        # Add delay between requests (except for the last one)
        if position < total_rows:
            time.sleep(delay_between_requests)

    print("-" * 50)
    print(f"Processing complete! Processed {len(results)} rows.")

    return results

# Function to analyze results
def analyze_results(results):
    """
    Summarise the outcome of a batch of API calls.

    Args:
        results: List of result dictionaries from process_dataframe

    Returns:
        dict: Counts of total/successful/failed requests, the success rate in
            percent (0 for an empty list), the number of successful responses
            flagged with 'has_newer_version', and a per-type error tally
    """
    n_total = len(results)
    succeeded = [entry for entry in results if entry['success']]
    n_ok = len(succeeded)

    # Successful responses whose 'data' section reports a newer version
    n_newer = sum(
        1
        for entry in succeeded
        if entry['api_response'].get('data', {}).get('has_newer_version', False)
    )

    # Tally failures by the text before the first colon of the error message
    # (the whole message when it contains no colon)
    errors_by_kind = {}
    for entry in results:
        if entry['success']:
            continue
        kind = entry['error'].partition(':')[0]
        errors_by_kind[kind] = errors_by_kind.get(kind, 0) + 1

    # Guard against division by zero for an empty batch
    if n_total > 0:
        rate = n_ok / n_total * 100
    else:
        rate = 0

    return {
        "total_requests": n_total,
        "successful_requests": n_ok,
        "failed_requests": n_total - n_ok,
        "success_rate": rate,
        "newer_versions_found": n_newer,
        "error_breakdown": errors_by_kind
    }

# Function to add API results to dataframe
def add_api_results_to_dataframe(df, results):
    """
    Return a copy of `df` extended with columns taken from the API results.

    Each result is matched to the dataframe row(s) whose 'id' equals the
    result's 'row_id'; results without a matching row are ignored. Only the
    first entry of the response's 'changes' list is used.

    Args:
        df: pandas DataFrame with original data (left unmodified)
        results: List of result dictionaries from process_dataframe

    Returns:
        pandas DataFrame: Copy of the dataframe with the columns 'success',
            'has_newer_version', 'current_version_date', 'change_note',
            'effective_date', 'link' and 'retracted' added
    """
    enriched = df.copy()

    # New columns start out as "no result": success is False, the rest empty
    enriched['success'] = False
    for column in ('has_newer_version', 'current_version_date', 'change_note',
                   'effective_date', 'link', 'retracted'):
        enriched[column] = None

    for entry in results:
        matches = enriched['id'] == entry['row_id']
        if not matches.any():
            continue

        enriched.loc[matches, 'success'] = entry['success']

        # Failed calls carry no payload, so only the success flag is recorded
        if not (entry['success'] and 'api_response' in entry):
            continue

        payload = entry['api_response'].get('data', {})
        for field in ('has_newer_version', 'current_version_date', 'retracted'):
            enriched.loc[matches, field] = payload.get(field)

        # Change details come from the first entry of the 'changes' list, if any
        change_log = payload.get('changes', [])
        if not change_log:
            continue
        first_entry = change_log[0]
        for field in ('change_note', 'effective_date', 'link'):
            enriched.loc[matches, field] = first_entry.get(field)

    return enriched

# Execute the processing
if __name__ == "__main__":
    # Driver: validates the loaded dataframe, sends the first rows to the API,
    # prints a summary and writes the enriched rows to a timestamped CSV file.
    # Expects `df` to have been loaded beforehand.
    print("DataFrame info:")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print()

    # Check if required columns exist
    required_columns = ['id', 'type', 'identifier', 'short_title', 'version_date', 'last_change']
    missing_columns = [col for col in required_columns if col not in df.columns]

    if missing_columns:
        print(f"❌ Missing required columns: {missing_columns}")
        print("Please ensure your DataFrame has all required columns.")
    else:
        print("✅ All required columns found!")

        # Only the first rows are sent to the API; adjust these two values to
        # change the batch size or the pause between requests.
        max_rows = 20
        request_delay_seconds = 2

        # Process the dataframe
        df_subset = df.iloc[:max_rows]
        results = process_dataframe(df_subset, delay_between_requests=request_delay_seconds)

        # Analyze results
        summary = analyze_results(results)

        # Print summary
        print("\n" + "="*50)
        print("SUMMARY")
        print("="*50)
        print(f"Total requests: {summary['total_requests']}")
        print(f"Successful: {summary['successful_requests']}")
        print(f"Failed: {summary['failed_requests']}")
        print(f"Success rate: {summary['success_rate']:.1f}%")
        print(f"Newer versions found: {summary['newer_versions_found']}")

        if summary['error_breakdown']:
            print("\nError breakdown:")
            for error_type, count in summary['error_breakdown'].items():
                print(f"  {error_type}: {count}")

        # Add API results to dataframe
        df_with_results = add_api_results_to_dataframe(df_subset, results)

        # Save results to CSV file (timestamped so earlier runs are not overwritten)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        csv_filename = f"api_results_{timestamp}.csv"

        df_with_results.to_csv(csv_filename, index=False, encoding='utf-8')

        # Report only the columns that were actually added, not every column
        new_columns = [col for col in df_with_results.columns if col not in df_subset.columns]

        print(f"\nResults saved to: {csv_filename}")
        print(f"DataFrame shape after adding API results: {df_with_results.shape}")
        print(f"New columns added: {new_columns}")