# In [None]:  (Jupyter notebook cell prompt left over from the export)
import requests
import json
import csv
import os
from datetime import datetime, timedelta
import time
import logging
from typing import List, Tuple, Dict, Optional

# Set up logging: every record goes to a log file and to the console.
# The file handler is opened as UTF-8 explicitly because this script logs
# non-ASCII status marks ("✓" / "✗"); with a platform-default encoding such as
# cp1252 those records cannot be encoded and fail to reach the file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('international_flight_scraper.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)

# Top 100 India–Foreign international routes: 10 Indian origins, each paired
# with the same 10 foreign destinations. The destination order is written out
# per origin because the flattened list order is the order in which the routes
# are iterated later on.
_INDIA_FOREIGN_GRID = (
    ("DEL", ("DXB", "DOH", "SIN", "LHR", "KUL", "BKK", "AUH", "JFK", "CDG", "FRA")),
    ("BOM", ("DXB", "DOH", "SIN", "LHR", "KUL", "BKK", "AUH", "JFK", "CDG", "FRA")),
    ("BLR", ("DXB", "DOH", "SIN", "LHR", "KUL", "BKK", "AUH", "JFK", "CDG", "FRA")),
    ("HYD", ("DXB", "DOH", "SIN", "KUL", "AUH", "BKK", "LHR", "JFK", "CDG", "FRA")),
    ("MAA", ("DXB", "DOH", "SIN", "KUL", "AUH", "BKK", "LHR", "JFK", "CDG", "FRA")),
    ("CCU", ("DXB", "DOH", "SIN", "KUL", "BKK", "AUH", "LHR", "JFK", "CDG", "FRA")),
    ("COK", ("DXB", "DOH", "SIN", "KUL", "BKK", "AUH", "LHR", "JFK", "CDG", "FRA")),
    ("AMD", ("DXB", "DOH", "AUH", "SIN", "BKK", "KUL", "LHR", "JFK", "FRA", "CDG")),
    ("TRV", ("DXB", "DOH", "AUH", "SIN", "KUL", "BKK", "LHR", "JFK", "FRA", "CDG")),
    ("GOI", ("DXB", "DOH", "AUH", "KUL", "SIN", "BKK", "FRA", "LHR", "CDG", "JFK")),
)

top_100_india_foreign_routes = [
    (origin_code, destination_code)
    for origin_code, destination_codes in _INDIA_FOREIGN_GRID
    for destination_code in destination_codes
]

# Top 50 Foreign–Foreign busiest international routes, written as
# ORIGIN-DESTINATION tokens and split into (origin, destination) tuples below.
# NOTE: the final entry, LHR-BOM, ends at BOM, which is listed in
# INDIAN_AIRPORTS, so that one route is not strictly foreign-to-foreign.
_FOREIGN_ROUTE_TOKENS = """
    LHR-JFK LAX-JFK HND-SIN CDG-JFK DXB-LHR
    LAX-LHR HND-LHR SYD-SIN CDG-DXB SIN-DXB
    FRA-JFK LHR-FRA HND-LAX JFK-GRU SFO-LHR
    LHR-YVR DXB-FRA LHR-CDG SIN-SYD LHR-MAD
    FRA-SFO JFK-NRT LAX-NRT CDG-SFO LAX-SYD
    LHR-HKG LHR-ICN FRA-ICN DOH-LHR DXB-ICN
    JFK-MEX CDG-FRA IST-LHR LHR-AMS DOH-CDG
    JFK-AMS SFO-ICN SIN-ICN DXB-JNB JFK-ZRH
    SYD-LAX JFK-HND LHR-SIN FRA-SIN IST-FRA
    BKK-HKG HND-ICN SIN-HKG DXB-KUL LHR-BOM
"""

top_50_foreign_routes = [
    tuple(token.split("-")) for token in _FOREIGN_ROUTE_TOKENS.split()
]

# Full work list: the India-origin routes first, then the foreign route list
all_international_routes = [*top_100_india_foreign_routes, *top_50_foreign_routes]

# CONFIGURATION
# Single departure day that every search in this file is made for
FIXED_DEPARTURE_DATE = datetime(year=2025, month=8, day=31)
# Endpoint the searches are POSTed to; it is blank here and has to be filled in
API_URL = ""
# Output file that parsed flight rows are appended to
CSV_FILENAME = "flight_international_routes.csv"
# JSON file that receives the per-route failure records
FAILED_ROUTES_LOG = "international_failed_routes_log.json"

# Airport code sets (listed alphabetically).
# INDIAN_AIRPORTS is what route classification checks endpoints against.
INDIAN_AIRPORTS = {
    "AMD", "BLR", "BOM", "CCU", "COK",
    "DEL", "GOI", "HYD", "MAA", "TRV",
}

# Large foreign hubs; not referenced by the rest of the code shown in this file.
MAJOR_INTERNATIONAL_HUBS = {
    "AMS", "AUH", "BKK", "CDG", "DOH", "DXB", "FRA", "HKG", "HND", "ICN",
    "IST", "JFK", "KUL", "LAX", "LHR", "NRT", "SFO", "SIN", "SYD", "YVR",
}

class InternationalFlightDataCollector:
    def __init__(self):
        """Start with no recorded failures and one reusable JSON HTTP session."""
        # "ORIGIN-DEST" -> list of failure records appended by track_failed_route
        self.failed_routes = {}

        json_headers = {
            "Accept": "application/json, text/plain, */*",
            "Content-Type": "application/json"
        }
        http_session = requests.Session()
        http_session.headers.update(json_headers)
        self.session = http_session
    
    def get_route_type(self, origin: str, destination: str) -> str:
        """Determine if route is India-Foreign, Foreign-India, or Foreign-Foreign"""
        if origin in INDIAN_AIRPORTS and destination not in INDIAN_AIRPORTS:
            return "India-Foreign"
        elif origin not in INDIAN_AIRPORTS and destination in INDIAN_AIRPORTS:
            return "Foreign-India"
        else:
            return "Foreign-Foreign"
    
    def fetch_flight_data(self, origin: str, destination: str, departure_date: datetime,
                         max_retries: int = 3) -> Optional[Dict]:
        """POST a one-way search for a single route and return the decoded JSON.

        Transient failures are retried until ``max_retries`` attempts have been
        made: HTTP 500/502/503/504 responses (with a wait that grows by 5s per
        attempt), timeouts and connection errors (fixed 5s wait). Any other
        request or decoding problem aborts at once. Every call that ends in
        failure after at least one attempt is recorded via
        ``track_failed_route``.

        Args:
            origin: Departure airport code placed in the request payload.
            destination: Arrival airport code placed in the request payload.
            departure_date: Day to search; sent as midnight of that day.
            max_retries: Maximum number of attempts. With a value <= 0 no
                request is made and ``None`` is returned.

        Returns:
            The parsed response body, or ``None`` when the route failed.
        """
        route_type = self.get_route_type(origin, destination)
        # Server-side statuses that are usually temporary and worth retrying
        transient_statuses = {500, 502, 503, 504}

        payload = {
            "UserIp": "127.0.0.1",
            "Adult": 1,
            "Child": 0,
            "Infant": 0,
            "JourneyType": 1,
            "CabinClass": 1,
            "AirSegments": [
                {
                    "Origin": origin,
                    "Destination": destination,
                    "PreferredTime": departure_date.strftime("%Y-%m-%dT00:00:00")
                }
            ],
            "DirectFlight": False,  # Set to False for international routes as they often have connections
            "PreferredCarriers": []
        }

        for attempt in range(max_retries):
            attempts_left = max_retries - attempt - 1
            try:
                response = self.session.post(API_URL, json=payload, timeout=45)  # Increased timeout for international

                if response.status_code in transient_statuses:
                    if attempts_left:
                        wait_time = 5 * (attempt + 1)
                        logging.warning(f"Server error {response.status_code} for {origin}-{destination} "
                                        f"({route_type}), attempt {attempt+1}/{max_retries}. "
                                        f"Waiting {wait_time}s...")
                        time.sleep(wait_time)
                        continue
                    self.track_failed_route(origin, destination,
                                            f"{response.status_code} Server Error", route_type)
                    return None

                response.raise_for_status()
                data = response.json()

                # Check for API-level errors; tolerate a missing or null 'Error' entry
                api_error = data.get('Error') if isinstance(data, dict) else None
                if isinstance(api_error, dict) and api_error.get('ErrorCode', 0) != 0:
                    self.track_failed_route(origin, destination,
                                            f"API Error: {api_error.get('ErrorMessage', 'unknown')}",
                                            route_type)
                    return None

                return data

            except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e:
                # Both are network-level hiccups; retry unless this was the last attempt
                if isinstance(e, requests.exceptions.Timeout):
                    reason = "Timeout"
                    logging.error(f"Timeout for {origin}-{destination} ({route_type}), attempt {attempt+1}")
                else:
                    reason = str(e)
                    logging.error(f"Connection error for {origin}-{destination} ({route_type}), "
                                  f"attempt {attempt+1}: {e}")
                if attempts_left:
                    time.sleep(5)
                else:
                    self.track_failed_route(origin, destination, reason, route_type)

            except requests.exceptions.RequestException as e:
                # Non-transient HTTP/request problems (e.g. 4xx, invalid URL): give up
                logging.error(f"Request error for {origin}-{destination} ({route_type}): {e}")
                self.track_failed_route(origin, destination, str(e), route_type)
                return None

            except Exception as e:
                # Last-resort boundary so one bad route cannot stop the whole run
                logging.error(f"Unexpected error for {origin}-{destination} ({route_type}): {e}")
                self.track_failed_route(origin, destination, f"Unexpected: {str(e)}", route_type)
                return None

        return None
    
    def track_failed_route(self, origin: str, destination: str, reason: str, route_type: str):
        """Track failed routes for analysis"""
        route = f"{origin}-{destination}"
        if route not in self.failed_routes:
            self.failed_routes[route] = []
        
        self.failed_routes[route].append({
            'timestamp': datetime.now().isoformat(),
            'reason': reason,
            'route_type': route_type
        })
        
        # Save to file
        with open(FAILED_ROUTES_LOG, 'w') as f:
            json.dump(self.failed_routes, f, indent=2)
    
    def parse_flight_data(self, data: Dict, origin: str, destination: str, 
                         search_date: datetime, departure_date: datetime) -> List[Dict]:
        """Parse flight data and extract relevant information"""
        flights = []
        
        if not data or 'Result' not in data or not data['Result']:
            return flights
        
        days_before_departure = (departure_date.date() - search_date.date()).days
        route_type = self.get_route_type(origin, destination)
        
        try:
            for flight_group in data['Result']:
                if not isinstance(flight_group, list):
                    continue
                    
                for flight in flight_group:
                    if 'Segments' not in flight or not flight['Segments']:
                        continue
                    
                    # For international flights, handle multiple segments
                    segments = flight['Segments'][0]  # Get all segments for this flight option
                    num_stops = len(segments) - 1
                    
                    # Get first and last segment for overall journey info
                    first_segment = segments[0]
                    last_segment = segments[-1]
                    
                    # Calculate total duration
                    total_duration = sum(seg.get('Duration', 0) for seg in segments)
                    total_duration += sum(seg.get('LayoverTime', 0) for seg in segments[:-1])
                    
                    # Extract flight information
                    flight_info = {
                        'search_date': search_date.strftime("%Y-%m-%d"),
                        'search_time': search_date.strftime("%H:%M:%S"),
                        'departure_date': departure_date.strftime("%Y-%m-%d"),
                        'days_before_departure': days_before_departure,
                        'route_type': route_type,
                        'origin': origin,
                        'destination': destination,
                        'departure_time': first_segment['Origin']['DepartTime'],
                        'arrival_time': last_segment['Destination']['ArrivalTime'],
                        'duration_minutes': total_duration,
                        'num_stops': num_stops,
                        'is_direct': num_stops == 0,
                        'airline_code': first_segment['Airline']['AirlineCode'],
                        'airline_name': first_segment['Airline']['AirlineName'],
                        'flight_numbers': '/'.join([seg['Airline']['FlightNumber'] 
                                                   for seg in segments]),
                        'aircraft': '/'.join([seg.get('Craft', '') for seg in segments]),
                        'origin_terminal': first_segment['Origin'].get('Terminal', ''),
                        'destination_terminal': last_segment['Destination'].get('Terminal', ''),
                    }
                    
                    # Add connection information if not direct
                    if num_stops > 0:
                        connections = []
                        for i in range(num_stops):
                            connections.append(segments[i]['Destination']['AirportCode'])
                        flight_info['connections'] = ','.join(connections)
                    else:
                        flight_info['connections'] = ''
                    
                    # Extract price information
                    if 'FareList' in flight and flight['FareList']:
                        prices = []
                        for fare in flight['FareList']:
                            if 'PublishedPrice' in fare:
                                prices.append(fare['PublishedPrice'])
                        
                        if prices:
                            flight_info['min_price'] = min(prices)
                            flight_info['max_price'] = max(prices)
                            flight_info['price_range'] = max(prices) - min(prices)
                            flight_info['num_fare_options'] = len(prices)
                        
                        # Get cheapest fare details
                        cheapest_fare = min(flight['FareList'], 
                                          key=lambda x: x.get('PublishedPrice', float('inf')))
                        
                        flight_info['cabin_class'] = cheapest_fare.get('CabinClass', '')
                        flight_info['is_refundable'] = cheapest_fare.get('IsRefundable', None)
                        flight_info['fare_type'] = cheapest_fare.get('FareType', '')
                        
                        # Baggage info
                        if cheapest_fare.get('SeatBaggage'):
                            try:
                                baggage = cheapest_fare['SeatBaggage'][0][0]
                                flight_info['checkin_baggage'] = baggage.get('CheckIn', '')
                                flight_info['cabin_baggage'] = baggage.get('Cabin', '')
                                flight_info['seats_available'] = baggage.get('NoOfSeatAvailable', '')
                            except (IndexError, TypeError):
                                pass
                    
                    # Set defaults for missing values
                    for key in ['min_price', 'max_price', 'price_range', 'num_fare_options',
                               'cabin_class', 'is_refundable', 'fare_type', 
                               'checkin_baggage', 'cabin_baggage', 'seats_available']:
                        if key not in flight_info:
                            flight_info[key] = None
                    
                    flights.append(flight_info)
                    
        except Exception as e:
            logging.error(f"Error parsing flight data for {origin}-{destination}: {e}")
        
        return flights
    
    def save_to_csv(self, flights: List[Dict]):
        """Save flight data to CSV"""
        if not flights:
            return
        
        file_exists = os.path.isfile(CSV_FILENAME)
        
        fieldnames = [
            'search_date', 'search_time', 'departure_date', 'days_before_departure',
            'route_type', 'origin', 'destination', 'departure_time', 'arrival_time', 
            'duration_minutes', 'num_stops', 'is_direct', 'connections',
            'airline_code', 'airline_name', 'flight_numbers', 'aircraft',
            'origin_terminal', 'destination_terminal', 'min_price', 'max_price',
            'price_range', 'num_fare_options', 'cabin_class', 'is_refundable',
            'fare_type', 'checkin_baggage', 'cabin_baggage', 'seats_available'
        ]
        
        with open(CSV_FILENAME, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            
            if not file_exists:
                writer.writeheader()
            
            writer.writerows(flights)
    
    def collect_data(self):
        """Run one collection pass over every route in all_international_routes.

        Each route is fetched and parsed; parsed rows are buffered and flushed
        to the CSV every 10 routes and once more at the end. Routes whose
        response holds no flights are also recorded through
        ``track_failed_route``. The pass ends by logging a summary and
        appending it to ``international_collection_summary.txt``.

        Returns ``None``. Nothing is fetched when FIXED_DEPARTURE_DATE is
        already in the past.
        """
        search_date = datetime.now()
        days_until_departure = (FIXED_DEPARTURE_DATE.date() - search_date.date()).days

        if days_until_departure < 0:
            logging.error("Departure date has passed! Update FIXED_DEPARTURE_DATE.")
            return

        total_routes = len(all_international_routes)

        logging.info("Starting international flight data collection")
        logging.info(f"Search date: {search_date.strftime('%Y-%m-%d %H:%M:%S')}")
        logging.info(f"Fixed departure date: {FIXED_DEPARTURE_DATE.strftime('%Y-%m-%d')}")
        logging.info(f"Days before departure: {days_until_departure}")
        logging.info(f"Total routes to process: {total_routes}")
        logging.info(f"India-Foreign routes: {len(top_100_india_foreign_routes)}")
        logging.info(f"Foreign-Foreign routes: {len(top_50_foreign_routes)}")

        all_flights = []  # rows waiting for the next CSV flush
        successful_routes = 0
        failed_routes = 0
        no_flights_routes = 0
        route_type_stats = {"India-Foreign": 0, "Foreign-India": 0, "Foreign-Foreign": 0}

        for i, (origin, destination) in enumerate(all_international_routes, 1):
            route_type = self.get_route_type(origin, destination)
            logging.info(f"Processing route {i}/{total_routes}: "
                        f"{origin} -> {destination} ({route_type})")

            # Fetch data
            data = self.fetch_flight_data(origin, destination, FIXED_DEPARTURE_DATE)

            if data:
                flights = self.parse_flight_data(data, origin, destination,
                                               search_date, FIXED_DEPARTURE_DATE)
                if flights:
                    all_flights.extend(flights)
                    successful_routes += 1
                    route_type_stats[route_type] += 1
                    logging.info(f"✓ Found {len(flights)} flight options for {origin}-{destination}")
                else:
                    no_flights_routes += 1
                    logging.warning(f"✗ No flights found for {origin}-{destination}")
                    self.track_failed_route(origin, destination, "No flights in response", route_type)
            else:
                failed_routes += 1
                logging.error(f"✗ Failed to fetch data for {origin}-{destination}")

            # Pace the requests, backing off once more than 40% of the routes
            # processed so far (i of them) have failed. No request follows the
            # last route, so there is nothing to wait for there.
            if i < total_routes:
                time.sleep(5 if failed_routes > 0.4 * i else 2)

            # Save periodically
            if i % 10 == 0 and all_flights:
                self.save_to_csv(all_flights)
                logging.info(f"Saved {len(all_flights)} flights to CSV")
                all_flights = []

        # Save remaining flights
        if all_flights:
            self.save_to_csv(all_flights)
            logging.info(f"Saved final {len(all_flights)} flights to CSV")

        # An empty route list must not crash the summary with a division by zero
        success_rate = successful_routes / total_routes * 100 if total_routes else 0.0

        # Final summary
        logging.info(f"\n{'='*70}")
        logging.info("International flight data collection complete!")
        logging.info(f"Successful routes: {successful_routes}")
        logging.info(f"  - India-Foreign: {route_type_stats['India-Foreign']}")
        logging.info(f"  - Foreign-India: {route_type_stats['Foreign-India']}")
        logging.info(f"  - Foreign-Foreign: {route_type_stats['Foreign-Foreign']}")
        logging.info(f"Routes with no flights: {no_flights_routes}")
        logging.info(f"Failed routes: {failed_routes}")
        logging.info(f"Total routes processed: {total_routes}")
        logging.info(f"Success rate: {success_rate:.1f}%")
        logging.info(f"{'='*70}\n")

        # Save summary (appended, so the file keeps one entry per run)
        with open('international_collection_summary.txt', 'a') as f:
            f.write(f"\n{search_date.strftime('%Y-%m-%d %H:%M:%S')} - "
                   f"Days before departure: {days_until_departure}\n")
            f.write(f"Successful: {successful_routes} "
                   f"(I-F: {route_type_stats['India-Foreign']}, "
                   f"F-I: {route_type_stats['Foreign-India']}, "
                   f"F-F: {route_type_stats['Foreign-Foreign']}), "
                   f"No flights: {no_flights_routes}, "
                   f"Failed: {failed_routes}, Total: {total_routes}\n")

def main():
    """Build a collector and run one full collection pass."""
    InternationalFlightDataCollector().collect_data()


if __name__ == "__main__":
    main()
