# In [None]:  (notebook cell marker left over from the export)
import requests
import json
import csv
import os
from datetime import datetime, timedelta
from itertools import combinations
import random
import time
import logging
from typing import List, Tuple, Dict, Optional

# Set up logging: every record is written both to a persistent log file and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Pin the file encoding so non-ASCII text (e.g. inside logged exception
        # messages) is written the same way on every platform instead of
        # depending on the locale's default encoding.
        logging.FileHandler('flight_scraper_daily.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)

# Seed routes, one direction per airport pair (placeholder list, extend as needed)
top_400_routes = [
    ("DEL", "BOM"), ("DEL", "BLR"), ("DEL", "HYD"), ("DEL", "MAA"), ("DEL", "CCU"),
    ("BOM", "BLR"), ("BOM", "HYD"), ("BOM", "MAA"), ("BOM", "CCU"), ("BOM", "PNQ"),
    # ... rest of your routes ...
]

# Airports whose pairings are used to pad the seed list
core_airports = [
    "DEL", "BOM", "BLR", "HYD", "MAA", "CCU", "PNQ", "AMD", "COK", "JAI",
    "LKO", "IXC", "TRV", "IXB", "GAU", "PAT", "NAG", "RPR", "BHO", "SXR", "IXZ", "VGA"
]

# Every unordered pair of core airports, visited in random order so that the
# 400-route cap (when it applies) does not always favour the same pairs
all_possible_routes = list(combinations(core_airports, 2))
random.shuffle(all_possible_routes)

# Start from the seed routes and top up with extra pairs until the cap is hit
route_set = set(top_400_routes)
for route in all_possible_routes:
    if len(route_set) >= 400:
        break
    route_set.add(route)  # re-adding a pair that is already present is a no-op

# Final list of routes
routes = list(route_set)

# CONFIGURATION
DAYS_TO_COLLECT = 30  # Number of consecutive departure dates queried per run, starting tomorrow
API_URL = "http://www.nomadiq.co.in/api/search"  # Endpoint the search payload is POSTed to
CSV_FILENAME = "flight_prices_daily.csv"  # Output file; parsed flight rows are appended to it
FAILED_ROUTES_LOG = "failed_routes_daily_log.json"  # JSON dump of the failed route/date lookups

class FlightDataCollector:
    def __init__(self):
        """Create an empty failure registry and a reusable JSON HTTP session."""
        json_headers = {
            "Accept": "application/json, text/plain, */*",
            "Content-Type": "application/json"
        }
        http_session = requests.Session()
        http_session.headers.update(json_headers)

        # Maps "ORIGIN-DEST-YYYY-MM-DD" to the list of failures recorded for it.
        self.failed_routes = {}
        self.session = http_session
    
    def fetch_flight_data(self, origin: str, destination: str, departure_date: datetime,
                         max_retries: int = 3) -> Optional[Dict]:
        """POST a search for one route and departure date, retrying transient failures.

        Transient server-side responses (HTTP 500, 502, 503, 504) are retried
        after a linearly growing pause (5s, 10s, ...); timeouts are retried
        after 3s. Any other request error, and an API-level error reported in
        the response body, is not retried. Failures are recorded through
        ``track_failed_route``.

        Args:
            origin: Airport code sent as the segment's ``Origin``.
            destination: Airport code sent as the segment's ``Destination``.
            departure_date: Departure day; only its date part is sent.
            max_retries: Maximum number of attempts before giving up.

        Returns:
            The decoded JSON response, or ``None`` when the lookup failed.
        """
        payload = {
            "UserIp": "127.0.0.1",
            "Adult": 1,
            "Child": 0,
            "Infant": 0,
            "JourneyType": 1,
            "CabinClass": 1,
            "AirSegments": [
                {
                    "Origin": origin,
                    "Destination": destination,
                    "PreferredTime": departure_date.strftime("%Y-%m-%dT00:00:00")
                }
            ],
            "DirectFlight": False,
            "PreferredCarriers": []
        }

        # Statuses that usually signal a temporary server or gateway problem.
        # Previously only 500 was retried, so a momentary 502/503/504 failed
        # the lookup immediately.
        retryable_statuses = (500, 502, 503, 504)

        for attempt in range(max_retries):
            is_last_attempt = attempt >= max_retries - 1
            try:
                response = self.session.post(API_URL, json=payload, timeout=30)

                if response.status_code in retryable_statuses:
                    if is_last_attempt:
                        self.track_failed_route(origin, destination,
                                                f"{response.status_code} Server Error", departure_date)
                        return None
                    wait_time = 5 * (attempt + 1)
                    logging.warning(f"Server error {response.status_code} for {origin}-{destination}, "
                                  f"attempt {attempt+1}/{max_retries}. Waiting {wait_time}s...")
                    time.sleep(wait_time)
                    continue

                # Remaining 4xx/5xx statuses are treated as permanent failures.
                response.raise_for_status()
                data = response.json()

                if 'Error' in data and data['Error']['ErrorCode'] != 0:
                    self.track_failed_route(origin, destination, f"API Error: {data['Error']['ErrorMessage']}", departure_date)
                    return None

                return data

            except requests.exceptions.Timeout:
                # Must be handled before RequestException, which is its base class.
                logging.error(f"Timeout for {origin}-{destination}, attempt {attempt+1}")
                if is_last_attempt:
                    self.track_failed_route(origin, destination, "Timeout", departure_date)
                else:
                    time.sleep(3)

            except requests.exceptions.RequestException as e:
                logging.error(f"Request error for {origin}-{destination}: {e}")
                self.track_failed_route(origin, destination, str(e), departure_date)
                return None

            except Exception as e:
                # Last-resort boundary: one bad response must not stop the whole run.
                logging.error(f"Unexpected error for {origin}-{destination}: {e}")
                self.track_failed_route(origin, destination, f"Unexpected: {str(e)}", departure_date)
                return None

        return None
    
    def track_failed_route(self, origin: str, destination: str, reason: str, departure_date: datetime):
        """Track failed routes for analysis"""
        route = f"{origin}-{destination}-{departure_date.strftime('%Y-%m-%d')}"
        if route not in self.failed_routes:
            self.failed_routes[route] = []
        
        self.failed_routes[route].append({
            'timestamp': datetime.now().isoformat(),
            'reason': reason
        })
        
        with open(FAILED_ROUTES_LOG, 'w') as f:
            json.dump(self.failed_routes, f, indent=2)
    
    def parse_flight_data(self, data: Dict, origin: str, destination: str, 
                         search_date: datetime, departure_date: datetime) -> List[Dict]:
        """Parse flight data and extract relevant information"""
        flights = []
        
        if not data or 'Result' not in data or not data['Result']:
            return flights
        
        days_before_departure = (departure_date.date() - search_date.date()).days
        
        try:
            for flight_group in data['Result']:
                if not isinstance(flight_group, list):
                    continue
                    
                for flight in flight_group:
                    if 'Segments' not in flight or not flight['Segments']:
                        continue
                    
                    segments = flight['Segments'][0]
                    num_stops = len(segments) - 1
                    
                    first_segment = segments[0]
                    last_segment = segments[-1]
                    
                    total_duration = 0
                    for seg in segments:
                        total_duration += seg.get('Duration', 0)
                    for i in range(len(segments) - 1):
                        total_duration += segments[i].get('LayoverTime', 0)
                    
                    connecting_airports = []
                    if num_stops > 0:
                        for i in range(num_stops):
                            connecting_airports.append(segments[i]['Destination']['AirportCode'])
                    
                    flight_numbers = []
                    airline_codes = []
                    for seg in segments:
                        flight_numbers.append(str(seg['Airline']['FlightNumber']))
                        airline_codes.append(seg['Airline']['AirlineCode'])
                    
                    # Basic flight information
                    flight_info = {
                        'search_date': search_date.strftime("%Y-%m-%d"),
                        'search_time': search_date.strftime("%H:%M:%S"),
                        'departure_date': departure_date.strftime("%Y-%m-%d"),
                        'days_before_departure': days_before_departure,
                        'origin': origin,
                        'destination': destination,
                        'departure_time': first_segment['Origin']['DepartTime'],
                        'arrival_time': last_segment['Destination']['ArrivalTime'],
                        'duration_minutes': total_duration,
                        'stops': num_stops,
                        'is_direct': num_stops == 0,
                        'connecting_airports': ','.join(connecting_airports) if connecting_airports else '',
                        'airline_codes': '/'.join(set(airline_codes)),
                        'airline_name': first_segment['Airline']['AirlineName'],
                        'flight_numbers': '/'.join(flight_numbers),
                        'aircraft': '/'.join([seg.get('Craft', '') for seg in segments]),
                        'origin_terminal': first_segment['Origin'].get('Terminal', ''),
                        'destination_terminal': last_segment['Destination'].get('Terminal', ''),
                    }
                    
                    # Extract price information
                    if 'FareList' in flight and flight['FareList']:
                        prices = []
                        for fare in flight['FareList']:
                            if 'PublishedPrice' in fare:
                                prices.append(fare['PublishedPrice'])
                        
                        if prices:
                            flight_info['min_price'] = min(prices)
                            flight_info['max_price'] = max(prices)
                            flight_info['price_range'] = max(prices) - min(prices)
                            flight_info['num_fare_options'] = len(prices)
                        
                        cheapest_fare = min(flight['FareList'], 
                                          key=lambda x: x.get('PublishedPrice', float('inf')))
                        
                        flight_info['cabin_class'] = cheapest_fare.get('CabinClass', '')
                        flight_info['is_refundable'] = cheapest_fare.get('IsRefundable', None)
                        flight_info['fare_type'] = cheapest_fare.get('FareType', '')
                        
                        if cheapest_fare.get('SeatBaggage'):
                            try:
                                baggage = cheapest_fare['SeatBaggage'][0][0]
                                flight_info['checkin_baggage'] = baggage.get('CheckIn', '')
                                flight_info['cabin_baggage'] = baggage.get('Cabin', '')
                                flight_info['seats_available'] = baggage.get('NoOfSeatAvailable', '')
                            except (IndexError, TypeError):
                                pass
                    
                    for key in ['min_price', 'max_price', 'price_range', 'num_fare_options',
                               'cabin_class', 'is_refundable', 'fare_type', 
                               'checkin_baggage', 'cabin_baggage', 'seats_available']:
                        if key not in flight_info:
                            flight_info[key] = None
                    
                    flights.append(flight_info)
                    
        except Exception as e:
            logging.error(f"Error parsing flight data for {origin}-{destination}: {e}")
        
        return flights
    
    def save_to_csv(self, flights: List[Dict]):
        """Save flight data to CSV"""
        if not flights:
            return
        
        file_exists = os.path.isfile(CSV_FILENAME)
        
        fieldnames = [
            'search_date', 'search_time', 'departure_date', 'days_before_departure',
            'origin', 'destination', 'departure_time', 'arrival_time', 'duration_minutes',
            'stops', 'is_direct', 'connecting_airports', 'airline_codes', 'airline_name', 
            'flight_numbers', 'aircraft', 'origin_terminal', 'destination_terminal', 
            'min_price', 'max_price', 'price_range', 'num_fare_options', 'cabin_class', 
            'is_refundable', 'fare_type', 'checkin_baggage', 'cabin_baggage', 'seats_available'
        ]
        
        with open(CSV_FILENAME, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            
            if not file_exists:
                writer.writeheader()
            
            writer.writerows(flights)
    
    def collect_data(self):
        """Query every route for each of the next ``DAYS_TO_COLLECT`` departure dates.

        For each departure date (starting tomorrow) and each route, the
        response is fetched and parsed. Parsed flights are buffered and
        appended to the CSV once the buffer reaches 1000 rows. At the end a
        run summary is logged and appended to ``daily_collection_summary.txt``.

        Buffered rows are also flushed when the run is interrupted by an
        exception or Ctrl-C, so flights that were already parsed are not lost;
        the interruption itself still propagates to the caller.
        """
        search_date = datetime.now()
        tomorrow = search_date.date() + timedelta(days=1)
        last_departure = tomorrow + timedelta(days=DAYS_TO_COLLECT-1)
        total_calls = len(routes) * DAYS_TO_COLLECT
        collected_at = search_date.strftime('%Y-%m-%d %H:%M:%S')

        logging.info("Starting daily flight data collection")
        logging.info(f"Collection timestamp: {collected_at}")
        logging.info(f"Collecting data for departures from {tomorrow} to {last_departure}")
        logging.info(f"Total routes to process: {len(routes)}")
        logging.info(f"Total departure dates: {DAYS_TO_COLLECT}")
        logging.info(f"Total API calls: {total_calls}")

        all_flights = []
        total_successful = 0
        total_failed = 0
        total_no_flights = 0

        try:
            # Collect data for each departure date
            for day_offset in range(1, DAYS_TO_COLLECT + 1):
                departure_date = datetime.combine(
                    search_date.date() + timedelta(days=day_offset),
                    datetime.min.time()
                )
                departure_label = departure_date.strftime('%Y-%m-%d')

                logging.info(f"\n{'='*60}")
                logging.info(f"Collecting for departure date: {departure_label} "
                           f"(Day {day_offset}/{DAYS_TO_COLLECT})")
                logging.info(f"Days before departure: {day_offset}")

                day_flights = 0
                day_successful = 0

                for i, (origin, destination) in enumerate(routes, 1):
                    if i % 50 == 0:
                        logging.info(f"Progress: Route {i}/{len(routes)} for {departure_label}")

                    data = self.fetch_flight_data(origin, destination, departure_date)

                    if data:
                        flights = self.parse_flight_data(data, origin, destination,
                                                       search_date, departure_date)
                        if flights:
                            all_flights.extend(flights)
                            day_flights += len(flights)
                            day_successful += 1
                            total_successful += 1
                        else:
                            # A response arrived but held no usable flights.
                            total_no_flights += 1
                            self.track_failed_route(origin, destination, "No flights in response", departure_date)
                    else:
                        total_failed += 1

                    # Rate limiting
                    time.sleep(1)

                    # Save periodically to avoid memory issues
                    if len(all_flights) >= 1000:
                        # Detach the batch first so that a failed write is not
                        # retried (and partially duplicated) by the final flush.
                        batch, all_flights = all_flights, []
                        self.save_to_csv(batch)
                        logging.info(f"Saved {len(batch)} flights to CSV")

                logging.info(f"Day {departure_label} summary: "
                           f"{day_flights} flights from {day_successful} routes")
        finally:
            # Save remaining flights, including after an interrupted run
            if all_flights:
                self.save_to_csv(all_flights)
                logging.info(f"Saved final {len(all_flights)} flights to CSV")

        # Guard against an empty route list or DAYS_TO_COLLECT == 0.
        success_rate = (total_successful / total_calls * 100) if total_calls else 0.0

        # Final summary
        logging.info(f"\n{'='*60}")
        logging.info("Daily collection complete!")
        logging.info(f"Collection date: {collected_at}")
        logging.info(f"Total successful API calls: {total_successful}")
        logging.info(f"Total failed API calls: {total_failed}")
        logging.info(f"Total routes with no flights: {total_no_flights}")
        logging.info(f"Total API calls made: {total_calls}")
        logging.info(f"Success rate: {success_rate:.1f}%")
        logging.info(f"{'='*60}\n")

        # Save summary
        with open('daily_collection_summary.txt', 'a') as f:
            f.write(f"\nCollection Date: {collected_at}\n")
            f.write(f"Departure dates: {tomorrow} to {last_departure}\n")
            f.write(f"Successful: {total_successful}, Failed: {total_failed}, "
                   f"No flights: {total_no_flights}, Total calls: {total_calls}\n")
            f.write(f"Success rate: {success_rate:.1f}%\n")
            f.write("-" * 80 + "\n")

def main():
    """Run one complete collection pass with a freshly created collector."""
    FlightDataCollector().collect_data()

if __name__ == "__main__":
    main()
