In [18]:
import requests
import json
import csv
import os
from datetime import datetime, timedelta
from itertools import combinations
import random
import time
import logging

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('flight_scraper.log'),
        logging.StreamHandler()
    ]
)

# Define your routes
top_400_routes = [
    ("DEL", "BOM"), ("DEL", "BLR"), ("DEL", "HYD"), ("DEL", "MAA"), ("DEL", "CCU"),
    ("BOM", "BLR"), ("BOM", "HYD"), ("BOM", "MAA"), ("BOM", "CCU"), ("BOM", "PNQ"),
    ("BLR", "HYD"), ("BLR", "MAA"), ("BLR", "CCU"), ("BLR", "AMD"), ("BLR", "COK"),
    ("HYD", "MAA"), ("HYD", "CCU"), ("HYD", "PNQ"), ("HYD", "JAI"), ("HYD", "IXC"),
    ("MAA", "CCU"), ("MAA", "COK"), ("MAA", "IXM"), ("MAA", "TRV"), ("MAA", "BOM"),
    ("CCU", "GAU"), ("CCU", "IXB"), ("CCU", "PAT"), ("CCU", "DEL"), ("CCU", "BLR"),
    ("PNQ", "DEL"), ("PNQ", "BLR"), ("PNQ", "HYD"), ("PNQ", "MAA"), ("PNQ", "AMD"),
    ("AMD", "DEL"), ("AMD", "BOM"), ("AMD", "BLR"), ("AMD", "HYD"), ("AMD", "MAA"),
    ("COK", "BLR"), ("COK", "MAA"), ("COK", "DEL"), ("COK", "HYD"), ("COK", "BOM"),
    ("GOI", "DEL"), ("GOI", "BOM"), ("GOI", "BLR"), ("GOI", "HYD"), ("GOI", "PNQ"),
    ("JAI", "DEL"), ("JAI", "BOM"), ("JAI", "HYD"), ("JAI", "AMD"), ("JAI", "BLR"),
    ("LKO", "DEL"), ("LKO", "BOM"), ("LKO", "BLR"), ("LKO", "HYD"), ("LKO", "CCU"),
    ("IXC", "DEL"), ("IXC", "BOM"), ("IXC", "BLR"), ("IXC", "HYD"), ("IXC", "PNQ"),
    ("TRV", "DEL"), ("TRV", "BLR"), ("TRV", "MAA"), ("TRV", "HYD"), ("TRV", "COK"),
    ("IXB", "DEL"), ("IXB", "CCU"), ("IXB", "BLR"), ("IXB", "GAU"), ("IXB", "HYD"),
    ("GAU", "DEL"), ("GAU", "CCU"), ("GAU", "BLR"), ("GAU", "HYD"), ("GAU", "IXB"),
    ("PAT", "DEL"), ("PAT", "CCU"), ("PAT", "HYD"), ("PAT", "BLR"), ("PAT", "BOM"),
    ("NAG", "DEL"), ("NAG", "BOM"), ("NAG", "HYD"), ("NAG", "BLR"), ("NAG", "MAA"),
    ("VGA", "DEL"), ("VGA", "HYD"), ("VGA", "BLR"), ("VGA", "MAA"), ("VGA", "BOM"),
    ("RPR", "DEL"), ("RPR", "BOM"), ("RPR", "BLR"), ("RPR", "HYD"), ("RPR", "MAA"),
    ("BHO", "DEL"), ("BHO", "BOM"), ("BHO", "BLR"), ("BHO", "HYD"), ("BHO", "PNQ"),
    ("SXR", "DEL"), ("SXR", "IXC"), ("SXR", "BOM"), ("SXR", "BLR"), ("SXR", "HYD"),
    ("IXZ", "MAA"), ("IXZ", "CCU"), ("IXZ", "DEL"), ("IXZ", "BLR"), ("IXZ", "BOM"),
]

# Convert to set and add more routes
route_set = set(top_400_routes)
core_airports = [
    "DEL", "BOM", "BLR", "HYD", "MAA", "CCU", "PNQ", "AMD", "COK", "JAI",
    "LKO", "IXC", "TRV", "IXB", "GAU", "PAT", "NAG", "RPR", "BHO", "SXR", "IXZ", "VGA"
]

all_possible_routes = list(combinations(core_airports, 2))
random.shuffle(all_possible_routes)

for route in all_possible_routes:
    if len(route_set) >= 400:
        break
    if route not in route_set:
        route_set.add(route)

# Final list of routes
routes = list(route_set)

# FIXED DEPARTURE DATE - August 31, 2025
FIXED_DEPARTURE_DATE = datetime(2025, 8, 31)

def fetch_flight_data(origin, destination, departure_date):
    """Fetch flight data for a specific route and date"""
    url = "http://www.nomadiq.co.in/api/search"
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Content-Type": "application/json"
    }
    
    payload = {
        "UserIp": "127.0.0.1",
        "Adult": 1,
        "Child": 0,
        "Infant": 0,
        "JourneyType": 1,
        "CabinClass": 1,
        "AirSegments": [
            {
                "Origin": origin,
                "Destination": destination,
                "PreferredTime": departure_date.strftime("%Y-%m-%dT00:00:00")
            }
        ],
        "DirectFlight": True,
        "PreferredCarriers": []
    }
    
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        logging.error(f"Request failed for {origin}-{destination}: {e}")
        return None

def parse_flight_data(data, origin, destination, search_date, departure_date):
    """Parse flight data and extract relevant information"""
    flights = []
    
    if not data or 'Result' not in data or not data['Result']:
        return flights
    
    # Calculate days before departure
    days_before_departure = (departure_date.date() - search_date.date()).days
    
    try:
        for flight_group in data['Result']:
            for flight in flight_group:
                # Extract segment information
                if 'Segments' not in flight or not flight['Segments']:
                    continue
                
                segment = flight['Segments'][0][0]  # First segment (direct flight)
                
                # Basic flight information
                flight_info = {
                    'search_date': search_date.strftime("%Y-%m-%d"),
                    'departure_date': departure_date.strftime("%Y-%m-%d"),
                    'days_before_departure': days_before_departure,
                    'origin': origin,
                    'destination': destination,
                    'departure_time': segment['Origin']['DepartTime'],
                    'arrival_time': segment['Destination']['ArrivalTime'],
                    'duration_minutes': segment['Duration'],
                    'airline_code': segment['Airline']['AirlineCode'],
                    'airline_name': segment['Airline']['AirlineName'],
                    'flight_number': segment['Airline']['FlightNumber'],
                    'aircraft': segment['Craft'],
                    'origin_terminal': segment['Origin'].get('Terminal', ''),
                    'destination_terminal': segment['Destination'].get('Terminal', ''),
                }
                
                # Extract fare information
                if 'FareList' in flight and flight['FareList']:
                    # Get minimum and maximum prices
                    prices = [fare['PublishedPrice'] for fare in flight['FareList']]
                    flight_info['min_price'] = min(prices)
                    flight_info['max_price'] = max(prices)
                    flight_info['min_published_price'] = flight.get('MinPublishedPrice', min(prices))
                    
                    # Get details from the cheapest fare
                    cheapest_fare = min(flight['FareList'], key=lambda x: x['PublishedPrice'])
                    flight_info['cabin_class'] = cheapest_fare['CabinClass']
                    flight_info['is_refundable'] = cheapest_fare['IsRefundable']
                    flight_info['fare_type'] = cheapest_fare['FareType']
                    
                    # Baggage information
                    if cheapest_fare['SeatBaggage']:
                        baggage = cheapest_fare['SeatBaggage'][0][0]
                        flight_info['checkin_baggage'] = baggage.get('CheckIn', '')
                        flight_info['cabin_baggage'] = baggage.get('Cabin', '')
                        flight_info['seats_available'] = baggage.get('NoOfSeatAvailable', '')
                else:
                    # Set default values if no fare information
                    flight_info.update({
                        'min_price': None,
                        'max_price': None,
                        'min_published_price': None,
                        'cabin_class': '',
                        'is_refundable': None,
                        'fare_type': '',
                        'checkin_baggage': '',
                        'cabin_baggage': '',
                        'seats_available': None
                    })
                
                flights.append(flight_info)
                
    except Exception as e:
        logging.error(f"Error parsing flight data for {origin}-{destination}: {e}")
    
    return flights

def save_to_csv(flights, filename='flight_non_connecting_india2.csv'):
    """Save flight data to CSV file"""
    if not flights:
        return
    
    # Check if file exists to determine if we need to write headers
    file_exists = os.path.isfile(filename)
    
    # Define CSV columns
    fieldnames = [
        'search_date', 'departure_date', 'days_before_departure', 'origin', 'destination', 
        'departure_time', 'arrival_time', 'duration_minutes', 'airline_code', 'airline_name', 
        'flight_number', 'aircraft', 'origin_terminal', 'destination_terminal', 'min_price',
        'max_price', 'min_published_price', 'cabin_class', 'is_refundable',
        'fare_type', 'checkin_baggage', 'cabin_baggage', 'seats_available'
    ]
    
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        # Write header only if file is new
        if not file_exists:
            writer.writeheader()
        
        # Write flight data
        writer.writerows(flights)

def main():
    """Main function to fetch flight data for all routes"""
    # Current date (when the search is performed)
    search_date = datetime.now()
    
    # Calculate days until departure
    days_until_departure = (FIXED_DEPARTURE_DATE.date() - search_date.date()).days
    
    # Check if departure date has passed
    if days_until_departure < 0:
        logging.error("Departure date has already passed! Please update FIXED_DEPARTURE_DATE.")
        return
    
    logging.info(f"Starting flight data collection")
    logging.info(f"Search date: {search_date.strftime('%Y-%m-%d')}")
    logging.info(f"Fixed departure date: {FIXED_DEPARTURE_DATE.strftime('%Y-%m-%d')}")
    logging.info(f"Days before departure: {days_until_departure}")
    logging.info(f"Total routes to process: {len(routes)}")
    
    all_flights = []
    successful_routes = 0
    failed_routes = 0
    
    for i, (origin, destination) in enumerate(routes, 1):
        logging.info(f"Processing route {i}/{len(routes)}: {origin} -> {destination}")
        
        # Fetch data for the fixed departure date
        data = fetch_flight_data(origin, destination, FIXED_DEPARTURE_DATE)
        
        if data:
            # Parse and store flight information
            flights = parse_flight_data(data, origin, destination, search_date, FIXED_DEPARTURE_DATE)
            if flights:
                all_flights.extend(flights)
                successful_routes += 1
                logging.info(f"Found {len(flights)} flights for {origin}-{destination}")
            else:
                logging.warning(f"No flights found for {origin}-{destination}")
                failed_routes += 1
        else:
            failed_routes += 1
        
        # Add delay to avoid overwhelming the API
        time.sleep(30)  # Adjust delay as needed
        
        # Save data periodically (every 10 routes)
        if i % 10 == 0 and all_flights:
            save_to_csv(all_flights)
            logging.info(f"Saved {len(all_flights)} flights to CSV")
            all_flights = []  # Clear the list after saving
    
    # Save any remaining flights
    if all_flights:
        save_to_csv(all_flights)
        logging.info(f"Saved final {len(all_flights)} flights to CSV")
    
    logging.info(f"Data collection complete!")
    logging.info(f"Successful routes: {successful_routes}")
    logging.info(f"Failed routes: {failed_routes}")
    
    # Save tracking information
    with open('collection_tracking.txt', 'a') as f:
        f.write(f"{search_date.strftime('%Y-%m-%d')}: Collected data {days_until_departure} days before departure\n")

In [19]:
main()

2025-07-19 16:47:12,426 - INFO - Starting flight data collection
2025-07-19 16:47:12,430 - INFO - Search date: 2025-07-19
2025-07-19 16:47:12,432 - INFO - Fixed departure date: 2025-08-31
2025-07-19 16:47:12,432 - INFO - Days before departure: 43
2025-07-19 16:47:12,433 - INFO - Total routes to process: 318
2025-07-19 16:47:12,435 - INFO - Processing route 1/318: JAI -> IXC
2025-07-19 16:47:18,174 - INFO - Found 2 flights for JAI-IXC
2025-07-19 16:47:48,205 - INFO - Processing route 2/318: IXB -> DEL
2025-07-19 16:47:50,754 - INFO - Found 8 flights for IXB-DEL
2025-07-19 16:48:20,785 - INFO - Processing route 3/318: IXC -> BOM
2025-07-19 16:48:53,681 - INFO - Processing route 4/318: BOM -> PAT
2025-07-19 16:48:56,051 - INFO - Found 4 flights for BOM-PAT
2025-07-19 16:49:26,083 - INFO - Processing route 5/318: CCU -> SXR
2025-07-19 16:49:57,453 - INFO - Processing route 6/318: LKO -> BOM
2025-07-19 16:49:59,576 - INFO - Found 9 flights for LKO-BOM
2025-07-19 16:50:29,607 - INFO - Proces