In [1]:
import os
import json
import random
from datetime import datetime, timedelta

# Configuration
# Every tunable of the generator/processor lives here so a reader can adjust
# a run without touching the function bodies below.
NUM_FILES = 5000  # number of JSON files the generator loop produces
CITIES = [f"City_{i}" for i in range(100, 200)]  # City_100 .. City_199 (100 cities)
NULL_PROBABILITY = 0.005  # chance that a record gets one null field; keep within 0.001 (0.1%) - 0.005 (0.5%)
FILE_SIZE_RANGE = (50, 100)  # inclusive (min, max) number of records per file
# The directory is read from the FLIGHTS_OUTPUT_DIR environment variable when
# it is set, so the notebook can run on another machine without editing code;
# the original hard-coded location remains the default.
OUTPUT_DIR = os.environ.get(
    "FLIGHTS_OUTPUT_DIR",
    "C:/Users/rajashekarm/Downloads/Life Science/Personal_Medicine",
)


In [2]:
def generate_flight_record():
    """Build one synthetic flight record as a plain dict.

    The record carries an ISO-formatted ``date`` within one year either side
    of the current moment, an ``origin_city`` and ``destination_city`` drawn
    independently from ``CITIES``, a ``flight_duration_secs`` between 3600 and
    7200 and a ``num_passengers`` between 50 and 300.  With probability
    ``NULL_PROBABILITY`` exactly one randomly chosen field is replaced by
    ``None``.
    """
    day_offset = timedelta(days=random.randint(-365, 365))
    flight = {
        "date": (datetime.now() + day_offset).isoformat(),
        "origin_city": random.choice(CITIES),
        "destination_city": random.choice(CITIES),
        "flight_duration_secs": random.randint(3600, 7200),  # 1 to 2 hours
        "num_passengers": random.randint(50, 300),
    }

    # Most records are returned untouched.
    if random.random() >= NULL_PROBABILITY:
        return flight

    # Otherwise blank out a single field to simulate dirty data.
    nulled_field = random.choice(list(flight))
    flight[nulled_field] = None
    return flight

def generate_files():
    """Write ``NUM_FILES`` JSON files of synthetic flights into ``OUTPUT_DIR``.

    Each file holds a JSON array of between ``FILE_SIZE_RANGE[0]`` and
    ``FILE_SIZE_RANGE[1]`` records produced by ``generate_flight_record``.
    Files are named ``<MM-YY>-<city>-flights-<sequence>.json``.

    The zero-padded sequence number is what keeps the names unique: the
    month-year stamp and the city alone give at most ``len(CITIES)`` distinct
    names, so without it later files silently overwrite earlier ones.

    Note: the city in the name is random, so running the generator again adds
    new files next to the old ones; empty ``OUTPUT_DIR`` first when a fresh
    data set is wanted.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Loop-invariant: compute the stamp once so every file of a run shares it,
    # even if the run crosses a month boundary.
    date_stamp = datetime.now().strftime("%m-%y")
    index_width = len(str(NUM_FILES))

    for file_index in range(NUM_FILES):
        num_records = random.randint(*FILE_SIZE_RANGE)
        origin_city = random.choice(CITIES)
        filename = os.path.join(
            OUTPUT_DIR,
            f"{date_stamp}-{origin_city}-flights-{file_index:0{index_width}d}.json",
        )

        flights = [generate_flight_record() for _ in range(num_records)]

        # Explicit encoding so the output does not depend on the platform default.
        with open(filename, "w", encoding="utf-8") as file:
            json.dump(flights, file)

# Run the generator (side effect: writes the synthetic flight JSON files
# into OUTPUT_DIR, creating the directory if needed).
generate_files()


In [3]:
import glob
import time
from collections import defaultdict

import numpy as np
import pandas as pd

In [6]:
import os

def _aggregate_flights(filepaths):
    """Read every JSON file in ``filepaths`` and accumulate the raw aggregates.

    Each file must contain a JSON array of flight objects carrying the keys
    ``origin_city``, ``destination_city``, ``flight_duration_secs`` and
    ``num_passengers``.

    Returns:
        A tuple ``(total_records, dirty_records, durations_by_city,
        passengers_in, passengers_out)`` where ``durations_by_city`` maps a
        destination city to the list of its flight durations and the two
        passenger mappings hold per-city passenger totals.
    """
    total_records = 0
    dirty_records = 0
    durations_by_city = defaultdict(list)
    passengers_in = defaultdict(int)
    passengers_out = defaultdict(int)

    for filepath in filepaths:
        with open(filepath, "r", encoding="utf-8") as file:
            flights = json.load(file)
        total_records += len(flights)

        for flight in flights:
            # A record is dirty as soon as any of its fields is null.
            if None in flight.values():
                dirty_records += 1

            origin = flight["origin_city"]
            destination = flight["destination_city"]
            duration = flight["flight_duration_secs"]
            passengers = flight["num_passengers"]

            # Compare against None explicitly: a plain truthiness test would
            # also discard legitimate falsy values such as 0.
            if destination is not None and duration is not None:
                durations_by_city[destination].append(duration)

            if passengers is not None:
                if origin is not None:
                    passengers_out[origin] += passengers
                if destination is not None:
                    passengers_in[destination] += passengers

    return total_records, dirty_records, durations_by_city, passengers_in, passengers_out


def process_files(data_dir=None, top_n=25):
    """Aggregate the flight JSON files of a directory and save CSV reports.

    Args:
        data_dir: Directory that is scanned for ``*.json`` input files and
            that receives the CSV reports.  Defaults to ``OUTPUT_DIR``.
        top_n: Number of destination cities (ranked by how many durations
            were recorded for them) to include in the duration statistics.

    Returns:
        A dict of DataFrames keyed ``"summary"``, ``"duration_stats"``,
        ``"passengers_in"``, ``"passengers_out"`` and
        ``"max_passenger_summary"``.  Each frame is also written to
        ``<data_dir>/<key>.csv``.

    An empty directory is handled gracefully: the frames are empty (or hold
    zeros) and the busiest-city rows carry ``None`` with 0 passengers.
    """
    if data_dir is None:
        data_dir = OUTPUT_DIR
    data_dir = os.fspath(data_dir)

    start_time = time.perf_counter()

    # Escape the directory so glob metacharacters in the path are literal, and
    # sort the matches so tie-breaking below does not depend on listing order.
    pattern = os.path.join(glob.escape(data_dir), "*.json")
    filepaths = sorted(glob.glob(pattern))

    (
        total_records,
        dirty_records,
        durations_by_city,
        passengers_in,
        passengers_out,
    ) = _aggregate_flights(filepaths)

    # Flight duration stats for the top cities by number of records.
    top_cities = sorted(
        durations_by_city,
        key=lambda city: len(durations_by_city[city]),
        reverse=True,
    )[:top_n]
    duration_rows = [
        {
            "destination_city": city,
            "avg_duration": np.mean(durations_by_city[city]),
            "p95_duration": np.percentile(durations_by_city[city], 95),
        }
        for city in top_cities
    ]

    # Cities with the most arrived / departed passengers; ``default`` keeps
    # max() from raising when nothing was aggregated.
    max_in_city = max(passengers_in, key=passengers_in.get, default=None)
    max_out_city = max(passengers_out, key=passengers_out.get, default=None)

    # The timing covers reading and aggregating, not the report writing below.
    run_duration = time.perf_counter() - start_time

    results = {
        "summary": pd.DataFrame({
            "total_records": [total_records],
            "dirty_records": [dirty_records],
            "run_duration": [run_duration],
        }),
        # Explicit ``columns`` keep the header stable even with zero rows.
        "duration_stats": pd.DataFrame(
            duration_rows,
            columns=["destination_city", "avg_duration", "p95_duration"],
        ),
        "passengers_in": pd.DataFrame(
            list(passengers_in.items()), columns=["city", "passengers_in"]
        ),
        "passengers_out": pd.DataFrame(
            list(passengers_out.items()), columns=["city", "passengers_out"]
        ),
        "max_passenger_summary": pd.DataFrame({
            "city": [max_in_city, max_out_city],
            "type": ["max_in_city", "max_out_city"],
            # .get avoids inserting a None key into the aggregates.
            "passengers": [
                passengers_in.get(max_in_city, 0),
                passengers_out.get(max_out_city, 0),
            ],
        }),
    }

    # Ensure the output directory exists, then save one CSV per result frame.
    os.makedirs(data_dir, exist_ok=True)
    for name, frame in results.items():
        frame.to_csv(os.path.join(data_dir, f"{name}.csv"), index=False)

    return results

# Run processing and save results: the CSV reports are written to OUTPUT_DIR
# and the returned dict of DataFrames is kept in `results` for inspection.
results = process_files()
