# Task - 1
Please find below the client task and revert us back with the solution earliest possible by 5th Aug.

 

Task# 1

Write a python3.7+ program which would run locally on the laptop and perform the following: 

#1 Generation phase:

Program should generate N=~5000 JSON files on disk in /tmp/flights/%MM-YY%-%origin_city%-flights.json or similar folder structure where each file is a JSON array of random size M = [50 – 100] of randomly generated flights data between cities. Total set of cities is K=[100-200]. Flight record is an object containing  {date, origin_city, destination_city, flight_duration_secs, # of passengers of board}. Some records, with probability L=[0.5%-0.1%] should have NULL in any of the flight record properties.

In [1]:
import os
import json
import random
from datetime import datetime, timedelta

In [2]:
# Constants
N = 5000
M_RANGE = (50, 100)
K = random.randint(100, 200)
L_PROBABILITY = (0.005, 0.001)
CITIES = [f"City_{i}" for i in range(K)]
BASE_DIR = "/tmp/flights"
DATE_RANGE = (datetime(2020, 1, 1), datetime(2024, 12, 31))

In [3]:
K

191

In [4]:
# Ensure  base directory exists
if not os.path.exists(BASE_DIR):
    os.makedirs(BASE_DIR)

In [5]:
# function to generate a random date
def random_date(start, end):
    return start + timedelta(seconds=random.randint(0, int((end - start).total_seconds())))

In [6]:
# Function to generate a random flight record
def generate_flight_record():
    date = random_date(*DATE_RANGE).strftime("%Y-%m-%d %H:%M:%S")
    origin_city = random.choice(CITIES)
    destination_city = random.choice(CITIES)
    flight_duration_secs = random.randint(3600, 7200)
    passengers_on_board = random.randint(50, 300)

    record = {
        "date": date,
        "origin_city": origin_city,
        "destination_city": destination_city,
        "flight_duration_secs": flight_duration_secs,
        "passengers_on_board": passengers_on_board
    }
        # Randomly assign NULL to some properties based on probability L
    for key in record:
        if random.random() < random.uniform(*L_PROBABILITY):
            record[key] = None

    return record


In [7]:
# Generate the JSON files
for i in range(N):
    # Generate a random number of flights for the file
    M = random.randint(*M_RANGE)
    flights = [generate_flight_record() for _ in range(M)]

    # Create the file path
    month_year = datetime.now().strftime("%m-%Y")
    origin_city = random.choice(CITIES)
    file_dir = os.path.join(BASE_DIR, f"{month_year}-{origin_city}-flights.json")

    # Ensure the directory exists
    if not os.path.exists(os.path.dirname(file_dir)):
        os.makedirs(os.path.dirname(file_dir))

    # Write the flights data to the file
    with open(file_dir, "w") as f:
        json.dump(flights, f, indent=4)

    print(f"Generated {file_dir}")

print("Finished generating flight data.")

Generated /tmp/flights\08-2024-City_186-flights.json
Generated /tmp/flights\08-2024-City_89-flights.json
Generated /tmp/flights\08-2024-City_125-flights.json
Generated /tmp/flights\08-2024-City_99-flights.json
Generated /tmp/flights\08-2024-City_78-flights.json
Generated /tmp/flights\08-2024-City_161-flights.json
Generated /tmp/flights\08-2024-City_10-flights.json
Generated /tmp/flights\08-2024-City_115-flights.json
Generated /tmp/flights\08-2024-City_53-flights.json
Generated /tmp/flights\08-2024-City_167-flights.json
Generated /tmp/flights\08-2024-City_163-flights.json
Generated /tmp/flights\08-2024-City_47-flights.json
Generated /tmp/flights\08-2024-City_151-flights.json
Generated /tmp/flights\08-2024-City_165-flights.json
Generated /tmp/flights\08-2024-City_187-flights.json
Generated /tmp/flights\08-2024-City_143-flights.json
Generated /tmp/flights\08-2024-City_104-flights.json
Generated /tmp/flights\08-2024-City_88-flights.json
Generated /tmp/flights\08-2024-City_4-flights.json
Ge

#2 Analysis & Cleaning phase:

Program should process those files in the most optimal way and produce the following result:

- #count of total records processed, #count of dirty records and total run duration.

- AVG and P95 (95th percentile) of flight duration for Top 25 destination cities.

- Assuming cities had originally 0 passengers, find two cities with MAX passengers arrived and left.

import os
import json
import time
import numpy as np
from collections import defaultdict

In [9]:
import os
import json
import time
import numpy as np
from collections import defaultdict

In [10]:
# Constants
BASE_DIR = "/tmp/flights"

In [11]:
# Initialize counters and data structures
total_records = 0
dirty_records = 0
flight_durations = defaultdict(list)
passenger_counts = defaultdict(int)
passengers_arrived = defaultdict(int)
passengers_left = defaultdict(int)

In [12]:
# Start timing
start_time = time.time()

In [13]:
# function to process a single flight record
def process_flight_record(record):
    global total_records, dirty_records

    total_records += 1

    # Check for dirty records (i.e., records with None values)
    if any(value is None for value in record.values()):
        dirty_records += 1
        return

    # Update flight duration data
    flight_durations[record["destination_city"]].append(record["flight_duration_secs"])

    # Update passenger counts
    passengers_arrived[record["destination_city"]] += record["passengers_on_board"]
    passengers_left[record["origin_city"]] += record["passengers_on_board"]


In [14]:
# Process all JSON files in the directory
for root, dirs, files in os.walk(BASE_DIR):
    for file in files:
        if file.endswith(".json"):
            file_path = os.path.join(root, file)
            with open(file_path, "r") as f:
                flights = json.load(f)
                for flight in flights:
                    process_flight_record(flight)

In [15]:
# Calculate total run duration
end_time = time.time()
run_duration = end_time - start_time

In [16]:
# Calculate AVG and P95 flight duration for top 25 destination cities
top_25_cities = sorted(flight_durations.keys(), key=lambda x: len(flight_durations[x]), reverse=True)[:25]
avg_flight_durations = {city: np.mean(durations) for city, durations in flight_durations.items() if city in top_25_cities}
p95_flight_durations = {city: np.percentile(durations, 95) for city, durations in flight_durations.items() if city in top_25_cities}


In [17]:
# Find cities with max passengers arrived and left
max_passengers_arrived = max(passengers_arrived, key=passengers_arrived.get)
max_passengers_left = max(passengers_left, key=passengers_left.get)

In [18]:
# Print results
print(f"Total records processed: {total_records}")
print(f"Dirty records count: {dirty_records}")
print(f"Total run duration: {run_duration:.2f} seconds")

Total records processed: 14628
Dirty records count: 216
Total run duration: 20.27 seconds


In [19]:
print("\nAVG and P95 flight duration for Top 25 destination cities:")


AVG and P95 flight duration for Top 25 destination cities:


In [20]:
for city in top_25_cities:
    print(f"{city}: AVG={avg_flight_durations[city]:.2f} secs, P95={p95_flight_durations[city]:.2f} secs")


City_171: AVG=5368.36 secs, P95=6985.20 secs
City_0: AVG=5444.92 secs, P95=7030.50 secs
City_170: AVG=5478.70 secs, P95=7087.50 secs
City_153: AVG=5446.10 secs, P95=7002.25 secs
City_43: AVG=5375.05 secs, P95=7067.10 secs
City_156: AVG=5538.43 secs, P95=7012.35 secs
City_58: AVG=5264.85 secs, P95=6918.30 secs
City_10: AVG=5358.09 secs, P95=6995.45 secs
City_77: AVG=5319.17 secs, P95=6810.40 secs
City_65: AVG=5366.43 secs, P95=7102.40 secs
City_140: AVG=5719.99 secs, P95=7101.70 secs
City_123: AVG=5258.31 secs, P95=7049.50 secs
City_182: AVG=5409.24 secs, P95=6992.20 secs
City_118: AVG=5480.01 secs, P95=7015.40 secs
City_148: AVG=5440.73 secs, P95=7076.75 secs
City_157: AVG=5309.58 secs, P95=6779.75 secs
City_46: AVG=5210.08 secs, P95=6884.50 secs
City_41: AVG=5385.60 secs, P95=6965.25 secs
City_21: AVG=5203.56 secs, P95=6763.50 secs
City_187: AVG=5312.75 secs, P95=6786.60 secs
City_126: AVG=5531.55 secs, P95=7004.00 secs
City_30: AVG=5459.06 secs, P95=6982.80 secs
City_122: AVG=5466.08

In [21]:
print(f"\nCity with maximum passengers arrived: {max_passengers_arrived} ({passengers_arrived[max_passengers_arrived]} passengers)")
print(f"City with maximum passengers left: {max_passengers_left} ({passengers_left[max_passengers_left]} passengers)")


City with maximum passengers arrived: City_0 (16350 passengers)
City with maximum passengers left: City_28 (17855 passengers)
