### Build a Python application that reads a dataset (CSV) and stores it in multiple data structures (lists, tuples, sets, dictionaries). Demonstrate efficient retrieval, filtering, and aggregation using each structure. Compare their performance.

In [1]:
import csv
import io
import timeit
from typing import List, Dict, Tuple, Set, Any

In [2]:
def load_data(csv_string: str) -> List[Dict[str, Any]]:
    """Parse CSV text into a list of course records with typed numeric fields.

    Args:
        csv_string: Full CSV content, header row included.

    Returns:
        One dictionary per data row. ``CourseID``, ``DurationDays`` and
        ``Enrollments`` are converted to ``int`` and ``Price`` to ``float``;
        every other column keeps the string produced by the CSV reader.
    """
    # Conversions are applied in this order for every row.
    converters = (
        ('CourseID', int),
        ('DurationDays', int),
        ('Price', float),
        ('Enrollments', int),
    )
    parsed = []
    # io.StringIO lets csv.DictReader consume the in-memory string like a file.
    for entry in csv.DictReader(io.StringIO(csv_string)):
        for column, cast in converters:
            entry[column] = cast(entry[column])
        parsed.append(entry)
    return parsed

In [5]:

# --- 1. Define Mock CSV Data ---
# Inline sample dataset: one header row followed by eight course records.
# CourseID, DurationDays and Enrollments hold integer text and Price holds
# decimal text; load_data() converts those columns to int/float when parsing.
CSV_DATA = """CourseID,CourseName,Category,DurationDays,Price,Enrollments
1001,Intro to Python,Programming,30,99.99,5500
1002,Advanced CSS,Design,15,49.00,3200
1003,Social Media Strategy,Marketing,45,199.50,1800
1004,Data Science Fundamentals,Programming,60,299.00,8100
1005,Logo Design Masterclass,Design,20,75.50,4500
1006,Email Marketing Basics,Marketing,10,25.00,2900
1007,Web Development with Flask,Programming,40,149.99,6300
1008,UX/UI Principles,Design,35,120.00,3900
"""

# --- 2. Load and Parse Data ---

def load_data(csv_string: str) -> List[Dict[str, Any]]:
    """Read CSV text and return its rows as dictionaries with numeric columns typed.

    Args:
        csv_string: Full CSV content, header row included.

    Returns:
        A list with one dictionary per data row, where ``CourseID``,
        ``DurationDays`` and ``Enrollments`` are ``int`` and ``Price`` is
        ``float``. Remaining columns stay as strings.
    """

    def _coerce(row):
        # Replace the textual numeric fields with real numbers, in place.
        row['CourseID'] = int(row['CourseID'])
        row['DurationDays'] = int(row['DurationDays'])
        row['Price'] = float(row['Price'])
        row['Enrollments'] = int(row['Enrollments'])
        return row

    # Wrap the string in a file-like object so csv.DictReader can iterate it.
    source = io.StringIO(csv_string)
    return [_coerce(row) for row in csv.DictReader(source)]

# --- 3. Structure Conversion ---

def convert_to_structures(records: List[Dict[str, Any]]) -> Tuple[List, Tuple, Set, Dict]:
    """Build list, tuple, set and dict views of the same course records.

    Args:
        records: Course dictionaries containing at least the keys CourseID,
            CourseName, Category, DurationDays, Price and Enrollments.

    Returns:
        A 4-tuple ``(data_list, data_tuple, data_set, data_dict)``:

        * ``data_list`` - the ``records`` object itself (not a copy).
        * ``data_tuple`` - a tuple of per-record tuples in the fixed order
          (CourseID, CourseName, Category, DurationDays, Price, Enrollments).
        * ``data_set`` - a set built from those per-record tuples.
        * ``data_dict`` - a mapping from CourseID to a dictionary holding the
          record's remaining fields.
    """
    # Positional layout used by the tuple-based structures; consumers index
    # into the record tuples using this order.
    column_order = ('CourseID', 'CourseName', 'Category', 'DurationDays', 'Price', 'Enrollments')

    # Immutable positional rows.
    rows_as_tuples = tuple(
        tuple(rec[column] for column in column_order)
        for rec in records
    )

    # Hash-based collection of the same rows: supports whole-row membership tests.
    unique_rows = set(rows_as_tuples)

    # Keyed lookup table: CourseID -> every other field of the record.
    by_course_id = {}
    for rec in records:
        remainder = dict(rec)  # shallow copy so the source record keeps its CourseID
        course_id = remainder.pop('CourseID')
        by_course_id[course_id] = remainder

    return records, rows_as_tuples, unique_rows, by_course_id

# --- 4. Define Demonstration Functions ---

# --- Retrieval: Find record by CourseID (1004) ---

def retrieve_list(data: List[Dict[str, Any]], course_id: int):
    """Return the first record whose 'CourseID' equals ``course_id``, else None.

    Scans the list front to back and stops at the first match.
    """
    for entry in data:
        if entry['CourseID'] == course_id:
            return entry
    return None

def retrieve_tuple(data: Tuple[Tuple], course_id: int):
    """Return the first record tuple whose element 0 equals ``course_id``, else None.

    Scans the outer tuple in order and stops at the first match.
    """
    for row in data:
        if row[0] == course_id:
            return row
    return None

def retrieve_set(data: Set[Tuple], course_id: int):
    """Look up a record tuple in a set by its element 0.

    Returns:
        ``(record, True)`` for the first iterated record whose element 0
        equals ``course_id``, or ``(None, False)`` when none matches.
    """
    # Hashing only helps when testing membership of a complete record tuple
    # (``full_record in data``); matching on a single field still means
    # walking the elements one by one.
    for row in data:
        if row[0] == course_id:
            return row, True
    return None, False

def retrieve_dict(data: Dict[int, Dict[str, Any]], course_id: int):
    """Return the value stored under ``course_id``, or None if the key is absent."""
    match = data.get(course_id, None)
    return match

# --- Filtering: Find all courses in 'Programming' category ---

def filter_list(data: List[Dict[str, Any]], category: str):
    """Return, in original order, the records whose 'Category' equals ``category``."""
    matches = []
    for entry in data:
        if entry['Category'] == category:
            matches.append(entry)
    return matches

def filter_tuple(data: Tuple[Tuple], category: str):
    """Return a list of the record tuples whose element 2 equals ``category``."""
    category_index = 2  # position of the category field within each record tuple
    selected = []
    for row in data:
        if row[category_index] == category:
            selected.append(row)
    return selected

def filter_set(data: Set[Tuple], category: str):
    """Return a list of the set's record tuples whose element 2 equals ``category``.

    The result follows the set's iteration order.
    """
    return list(filter(lambda row: row[2] == category, data))

def filter_dict(data: Dict[int, Dict[str, Any]], category: str):
    """Return a list of the dictionary's values whose 'Category' equals ``category``.

    Keys are ignored; only the stored record dictionaries are inspected.
    """
    hits = []
    for fields in data.values():
        if fields['Category'] == category:
            hits.append(fields)
    return hits

# --- Aggregation: Sum of all Enrollments ---

def aggregate_list(data: List[Dict[str, Any]]):
    """Return the sum of the 'Enrollments' value across all records in the list."""
    return sum(map(lambda entry: entry['Enrollments'], data))

def aggregate_tuple(data: Tuple[Tuple]):
    """Return the sum of element 5 (the enrollments slot) across all record tuples."""
    enrollments_index = 5
    enrollment_counts = (row[enrollments_index] for row in data)
    return sum(enrollment_counts)

def aggregate_set(data: Set[Tuple]):
    """Return the sum of element 5 (the enrollments slot) across the set's record tuples."""
    return sum(map(lambda row: row[5], data))

def aggregate_dict(data: Dict[int, Dict[str, Any]]):
    """Return the sum of the 'Enrollments' value across all of the dictionary's values."""
    enrollment_counts = (fields['Enrollments'] for fields in data.values())
    return sum(enrollment_counts)

# --- 5. Performance Comparison ---

def measure_performance(func, args, number=1000):
    """Time ``number`` calls of ``func(*args)`` and return the total in seconds.

    Args:
        func: Callable to benchmark.
        args: Positional arguments, unpacked into every call.
        number: How many times the call is executed.

    Returns:
        Total elapsed time for all ``number`` executions, as reported by timeit.
    """
    # The zero-argument lambda is the statement timeit executes repeatedly.
    stopwatch = timeit.Timer(lambda: func(*args))
    return stopwatch.timeit(number=number)

def run_analysis():
    """Run the end-to-end demo: load data, show each operation, then time them.

    Parses ``CSV_DATA``, builds the list / tuple / set / dict representations,
    prints the result of retrieval (CourseID 1004), filtering (category
    'Programming') and aggregation (total enrollments) for each structure,
    then prints a timing table followed by a fixed textual summary.

    Returns:
        None. All results are written to stdout.
    """

    print("--- 1. Data Loading and Structure Initialization ---")
    records = load_data(CSV_DATA)
    data_list, data_tuple, data_set, data_dict = convert_to_structures(records)
    print(f"Total Records Loaded: {len(records)}")
    print(f"List of Dictionaries (Example): {data_list[0]['CourseName']}")
    print(f"Dictionary (Example): {data_dict[1001]['Category']}\n")

    # --- 2. Demonstrations (using CourseID 1004 and Category 'Programming') ---

    # Retrieval
    print("--- 2.1 Retrieval: Find CourseID 1004 ---")
    print(f"List Result:   {retrieve_list(data_list, 1004)['CourseName']}")
    print(f"Tuple Result:  {retrieve_tuple(data_tuple, 1004)[1]}")
    # The set lookup returns (record, found); only the flag is reported here.
    _, found = retrieve_set(data_set, 1004)
    print(f"Set Result:    {'Found' if found else 'Not Found'}")
    print(f"Dict Result:   {retrieve_dict(data_dict, 1004)['CourseName']}\n")

    # Filtering
    print("--- 2.2 Filtering: Courses in 'Programming' Category ---")
    print(f"List Found:    {len(filter_list(data_list, 'Programming'))} courses")
    print(f"Tuple Found:   {len(filter_tuple(data_tuple, 'Programming'))} courses")
    print(f"Set Found:     {len(filter_set(data_set, 'Programming'))} courses")
    print(f"Dict Found:    {len(filter_dict(data_dict, 'Programming'))} courses\n")

    # Aggregation
    print("--- 2.3 Aggregation: Total Enrollments ---")
    print(f"List Total:    {aggregate_list(data_list)}")
    print(f"Tuple Total:   {aggregate_tuple(data_tuple)}")
    print(f"Set Total:     {aggregate_set(data_set)}")
    print(f"Dict Total:    {aggregate_dict(data_dict)}\n")

    # --- 3. Performance Analysis (10,000 runs each) ---

    N_RUNS = 10000
    print(f"--- 3. Performance Comparison (Time in seconds over {N_RUNS:,} runs) ---")
    print(f"Data Set Size: {len(records)} records.")
    print("-" * 75)
    print(f"{'Structure':<15} | {'Retrieval (1004)':<20} | {'Filtering (Prog)':<20} | {'Aggregation (Sum)':<15}")
    print("-" * 75)

    # One entry per structure: (row label, data, retrieve fn, filter fn, aggregate fn).
    # Every label must fit the 15-character first column; the previous
    # 16-character 'Dictionary (Key)' pushed that row's separators one column
    # to the right of the other rows.
    benchmarks = [
        ('List (Dicts)', data_list, retrieve_list, filter_list, aggregate_list),
        ('Tuple (Tuples)', data_tuple, retrieve_tuple, filter_tuple, aggregate_tuple),
        ('Set (Tuples)', data_set, retrieve_set, filter_set, aggregate_set),
        ('Dict (Key)', data_dict, retrieve_dict, filter_dict, aggregate_dict),
    ]

    for label, data, retrieve, select, aggregate in benchmarks:
        # Only the dict retrieval is a direct key lookup; every other
        # operation timed here iterates over the records.
        time_ret = measure_performance(retrieve, (data, 1004), N_RUNS)
        time_filt = measure_performance(select, (data, 'Programming'), N_RUNS)
        time_agg = measure_performance(aggregate, (data,), N_RUNS)
        print(f"{label:<15} | {time_ret:<20.6f} | {time_filt:<20.6f} | {time_agg:<15.6f}")
    print("-" * 75)

    print("\n--- Summary of Performance ---")
    print("1. Retrieval (by ID): **Dictionary** is drastically faster (O(1)) because it uses a hash map for direct key access. The others require a linear search (O(N)).")
    print("2. Filtering & Aggregation: Performance is very similar (O(N)) across all structures, as they all require iterating over every record to check a field or sum a value.")
    print("3. Set Limitation: While sets offer O(1) for *membership* (checking if an exact record exists), they are less efficient than lists/tuples for field-based retrieval and aggregation.")


# Entry point: run the full demonstration only when this module is executed
# directly (its __name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    run_analysis()

--- 1. Data Loading and Structure Initialization ---
Total Records Loaded: 8
List of Dictionaries (Example): Intro to Python
Dictionary (Example): Programming

--- 2.1 Retrieval: Find CourseID 1004 ---
List Result:   Data Science Fundamentals
Tuple Result:  Data Science Fundamentals
Set Result:    Found
Dict Result:   Data Science Fundamentals

--- 2.2 Filtering: Courses in 'Programming' Category ---
List Found:    3 courses
Tuple Found:   3 courses
Set Found:     3 courses
Dict Found:    3 courses

--- 2.3 Aggregation: Total Enrollments ---
List Total:    36200
Tuple Total:   36200
Set Total:     36200
Dict Total:    36200

--- 3. Performance Comparison (Time in seconds over 10,000 runs) ---
Data Set Size: 8 records.
---------------------------------------------------------------------------
Structure       | Retrieval (1004)     | Filtering (Prog)     | Aggregation (Sum)
---------------------------------------------------------------------------
List (Dicts)    | 0.006977            