In [1]:
# Import the NumPy library
import numpy as np
import numpy.ma as ma # Import the masked array module

# --- 1. Structured Arrays ---
# A structured array stores records (comparable to C structs or database rows):
# every element has the same named fields, and each field can have its own dtype.

print("--- Structured Arrays ---")

# Describe the record layout as a list of (field_name, type_code) pairs.
# Type codes used here: 'U10' = unicode string of length 10,
# 'i4' = 32-bit integer, 'f8' = 64-bit float.
structured_dtype = np.dtype(
    [
        ('name', 'U10'),
        ('age', 'i4'),
        ('height_m', 'f8'),
    ]
)
print(f"Structured dtype definition:\n{structured_dtype}")

# One tuple per record, with values in the same order as the fields above.
data = [
    ('Alice', 30, 1.65),
    ('Bob', 25, 1.80),
    ('Charlie', 35, 1.75),
]

# Build the array. The result is 1-D with one element per record, so the
# shape printed below is (3,).
structured_arr = np.array(data, dtype=structured_dtype)
print(
    f"\nStructured array:\n{structured_arr}",
    f"Shape: {structured_arr.shape}",
    sep="\n",
)

# Integer indexing selects one whole record (row).
print(f"\nFirst record: {structured_arr[0]}")

# Indexing with a field name selects that field (column) across all records.
print(
    f"\nNames: {structured_arr['name']}",
    f"Ages: {structured_arr['age']}",
    sep="\n",
)

# The two kinds of indexing can be chained in either order to reach one value.
print(
    f"\nAge of Bob: {structured_arr[1]['age']}",          # record first, then field
    f"Height of Alice: {structured_arr['height_m'][0]}",  # field first, then record
    sep="\n",
)

# Takeaway: structured arrays fit heterogeneous data with a fixed layout.
print("-" * 30)


# --- 2. Masked Arrays (np.ma) ---
# Arrays designed to handle missing or invalid data points.
# A masked array pairs a data array with a boolean mask of the same shape;
# wherever the mask is True, the corresponding data element is treated as
# invalid or missing.

print("--- Masked Arrays ---")
# Raw data that uses -1 as a sentinel for "missing" (e.g. an unknown age)
data_with_missing = np.array([1, 2, -1, 4, 5, -1, 7])
print(f"Original data with -1 for missing: {data_with_missing}")

# Build a masked array in which every -1 is masked
masked_arr = ma.masked_values(data_with_missing, value=-1)
print(f"\nMasked array (where -1 is masked):\n{masked_arr}")
print(f"Data part: {masked_arr.data}")  # Underlying values, sentinels included
print(f"Mask part: {masked_arr.mask}")  # True where data is masked

# Build a masked array from an explicit, hand-written mask
data_values = np.array([10., 20., 30., np.nan, 50., 60.])
mask_explicit = [False, False, False, True, False, False] # Mask the 4th element (NaN)
masked_arr_explicit = ma.array(data_values, mask=mask_explicit)
print(f"\nMasked array with explicit mask:\n{masked_arr_explicit}")
print(f"Mask part: {masked_arr_explicit.mask}")

# Let NumPy find the invalid entries: masked_invalid masks NaN and inf
masked_invalid = ma.masked_invalid(data_values)
print(f"\nArray masked for invalid values (NaN, inf):\n{masked_invalid}")
print(f"Mask part: {masked_invalid.mask}")

# Reductions on masked arrays skip the masked entries
print(f"\nSum of valid elements in masked_arr: {masked_arr.sum()}") # Ignores -1 values
print(f"Mean of valid elements in masked_arr: {masked_arr.mean()}") # Ignores -1 values
print(f"Mean of valid elements in masked_invalid: {masked_invalid.mean()}") # Ignores NaN

# Indexing a masked position returns the special `ma.masked` constant (shown as --);
# indexing an unmasked position returns the ordinary value.
# Index 2 holds the -1 sentinel, so it is masked, not valid.
print(f"\nElement at index 2 (masked): {masked_arr[2]}") # Output: -- (or masked)
print(f"Element at index 3 (valid): {masked_arr[3]}") # Output: 4

# Filling masked values: get a plain ndarray with masked entries replaced
filled_arr = masked_arr.filled(fill_value=0) # Replace masked values with 0
print(f"\nMasked array filled with 0: {filled_arr}")

# Masked arrays are very useful in data analysis when dealing with imperfect data.
print("-" * 30)


# --- 3. Performance Considerations (Conceptual) ---
#
# Vectorization
#   Prefer vectorized operations (element-wise ops, ufuncs, aggregations) over
#   explicit Python loops across array elements; NumPy's C implementation is
#   much faster. Example: write `arr * 2`, not `[x * 2 for x in arr]`.
#
# Memory layout (C vs Fortran order)
#   NumPy arrays are stored contiguously in memory.
#   - C order (row-major, the default): each row's elements are contiguous,
#     so accessing rows is faster.
#   - Fortran order (column-major): each column's elements are contiguous,
#     so accessing columns is faster.
#       arr_c = np.zeros((1000, 1000), order='C')
#       arr_f = np.zeros((1000, 1000), order='F')
#   Walking along the contiguous dimension is generally faster, e.g.
#   `arr_c.sum(axis=1)` (row sums) might be faster for C order and
#   `arr_f.sum(axis=0)` (column sums) might be faster for F order.
#   The default C order is usually fine; keep this in mind only when
#   performance is critical.
#
# Avoid unnecessary copies
#   Be mindful of views vs. copies (Section IV). Fancy indexing, boolean
#   indexing and `astype` create copies, while slicing usually creates views.
#   Needless copying costs memory and time. When the original array is no
#   longer needed, consider in-place operations (`arr *= 2` rather than
#   `arr = arr * 2`).

print(
    "--- Performance Considerations (See Comments in Code) ---",
    "Key takeaways: Prioritize vectorization, be aware of memory layout (C/F order),",
    "and minimize unnecessary data copies.",
    "-" * 30,
    sep="\n",
)


# --- 4. Interoperability (Conceptual) ---
#
# NumPy arrays are the de facto standard container for numerical data in the
# Python ecosystem, and many other libraries build on them:
#   - Pandas: DataFrames are built upon NumPy arrays, and converting between
#     the two is easy.
#   - SciPy: extends NumPy with more advanced scientific computing functions
#     (integration, optimization, signal processing) that operate on NumPy arrays.
#   - Scikit-learn: machine learning library that expects data primarily as
#     NumPy arrays or SciPy sparse matrices.
#   - Matplotlib/Seaborn: visualization libraries that plot data directly
#     from NumPy arrays.
#   - TensorFlow/PyTorch: deep learning frameworks that often use NumPy arrays
#     for data input/output and manipulation alongside their own tensor
#     structures.
#
# Understanding NumPy is therefore crucial for effectively using almost any
# data science or scientific computing library in Python.

print(
    "--- Interoperability (See Comments in Code) ---",
    "NumPy arrays are the common language for numerical data between libraries",
    "like Pandas, SciPy, Scikit-learn, Matplotlib, etc.",
    "-" * 30,
    sep="\n",
)

--- Structured Arrays ---
Structured dtype definition:
[('name', '<U10'), ('age', '<i4'), ('height_m', '<f8')]

Structured array:
[('Alice', 30, 1.65) ('Bob', 25, 1.8 ) ('Charlie', 35, 1.75)]
Shape: (3,)

First record: ('Alice', 30, 1.65)

Names: ['Alice' 'Bob' 'Charlie']
Ages: [30 25 35]

Age of Bob: 25
Height of Alice: 1.65
------------------------------
--- Masked Arrays ---
Original data with -1 for missing: [ 1  2 -1  4  5 -1  7]

Masked array (where -1 is masked):
[1 2 -- 4 5 -- 7]
Data part: [ 1  2 -1  4  5 -1  7]
Mask part: [False False  True False False  True False]

Masked array with explicit mask:
[10.0 20.0 30.0 -- 50.0 60.0]
Mask part: [False False False  True False False]

Array masked for invalid values (NaN, inf):
[10.0 20.0 30.0 -- 50.0 60.0]
Mask part: [False False False  True False False]

Sum of valid elements in masked_arr: 19
Mean of valid elements in masked_arr: 3.8
Mean of valid elements in masked_invalid: 34.0

Element at index 2 (valid): --
Element at index 3 