# Imports

In [1]:
import array
import functools
import gc
import mmap
import struct
import sys
import time

import numpy as np
import pandas as pd
from memory_profiler import memory_usage

# Topics

## Variables packing with struct

In this example, the `struct` module packs two integers into a compact binary format, which makes storing and exchanging the data more efficient.

In [2]:


# A reusable codec for two native C ints laid out back to back
int_pair_codec = struct.Struct('ii')

# Serialize the pair into a single bytes object
packed_data = int_pair_codec.pack(10, 20)

# Decode the bytes back into the two Python integers
a, b = int_pair_codec.unpack(packed_data)

In [3]:
# Display the values recovered from the packed bytes
(a, b)

(10, 20)

## Faster I/O operations with mmap 

Memory-mapped files allow you to treat disk storage as if it were memory, speeding up access times for large files

In [4]:
# Memory-map the file for reading only. Both the file handle and the mapping
# are managed by `with`, so they are released even if reading or printing
# raises; length 0 maps the whole file.
with open("19- test.txt", "rb") as f:
    with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mmapped_file:
        print(mmapped_file.read())

b'line1\r\nline2\r\nline3\r\neof'


## Fixed Vs Variable Size Arrays 

In [5]:
# Plain Python list: elements are arbitrary objects
dynamic_list = [1, 2, 3, 4, 5]

# Typed array holding the same values as C ints (typecode 'i')
fixed_array = array.array('i', dynamic_list)

## Loop Optimization through Vectorization

In [6]:
# The values 1..5 stored as 32-bit integers
arr = np.arange(1, 6, dtype=np.int32)

# One vectorized multiply doubles every element without a Python-level loop
result = np.multiply(arr, 2)
result

array([ 2,  4,  6,  8, 10])

## Short Circuit Conditions

In [7]:
def complex_condition(x, y):
    """Return whether ``y / x`` exceeds 2, treating ``x == 0`` as a failed check.

    The zero test runs first and short-circuits, so the division is never
    attempted with a zero divisor.
    """
    is_nonzero = x != 0
    if not is_nonzero:
        # Bail out with the falsy comparison result before dividing
        return is_nonzero
    return y / x > 2

# A zero divisor is rejected up front, so no ZeroDivisionError is raised
complex_condition(0, 1000000000)


False

## Free Up Memory with Garbage Collection

In [8]:
# Build a large throw-away list, drop the only reference to it, then ask the
# collector to run a full pass immediately.
large_data = list(range(1_000_000))
del large_data
gc.collect()  # returns the number of unreachable objects it found

0

## Internal Vs Public Functions

Internal functions are those intended to be used only within the module where they are defined, often optimised for speed and efficiency. Public functions are exposed for external use and may include additional error handling or logging, making them slightly less efficient.

In [9]:
def _private_function(data):
    # Optimized for internal use, with minimal error handling
    return data ** 2

def public_function(data):
    """Validate ``data`` and return its square.

    Raises:
        ValueError: if ``data`` is not an ``int`` instance.
    """
    # Reject anything that is not an int before delegating to the helper
    if not isinstance(data, int):
        raise ValueError("Input must be an integer")
    return _private_function(data)

# Try a valid input followed by an invalid one. The first failure leaves the
# loop, and its message is printed instead of a traceback.
try:
    for sample in (5, 'a'):
        print(public_function(sample))
except Exception as err:
    print(err)
    

25
Input must be an integer


## Memory Optimizations 

### 2 ways to track memory consumption

In [10]:
# 1- sys.getsizeof
# Note: sys.getsizeof reports the footprint of the container object itself,
# not of the objects it references.
size = 10**7
bytes_per_mb = 1024 * 1024

# The comprehension materializes every element; the generator stays lazy
some_array = [i for i in range(size)]
some_gen = (i for i in range(size))

list_mb = sys.getsizeof(some_array) / bytes_per_mb
print(f"Size of array of len {size} is {list_mb:.2f} Mbs")


gen_mb = sys.getsizeof(some_gen) / bytes_per_mb
print(f"Size of generator of len {size} is {gen_mb:.2f} MBs")

# Rebind the name to a typed array of C ints holding the same values
some_array = array.array('i', [i for i in range(size)])
typed_array_mb = sys.getsizeof(some_array) / bytes_per_mb
print(f"Size of array of len {size}  made from array lib is {typed_array_mb:.2f} Mbs")

# Convert the typed array into a numpy array and measure that as well
some_np_array = np.array(some_array)
numpy_mb = sys.getsizeof(some_np_array) / bytes_per_mb
print(f"Size of numpy array of len {size} is {numpy_mb:.2f} MBs")




Size of array of len 10000000 is 84.97 Mbs
Size of generator of len 10000000 is 0.00 MBs
Size of array of len 10000000  made from array lib is 38.15 Mbs
Size of numpy array of len 10000000 is 38.15 MBs


In [11]:
# 2- memory_usage from memory profiler
# memory profiler is more suited with functions behaviour 

def my_function():
    """Build and return one large string (the phrase repeated 10**7 times)."""
    return 'Towards Data Science' * (10**7)

# Sample the process memory while my_function runs, then report the spread
# between the highest and lowest samples, rounded to two decimals like the
# other memory reports in this notebook.
mem_usage = memory_usage((my_function,))
print(f"Memory usage: {max(mem_usage) - min(mem_usage):.2f} MB")

Memory usage: 173.01171875 MB


In [12]:
def timer(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapper forwards positional and keyword arguments unchanged, returns
    the wrapped function's result, and keeps its ``__name__``/``__doc__``
    (via ``functools.wraps``) so decorated functions remain introspectable.

    Args:
        func: the callable to time.

    Returns:
        A wrapper with the same call signature as ``func``.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is a monotonic, high-resolution clock meant for timing
        start = time.perf_counter()
        result = func(*args, **kwargs)
        end = time.perf_counter()
        print(f"Function {func.__name__} took {(end - start):.2f} seconds")
        return result
    return wrapper


@timer
def make_list(size):
    """Sum the integers below ``size`` after materializing them in a list."""
    numbers = [i for i in range(size)]
    return sum(numbers)

@timer
def make_gen(size):
    """Sum the integers below ``size`` by streaming them from a generator."""
    numbers = (i for i in range(size))
    return sum(numbers)

In [13]:
# Time the list-based sum over ten million integers
make_list(10_000_000)

Function make_list took 1.04 seconds


49999995000000

In [14]:
# Time the generator-based sum over ten million integers
make_gen(10_000_000)

Function make_gen took 0.97 seconds


49999995000000

In [15]:
# Profile the list-based and generator-based sums back to back. Each entry is
# (banner printed before the run, function to profile, label printed after).
list_vs_gen_runs = [
    ("Metrics for the list version", make_list, "Using lists:"),
    ("Metrics for the generator version", make_gen, "Using generators:"),
]

for banner, profiled_func, label in list_vs_gen_runs:
    print(banner)
    # memory_usage samples memory at the given interval while the call runs
    mem_usage = memory_usage((profiled_func, (size,)), interval=0.1)
    print(label)
    # Spread between the highest and lowest sample taken during the call
    print(f"Memory usage: {max(mem_usage) - min(mem_usage):.2f} MB")


Metrics for the list version
Function make_list took 1.03 seconds
Using lists:
Memory usage: 381.46 MB
Metrics for the generator version
Function make_gen took 0.94 seconds
Using generators:
Memory usage: 0.00 MB


**NOTE**: The two versions take roughly the same time, but their memory usage differs hugely.

### Data Structure Matters

In [16]:
@timer
def process_with_dataframe(size):
    """Sum ``range(size)`` held as the 'numbers' column of a DataFrame."""
    frame = pd.DataFrame({'numbers': range(size)})
    return frame['numbers'].sum()

@timer
def process_with_series(size):
    """Sum ``range(size)`` held directly in a Series."""
    numbers = pd.Series(range(size))
    return numbers.sum()

In [17]:
# Profile the DataFrame-based and Series-based sums back to back. Each entry
# is (banner printed before the run, function to profile, label printed after).
frame_vs_series_runs = [
    ("Metrics for the DataFrame", process_with_dataframe, "Using Dataframes:"),
    ("Metrics for the Series ", process_with_series, "Using Series:"),
]

for banner, profiled_func, label in frame_vs_series_runs:
    print(banner)
    # memory_usage samples memory at the given interval while the call runs
    mem_usage = memory_usage((profiled_func, (size,)), interval=0.1)
    print(label)
    # Spread between the highest and lowest sample taken during the call
    print(f"Memory usage: {max(mem_usage) - min(mem_usage):.2f} MB")


Metrics for the DataFrame
Function process_with_dataframe took 0.06 seconds
Function process_with_dataframe took 0.05 seconds
Using Dataframes:
Memory usage: 98.17 MB
Metrics for the Series 
Function process_with_series took 0.03 seconds
Function process_with_series took 0.03 seconds
Function process_with_series took 0.03 seconds
Using Series:
Memory usage: 51.41 MB


In [18]:
# Categorical data helps too, compared with plain strings.

# Seeded generator so the random column is reproducible across runs
rng = np.random.default_rng(42)

# Both columns take their length from `size`, so they always match
# (the DataFrame constructor rejects columns of different lengths).
df = pd.DataFrame({
    'A': rng.choice(['foo', 'bar', 'baz', 'qux'], size=size),
    'B': range(size)
})

# Baseline footprint while column 'A' still holds Python string objects
mem_before = sys.getsizeof(df)
print(f"Memory usage for string type: {mem_before / (1024 * 1024):.2f} MB")

Memory usage for string type: 572.20 MB


In [19]:
# Re-encode the string column as a categorical: each distinct label is stored
# once and the rows keep only small integer codes.
df = df.astype({'A': 'category'})

mem_after = sys.getsizeof(df)
categorical_mb = mem_after / (1024 * 1024)
print(f"Memory usage for categorical: {categorical_mb:.2f} MB")

Memory usage for categorical: 85.83 MB


In [20]:
# The distinct labels backing the categorical column
df['A'].dtype.categories

Index(['bar', 'baz', 'foo', 'qux'], dtype='object')

In [21]:
# Integer codes of the first ten rows (positions into the categories index)
df['A'].cat.codes.head(10)

0    2
1    0
2    0
3    0
4    3
5    0
6    2
7    0
8    1
9    2
dtype: int8