# Programming for Data Science Summary
## Chapter 08 - Large Datasets

In [40]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import time
import sys

path = "../P08 - Large Datasets/data/store_data.csv"
df = pd.read_csv(path)

### Space Optimization
**GOAL.** Overcome memory limitations with tricks to optimize or limit space usage.

In [23]:
# Approach 1 - Change datatypes
df['Date'] = pd.to_datetime(df['Date']) # Change to date
pd.read_csv(path, parse_dates=['Date']) # Use .read_csv() method instead

for column in df.select_dtypes(include=['object']):
    df[column] = df[column].astype('category') # Change to categorical

df['Retail_price'] = df['Retail_price'].astype('int8') # Reduce Integer Sizes

# Alternatively you can use hashmaps to load data and convert in one command
hash =  {'ProductFamily_ID' : 'category',
        'ProductCategory_ID':'category',
        'ProductBrand_ID':'category',
        'ProductName_ID':'category',
        'ProductPackSKU_ID':'category',
        'Point-of-Sale_ID':'category',
        'Value_units':'int8',
        'Value_price':'float32',
        'Unit_Price':'float32',
        'Retail_price':'float32',
        'Is_Promo':'bool'}
pd.read_csv(path, dtype=hash)

display()


In [None]:
# Remark: To check memory usage in more details, use the memory_usage="deep" option when using .info() method
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1300547 entries, 0 to 1300546
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   Unnamed: 0          1300547 non-null  int64         
 1   ProductFamily_ID    1300547 non-null  category      
 2   ProductCategory_ID  1300547 non-null  category      
 3   ProductBrand_ID     1300547 non-null  category      
 4   ProductName_ID      1300547 non-null  category      
 5   ProductPackSKU_ID   1300547 non-null  category      
 6   Point-of-Sale_ID    1300547 non-null  category      
 7   Date                1300547 non-null  datetime64[ns]
 8   Value_units         1300547 non-null  float64       
 9   Value_price         1300547 non-null  float64       
 10  Unit_Price          1300547 non-null  float64       
 11  Retail_price        1300547 non-null  int8          
 12  Is_Promo            1300547 non-null  int64         
dtypes: category(

In [None]:
# Approach 2 - Chunking
df_chunks = pd.read_csv("../P08 - Large Datasets/data/store_data.csv", chunksize=1000, engine="c") # Load data in chunks of 50 with C engine

for chunk in df_chunks:
    # Interact with chunk here
    break

# Alternatively make a list of chunks
chunks = []
for chunk in df_chunks:
    chunks.append(chunk)
    break

In [None]:
# Approach 3 - Deleting variables when it becomes useless (similar to using malloc's free )
df_chunks = pd.read_csv("../P08 - Large Datasets/data/store_data.csv", chunksize=1000, engine="c") # Load data in chunks of 50 with C engine

chunks = []
for chunk in df_chunks:
    chunks.append(chunk)

del chunk, df_chunks

In [None]:
# Remark - If you want to see a particular variable's memory usage, you have to use the sys module
memory_usage_MB = sys.getsizeof(df) / (1024 * 1024) # Convert to MB
print(f'Memory usage: {memory_usage_MB:.2f} MB')

Memory usage: 566.40 MB


### Time Optimization
Time is money. So we want to reduce our time usage as low as possible, without touching our algorithms at all (i.e. not changing our theoretical time complexity).

In [None]:
# Approach 1 - Apply .itertuples() method to iterate over rows instead of .iterrows()
for row in df.itertuples():
    # Apply stuff to rows
    break

In [33]:
# Approach 2 - Use NumPy vectorization wherever possible, using the .values attribute
df['total_cost'] = df['Value_price'].values * df['Value_units'].values # To access columns convert them to arrays

df_numpy = df.to_numpy() # Convert dataframe to array
column_dict = {column: i for i, column in enumerate(df.columns)} # Conserve columns and index to preserve interpretability
row_dict = {row: i for i, row in enumerate(df.index)}

In [29]:
# Approach 3 - Use dataframe filtering with standard method or using .loc() method; avoid using .query() method
df[df['Value_price'].values > 1000]
df.loc[df['Value_price'].values > 1000]

display()

In [None]:
# Remark - If you want to measure time, you have two methods:
%time # Jupyter will automatically measure time and display it

start_time = time.time() # Calculate time starting from a certain point
pass
final_time = time.time() - start_time # Calculate time end to a certain point



CPU times: total: 0 ns
Wall time: 0 ns
