### 4.10.5 Downsizing Coding Etiquette & Excel Reporting 

### Importing Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

### Setting Path

In [2]:
path = r'\Users\edwin\Instacart Basket Analysis'
path

'\\Users\\edwin\\Instacart Basket Analysis'

#### Importing the Data

In [4]:
# Load your sampled data
ords_prods_all_in = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'high_activity_customers.pkl'))


In [5]:
# Check columns
ords_prods_all_in.columns

Index(['product_id', 'product_name', 'aisle_id', 'department_id', 'prices',
       'order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_time_of_day', 'days_since_prior_order', 'first_order',
       'add_to_cart_order', 'reordered', 'existing_merge', 'merge_1',
       'price_range_loc', 'busiest_day', 'busiest_days',
       'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'mean_product_price', 'spending_flag', 'customer_frequency',
       'frequency_flag', 'First Name', 'Surnam', 'Gender', 'state', 'Age',
       'date_joined', 'n_dependants', 'fam_status', 'income', 'merge_2',
       'region', 'exclusion_flag'],
      dtype='object')

In [6]:
# Drop specific columns
columns_to_drop = ['order_number', 'product_id', 'add_to_cart_order', 'product_name', 'aisle_id']
df_dropped = ords_prods_all_in.drop(columns=columns_to_drop)
df_dropped.columns

Index(['department_id', 'prices', 'order_id', 'user_id', 'orders_day_of_week',
       'order_time_of_day', 'days_since_prior_order', 'first_order',
       'reordered', 'existing_merge', 'merge_1', 'price_range_loc',
       'busiest_day', 'busiest_days', 'busiest_period_of_day', 'max_order',
       'loyalty_flag', 'mean_product_price', 'spending_flag',
       'customer_frequency', 'frequency_flag', 'First Name', 'Surnam',
       'Gender', 'state', 'Age', 'date_joined', 'n_dependants', 'fam_status',
       'income', 'merge_2', 'region', 'exclusion_flag'],
      dtype='object')

In [7]:
df = df_dropped


In [8]:
# Creating a randomized list
np.random.seed(4)
dev = np.random.rand(len(df)) <= 0.6

In [9]:
# Store 60% of the sample in dataframe big
big = df[dev] 

# Store 40% of the sample in dataframe small
small = df[~dev]


In [10]:
# Check sizes
print("Total size:", len(df))
print("Size of 'big':", len(big))
print("Size of 'small':", len(small))


Total size: 30964564
Size of 'big': 18579523
Size of 'small': 12385041


In [15]:
# Use 'small' as the downsampled DataFrame
ords_prods_all_in_small = small


In [16]:
# Check the shape of the smaller DataFrame
print(ords_prods_all_in_small.shape)

(12385041, 33)


In [17]:
ords_prods_all_in.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30964564 entries, 0 to 32404858
Data columns (total 38 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   product_id              int32   
 1   product_name            category
 2   aisle_id                int16   
 3   department_id           int16   
 4   prices                  float32 
 5   order_id                int32   
 6   user_id                 int32   
 7   order_number            int32   
 8   orders_day_of_week      int8    
 9   order_time_of_day       int8    
 10  days_since_prior_order  float32 
 11  first_order             int8    
 12  add_to_cart_order       int16   
 13  reordered               int8    
 14  existing_merge          category
 15  merge_1                 category
 16  price_range_loc         category
 17  busiest_day             category
 18  busiest_days            category
 19  busiest_period_of_day   category
 20  max_order               int32   
 21  loyalty_fla

In [19]:
# Example: Downcast numeric columns
numeric_cols = ords_prods_all_in.select_dtypes(include=['int', 'float']).columns
ords_prods_all_in[numeric_cols] = ords_prods_all_in[numeric_cols].apply(pd.to_numeric, downcast='integer')

# Check the data types
print(ords_prods_all_in.info())


<class 'pandas.core.frame.DataFrame'>
Index: 30964564 entries, 0 to 32404858
Data columns (total 38 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   product_id              int32   
 1   product_name            category
 2   aisle_id                int16   
 3   department_id           int16   
 4   prices                  float32 
 5   order_id                int32   
 6   user_id                 int32   
 7   order_number            int8    
 8   orders_day_of_week      int8    
 9   order_time_of_day       int8    
 10  days_since_prior_order  float32 
 11  first_order             int8    
 12  add_to_cart_order       int16   
 13  reordered               int8    
 14  existing_merge          category
 15  merge_1                 category
 16  price_range_loc         category
 17  busiest_day             category
 18  busiest_days            category
 19  busiest_period_of_day   category
 20  max_order               int8    
 21  loyalty_fla

In [20]:
# Example: Convert object columns to categorical if they have limited unique values
object_cols = ords_prods_all_in.select_dtypes(include=['object']).columns
for col in object_cols:
    if len(ords_prods_all_in[col].unique()) < len(ords_prods_all_in) * 0.5:  # Adjust threshold as needed
        ords_prods_all_in[col] = ords_prods_all_in[col].astype('category')

# Check the data types
print(ords_prods_all_in.info())

<class 'pandas.core.frame.DataFrame'>
Index: 30964564 entries, 0 to 32404858
Data columns (total 38 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   product_id              int32   
 1   product_name            category
 2   aisle_id                int16   
 3   department_id           int16   
 4   prices                  float32 
 5   order_id                int32   
 6   user_id                 int32   
 7   order_number            int8    
 8   orders_day_of_week      int8    
 9   order_time_of_day       int8    
 10  days_since_prior_order  float32 
 11  first_order             int8    
 12  add_to_cart_order       int16   
 13  reordered               int8    
 14  existing_merge          category
 15  merge_1                 category
 16  price_range_loc         category
 17  busiest_day             category
 18  busiest_days            category
 19  busiest_period_of_day   category
 20  max_order               int8    
 21  loyalty_fla

### Export Dataframe

In [21]:
# Export the smaller dataframe
small.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'high_activity_customers_small.pkl'))
