### Data Loading 

In [None]:
pip install pandas numpy

In [5]:
from pathlib import Path

import numpy as np
import pandas as pd

In [7]:
# Directory holding the raw CSV files. It is defined once so the notebook can
# be pointed at another location by editing a single line.
DATA_DIR = Path(r'C:\Users\wwwsu\Desktop\All folders\Logistics_demand\data\raw')

# Load the three raw tables used throughout the notebook.
sales = pd.read_csv(DATA_DIR / 'sales_train_evaluation.csv')
price = pd.read_csv(DATA_DIR / 'sell_prices.csv')
calendar = pd.read_csv(DATA_DIR / 'calendar.csv')

### Optimizing the Sales DataFrame

In [8]:
# Identifier columns are converted to the memory-efficient 'category' dtype.
id_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

# Every column that is not an identifier is treated as a sales column.
sales_cols = sales.columns.difference(id_cols)

# An integer cast to uint16 wraps around silently on out-of-range values, so
# verify that every value fits before downcasting.
uint16_limits = np.iinfo(np.uint16)
sales_min = sales[sales_cols].min().min()
sales_max = sales[sales_cols].max().max()
if sales_min < uint16_limits.min or sales_max > uint16_limits.max:
    raise ValueError(
        f"Sales values span [{sales_min}, {sales_max}], which does not fit in "
        f"uint16 [{uint16_limits.min}, {uint16_limits.max}]"
    )

# Apply all conversions in one astype() call instead of a column-by-column
# loop followed by a wide block assignment.
sales_dtypes = {col: 'category' for col in id_cols}
sales_dtypes.update({col: np.uint16 for col in sales_cols})
sales = sales.astype(sales_dtypes)

# Report memory usage after the dtype optimisation.
print("Sales Data Memory Usage (MB):", sales.memory_usage(deep=True).sum() / 1024**2)

Sales Data Memory Usage (MB): 116.6247148513794


### Optimizing the Calendar DataFrame

In [9]:
# Columns to keep, in the order they should appear in the optimised frame.
calendar_cols = ['d', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
                 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
                 'snap_CA', 'snap_TX', 'snap_WI']

# Text columns stored as 'category' (in addition to the day key 'd').
cat_cols = ['weekday', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']

# Target dtype for every converted column; 'date' is parsed separately below
# and 'wm_yr_wk' keeps the dtype it was read with.
calendar_dtypes = {'d': 'category'}
calendar_dtypes.update({col: 'category' for col in cat_cols})
calendar_dtypes.update({
    'wday': np.int8,
    'month': np.int8,
    'year': np.int16,
    'snap_CA': np.int8,
    'snap_TX': np.int8,
    'snap_WI': np.int8,
})

# astype() and assign() each return a new frame, so no column is ever written
# into the result of a column selection (the pattern that triggers pandas'
# SettingWithCopyWarning).
calendar = (
    calendar[calendar_cols]
    .astype(calendar_dtypes)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

print("Calendar Data Memory Usage (MB):", calendar.memory_usage(deep=True).sum() / 1024**2)


Calendar Data Memory Usage (MB): 0.22610950469970703


### Optimizing the Price DataFrame

In [10]:
# Compact dtypes for the price frame, applied in a single pass:
# identifiers become categories, numeric columns get narrower types.
price = price.astype({
    'store_id': 'category',
    'item_id': 'category',
    'wm_yr_wk': np.int16,
    'sell_price': np.float32,
})

# Total footprint of the frame in mebibytes, including category/object data.
price_mb = price.memory_usage(deep=True).sum() / 1024**2
print("Price Data Memory Usage (MB):", price_mb)


Price Data Memory Usage (MB): 58.96129322052002


### Cleaning the data 

In [11]:
# Grand total of null cells across the entire sales frame.
total_missing_sales = sales.isna().sum().sum()
print("Missing values in sales data:", total_missing_sales, sep="\n")

Missing values in sales data:
0


In [12]:

# Null count for each column of the price frame.
missing_per_price_col = price.isna().sum()
print("Missing values in price data:", missing_per_price_col, sep="\n")

Missing values in price data:
store_id      0
item_id       0
wm_yr_wk      0
sell_price    0
dtype: int64


In [13]:
# Null count for each column of the calendar frame.
missing_per_calendar_col = calendar.isna().sum()
print("Missing values in calendar data:", missing_per_calendar_col, sep="\n")

Missing values in calendar data:
d                  0
date               0
wm_yr_wk           0
weekday            0
wday               0
month              0
year               0
event_name_1    1807
event_type_1    1807
event_name_2    1964
event_type_2    1964
snap_CA            0
snap_TX            0
snap_WI            0
dtype: int64


The missing values in the `event_name_*` and `event_type_*` columns mean that no event falls on those days. This is expected, because most days are ordinary days without a special event. These missing values are therefore not data errors; they simply represent “no event”.

### Filling missing event values with 'No_event'

In [14]:
# Event columns whose missing entries are replaced with the 'No_event' label.
event_cols = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']

for col in event_cols:
    # A categorical column can only be filled with a value that is one of its
    # categories. add_categories() raises ValueError if the category already
    # exists, so register it only when absent; this keeps the cell re-runnable.
    if 'No_event' not in calendar[col].cat.categories:
        calendar[col] = calendar[col].cat.add_categories('No_event')
    calendar[col] = calendar[col].fillna('No_event')
