# Step 1: Loading Data and Merging DataFrames

In [3]:
# Read the pickled 'orders_products_combined' frame from the working directory.
import pandas as pd

# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
combined_pickle = 'orders_products_combined.pkl'
orders_products_combined = pd.read_pickle(combined_pickle)

# Show (rows, columns) as a quick sanity check on what was loaded.
print(orders_products_combined.shape)

(3421083, 18)


In [4]:
import pandas as pd

# Read the product table from its CSV export.
# NOTE(review): absolute, machine-specific path - the cell only runs where this
# folder exists.
df_prods = pd.read_csv(r'C:\Users\Asus\Music\CareerFoundry_Python_Session\products_cleaned.csv')

# Inner join on product_id: only rows whose key occurs in both frames survive.
# NOTE(review): the preview below shows `_x`-suffixed columns (the two frames
# share column names besides the key), order_id equal to product_id in every
# previewed row, and the notebook's final shape has far fewer rows than the
# loaded frame - confirm both inputs and the join key are the intended ones.
ords_prods_merge = pd.merge(
    left=orders_products_combined,
    right=df_prods,
    how='inner',
    on='product_id',
)

# Preview the first rows of the joined frame.
print(ords_prods_merge.head())

   order_id  user_id_x eval_set_x  order_number_x  order_dow_x  \
0     23391          7      prior              17            0   
1     19256         13      prior               4            1   
2      8382         23      prior               2            0   
3      7099         27      prior              63            3   
4     14400         36      prior              10            1   

   order_hour_of_day_x  days_since_prior_order_x  product_id  \
0                   10                      28.0     23391.0   
1                   12                       9.0     19256.0   
2                   10                       9.0      8382.0   
3                   10                       1.0      7099.0   
4                   18                       1.0     14400.0   

                                     product_name_x  aisle_id_x  ...  \
0                               Organic Fennel Bulb       100.0  ...   
1             Belgian Chocolate Chocolate Ice Cream        37.0  ...   
2 

In [12]:
# Write the merged frame to the working directory as gzip-compressed CSV.
# NOTE(review): the file name ends in '.csv' although the content is gzip.
# pandas infers compression from the extension by default, so reading this file
# back presumably needs an explicit compression='gzip'; a '.csv.gz' name would
# avoid that - TODO confirm how downstream code reads it before renaming.
ords_prods_merge.to_csv('ords_prods_merge.csv', compression='gzip')

# Create the price_label Column

In [13]:
import numpy as np

# Fix the global NumPy seed so the generated prices are the same on every run.
np.random.seed(42)

# One uniformly distributed draw between 1 and 20 per row.
# NOTE(review): these prices are randomly generated rather than read from the
# product data - confirm that a synthetic price column is what is wanted.
row_count = len(ords_prods_merge)
ords_prods_merge['price'] = np.random.uniform(low=1, high=20, size=row_count)


In [14]:
# Bucket every price into a label using whole-column comparisons instead of a
# per-row Python lambda:
#   price > 10  -> 'High'
#   price < 5   -> 'Low'
#   otherwise   -> 'Medium' (5 to 10 inclusive; missing prices also land here,
#                            exactly as with the previous lambda)
# np.select picks the first matching condition, which preserves the original
# High-before-Low precedence.
price_values = ords_prods_merge['price']
ords_prods_merge['price_label'] = np.select(
    [price_values > 10, price_values < 5],
    ['High', 'Low'],
    default='Medium',
)

In [15]:
# Tally how many rows fell into each price bucket.
price_label_tally = ords_prods_merge['price_label'].value_counts()
print(price_label_tally)

price_label
High      26090
Medium    13170
Low       10428
Name: count, dtype: int64


# Step 2: Modify busiest_day to Busiest days and Add Slowest days

In [16]:
# Row count per day-of-week code; value_counts() lists the most frequent first.
dow_column = ords_prods_merge['order_dow_x']
day_orders = dow_column.value_counts()
print(day_orders)

order_dow_x
0    8748
1    8492
2    6862
5    6583
6    6536
3    6349
4    6118
Name: count, dtype: int64


In [17]:
# day_orders comes from value_counts(), so its index runs from the most to the
# least frequent day: the first two entries are the busiest days and the last
# two are the slowest.
ranked_days = day_orders.index
busiest_days = [*ranked_days[:2]]
slowest_days = [*ranked_days[-2:]]

In [18]:
# Create the busiest_days column with whole-column membership tests instead of
# a per-row Python lambda:
#   day in busiest_days -> 'Busiest days'
#   day in slowest_days -> 'Slowest days'
#   anything else       -> 'Regular days'
# np.select takes the first matching condition, so a day that appeared in both
# lists would still be labelled 'Busiest days', as before.
order_days = ords_prods_merge['order_dow_x']
ords_prods_merge['busiest_days'] = np.select(
    [order_days.isin(busiest_days), order_days.isin(slowest_days)],
    ['Busiest days', 'Slowest days'],
    default='Regular days',
)

In [19]:
# Tally how many rows received each day-activity label.
busiest_days_tally = ords_prods_merge['busiest_days'].value_counts()
print(busiest_days_tally)

busiest_days
Regular days    19981
Busiest days    17240
Slowest days    12467
Name: count, dtype: int64


# Step 3: Check Values for Accuracy

In [20]:
# Spot check: show the day code next to its label for the first ten rows.
check_columns = ['order_dow_x', 'busiest_days']
print(ords_prods_merge[check_columns].head(10))

   order_dow_x  busiest_days
0            0  Busiest days
1            1  Busiest days
2            0  Busiest days
3            3  Slowest days
4            1  Busiest days
5            1  Busiest days
6            1  Busiest days
7            2  Regular days
8            1  Busiest days
9            5  Regular days


# Step 4: Create busiest_period_of_day Column

In [21]:
def label_period(hour):
    """Return the part-of-day label for an hour value.

    Half-open windows, checked in order:
        5 <= hour < 12   -> 'Morning'
        12 <= hour < 17  -> 'Afternoon'
        17 <= hour < 21  -> 'Evening'
    Any value that falls in none of the windows returns 'Night'.
    """
    windows = (
        (5, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 21, 'Evening'),
    )
    for start, stop, label in windows:
        if start <= hour < stop:
            return label
    return 'Night'

In [22]:
# Label every row's order hour with its part of the day via label_period.
order_hours = ords_prods_merge['order_hour_of_day_x']
ords_prods_merge['busiest_period_of_day'] = order_hours.apply(label_period)

In [23]:
# Verify the new column: rows per part of the day.
period_tally = ords_prods_merge['busiest_period_of_day'].value_counts()
print(period_tally)

busiest_period_of_day
Afternoon    20234
Morning      16517
Evening       9560
Night         3377
Name: count, dtype: int64


# Step 5: Label the Periods Based on Order Frequency

In [24]:
# Rows per part of the day; value_counts() lists the most frequent first.
period_column = ords_prods_merge['busiest_period_of_day']
period_counts = period_column.value_counts()
print(period_counts)

busiest_period_of_day
Afternoon    20234
Morning      16517
Evening       9560
Night         3377
Name: count, dtype: int64


In [25]:
# period_counts comes from value_counts(), so its index runs from the most to
# the least frequent period: take the first entry and the last entry.
ranked_periods = period_counts.index
most_orders_periods = [*ranked_periods[:1]]      # highest frequency
fewest_orders_periods = [*ranked_periods[-1:]]   # lowest frequency

In [26]:
# Assign frequency labels with whole-column membership tests instead of a
# per-row Python lambda:
#   period in most_orders_periods   -> 'Most orders'
#   period in fewest_orders_periods -> 'Fewest orders'
#   anything else                   -> 'Average orders'
# np.select takes the first matching condition, so 'Most orders' keeps
# precedence over 'Fewest orders', as before.
day_periods = ords_prods_merge['busiest_period_of_day']
ords_prods_merge['busiest_period_label'] = np.select(
    [day_periods.isin(most_orders_periods), day_periods.isin(fewest_orders_periods)],
    ['Most orders', 'Fewest orders'],
    default='Average orders',
)

In [27]:
# Verify the new column: rows per frequency label.
period_label_tally = ords_prods_merge['busiest_period_label'].value_counts()
print(period_label_tally)

busiest_period_label
Average orders    26077
Most orders       20234
Fewest orders      3377
Name: count, dtype: int64


# Step 6: Clean and Structure the Notebook

# Step 7: Export the DataFrame as a Pickle File

In [28]:
# Persist the labelled frame as a pickle.
# NOTE(review): absolute, machine-specific path - the cell only runs where this
# folder exists.
final_pickle_path = r'C:\Users\Asus\Music\CareerFoundry_Python_Session\ords_prods_final.pkl'
ords_prods_merge.to_pickle(final_pickle_path)

In [29]:
# Read the exported pickle back and show its (rows, columns) shape to confirm
# the export round-trips.
exported_pickle_path = r'C:\Users\Asus\Music\CareerFoundry_Python_Session\ords_prods_final.pkl'
df = pd.read_pickle(exported_pickle_path)
print(df.shape)

(49688, 26)
