In [35]:
# import the pandas library as pd to work on data
import pandas as pd

In [36]:
# loading the orders and customers data into pandas for analysis
df_orders, df_customers, new_customers, new_orders = map(
    pd.read_csv,
    ['/content/orders.csv',
     '/content/customers.csv',
     '/content/new_customers.csv',
     '/content/new_orders.csv']
    )

# printing the first 5 rows of the two datasets
df_orders.head()

Unnamed: 0,Order_ID,Customer_ID,Order_Date,Product_Name,Category,Quantity,Unit_Price ($),Total_Amount ($)
0,10001,5263,2024-07-31,Laptop,Electronics,1,251.96,251.96
1,10002,3027,2024-12-01,Laptop,Electronics,1,10.43,10.43
2,10003,4990,2023-11-01,Vacuum Cleaner,Home & Kitchen,5,432.74,2163.7
3,10004,4225,2023-08-26,Comics,Books,3,16.21,48.63
4,10005,4921,2024-01-26,T-Shirt,Clothing,1,13.43,13.43


In [37]:
df_customers.head()

Unnamed: 0,Customer_ID,Name,Email,Join_Date,Location,Membership_Level
0,2001,Liam,user2001@example.com,2024-09-09,Houston,Silver
1,2002,David,user2002@example.com,2022-09-15,Chicago,Silver
2,2003,Mason,user2003@example.com,2022-12-20,Miami,Gold
3,2004,Emma,user2004@example.com,2023-01-24,Los Angeles,Standard
4,2005,Liam,user2005@example.com,2023-09-11,Miami,Standard


In [38]:
df_orders.shape, df_customers.shape

((10000, 8), (5000, 6))

# 1️⃣ Basic Merging & Joining

In [39]:
# performing an inner join on the two datasets on the customer_id column
inner_df = pd.merge(df_orders, df_customers, on='Customer_ID')
inner_df.head()

# performing a left join on the same column
left_df = pd.merge(df_orders, df_customers, on='Customer_ID', how='left')
left_df.head()

# performing a right join on the same column
right_df = pd.merge(df_orders, df_customers, on='Customer_ID', how='right')
right_df.head()

# performing an outer join on the same column
outer_df = pd.merge(df_orders, df_customers, on='Customer_ID', how='outer')
outer_df.head()

Unnamed: 0,Order_ID,Customer_ID,Order_Date,Product_Name,Category,Quantity,Unit_Price ($),Total_Amount ($),Name,Email,Join_Date,Location,Membership_Level
0,12588.0,2001,2025-03-18,Tablet,Electronics,2.0,479.31,958.62,Liam,user2001@example.com,2024-09-09,Houston,Silver
1,18148.0,2001,2024-08-15,Tablet,Electronics,3.0,100.37,301.11,Liam,user2001@example.com,2024-09-09,Houston,Silver
2,11868.0,2002,2025-01-07,Jeans,Clothing,5.0,250.81,1254.05,David,user2002@example.com,2022-09-15,Chicago,Silver
3,15298.0,2002,2024-09-03,Foundation,Beauty,1.0,134.02,134.02,David,user2002@example.com,2022-09-15,Chicago,Silver
4,,2003,,,,,,,Mason,user2003@example.com,2022-12-20,Miami,Gold


# 2️⃣ Advanced Merging with Multiple Keys

In [40]:
# subsetting the outer_df dataframe to find the customers who didn't order
no_orders = outer_df[outer_df['Order_ID'].isna()]
print(
    f'{no_orders.shape[0]} customers out of {df_customers.shape[0]} placed no orders'
    )

# finding orders with no customer ID
orders_with_no_customers = outer_df[
    (outer_df['Customer_ID'].isna()) & (outer_df['Order_ID'].notna())
    ]

print(
    f'{orders_with_no_customers.shape[0]} orders out of {df_orders.shape[0]} had no customers'
    )

# count the number of customers with over 5 orders
over_5 = inner_df[inner_df['Quantity'] > 5.0]

print(
    f'The number of customers with orders above 5 are {over_5.shape[0]}'
)

#Counting how many Gold & Platinum customers have placed an order.
gold_platinum = inner_df[
    (inner_df['Membership_Level'] == 'Gold') | (inner_df['Membership_Level'] == 'Platinum')
    ]

print(
    f'The number of customers with Gold or Platinum membership are {gold_platinum.shape[0]}'
)

660 customers out of 5000 placed no orders
0 orders out of 10000 had no customers
The number of customers with orders above 5 are 0
The number of customers with Gold or Platinum membership are 2004


# 3️⃣ Using `.concat()` for Stacking Data

In [47]:
# stacking the datasets using `concat()`
concat_customers_df = pd.concat([df_customers, new_customers])
concat_customers_df.head()

concat_orders_df = pd.concat([df_orders, new_orders])
concat_orders_df.head()

Unnamed: 0,Order_ID,Customer_ID,Order_Date,Product_Name,Category,Quantity,Unit_Price ($),Total_Amount ($)
0,10001,5263,2024-07-31,Laptop,Electronics,1,251.96,251.96
1,10002,3027,2024-12-01,Laptop,Electronics,1,10.43,10.43
2,10003,4990,2023-11-01,Vacuum Cleaner,Home & Kitchen,5,432.74,2163.7
3,10004,4225,2023-08-26,Comics,Books,3,16.21,48.63
4,10005,4921,2024-01-26,T-Shirt,Clothing,1,13.43,13.43


In [48]:
#checking for duplicates
if concat_customers_df.duplicated().sum() > 0:
  print(
      f'There are {concat_customers_df.duplicated().sum()} duplicates in the concat_customers_df dataframe'
      )
else:
  print('There are no duplicates in the concat_customers_df dataframe')

if concat_orders_df.duplicated().sum() > 0:
  print(
      f'There are {concat_orders_df.duplicated().sum()} duplicates in the concat_orders_df dataframe'
      )
else:
  print('There are no duplicates in the concat_orders_df dataframe')


There are no duplicates in the concat_customers_df dataframe
There are no duplicates in the concat_orders_df dataframe
