In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the three source tables from CSV files located in the working directory.
customers = pd.read_csv(filepath_or_buffer="customers.csv")
orders = pd.read_csv(filepath_or_buffer="orders.csv")
order_items = pd.read_csv(filepath_or_buffer="order_items.csv")

In [None]:
# Preview the last names with surrounding whitespace removed, in sorted order.
# The result is only displayed; `customers` itself is not modified here.
(
    customers["last_name"]
    .str.strip()
    .sort_values()
)

In [None]:
# Display the orders sorted by descending `order_date` (a sorted copy; `orders` is unchanged).
orders.sort_values(by="order_date", ascending=False)

In [None]:
# Row filter: keep customers whose city, lower-cased, is either "new york" or "nyc".
is_new_york = customers["city"].str.lower().isin(["new york", "nyc"])
ny_customers = customers[is_new_york]
ny_customers.head()

In [None]:
# Keep orders dated after 2024-02-14.
# `order_date` is read from CSV without date parsing, so comparing the column
# to "2024-02-14" directly is a character-by-character string comparison that
# only orders correctly for zero-padded ISO dates. Compare parsed datetimes
# instead; the stored column itself is left untouched.
order_dates = pd.to_datetime(orders["order_date"])
recent_orders = orders[order_dates > pd.Timestamp("2024-02-14")]
recent_orders.head()

In [None]:
# Select the line items with a quantity of at least 2, ordered by ascending quantity.
# Note from an earlier run: 464 rows × 4 columns
# (presumably the size of the full `high_qty`, not of this head() preview — verify).
at_least_two = order_items["quantity"] >= 2
high_qty = order_items[at_least_two].sort_values(by="quantity")
high_qty.head()

In [None]:
# Normalise both name columns in place: trim surrounding whitespace, then
# capitalise (first character upper-case, all remaining characters lower-case).
for name_column in ("first_name", "last_name"):
    customers[name_column] = customers[name_column].str.strip().str.capitalize()


In [None]:
# Display the customers ordered by last name (a sorted copy; `customers` keeps its row order).
customers.sort_values(by="last_name")

In [None]:
# Frequency of each distinct value in the `city` column.
(
    customers["city"]
    .value_counts()
)

In [None]:
# Abbreviated city spellings and the full city name each one is replaced with.
city_standardization_mapping = {
    "LA": "Los Angeles",
    "NYC": "New York",
}

In [None]:
# Standardise city abbreviations regardless of letter case or stray whitespace.
# A plain `.replace(mapping)` only matches the exact keys ("LA", "NYC"), so a
# variant such as "nyc" or " LA " would be left as it is.
city_alias_lookup = {
    alias.lower(): full_name
    for alias, full_name in city_standardization_mapping.items()
}
city_trimmed = customers["city"].str.strip()
# Look up each lower-cased value; entries without an alias (including missing
# values) keep their trimmed original.
customers["city"] = (
    city_trimmed.str.lower().map(city_alias_lookup).fillna(city_trimmed)
)

In [None]:
# Title-case the city names in place, so that every word starts with an
# upper-case letter (e.g. "new york" -> "New York").
customers["city"] = customers["city"].str.title()

# Using .apply()

In [None]:
# Element-wise `.apply()`: the lambda receives one last-name value at a time and
# the results are stored in the new `clean_last_name` column.
# Non-string entries (e.g. NaN for a missing name) are passed through unchanged,
# because calling `.strip()` on them would raise AttributeError.
customers["clean_last_name"] = customers["last_name"].apply(
    lambda value: value.strip() if isinstance(value, str) else value
)

In [None]:
# Frequency of each distinct value in the `clean_last_name` column.
(
    customers["clean_last_name"]
    .value_counts()
)

In [None]:
# Column-wise (vectorised) line total: quantity multiplied by unit price for every row.
order_items["total_price"] = order_items["quantity"].mul(order_items["unit_price"])
order_items

In [None]:
def _line_total(row):
    """Return quantity multiplied by unit_price for a single row."""
    return row["quantity"] * row["unit_price"]


# `.apply()` is called on the whole DataFrame here; axis=1 hands the function
# one row at a time, and the per-row results form the new column.
order_items["total_price_applied"] = order_items.apply(_line_total, axis=1)
order_items

In [None]:
def _classify_order(row):
    """Label a single row from its quantity and unit price.

    Rows with a quantity of at least 3 are "Large Expensive Order" when the
    unit price is above 50 and "Bulk Order" otherwise; every other row is a
    "Standard Order".
    """
    if row["quantity"] >= 3:
        if row["unit_price"] > 50:
            return "Large Expensive Order"
        return "Bulk Order"
    return "Standard Order"


# Row-wise apply: each row is passed to the classifier defined above.
order_items["order_type"] = order_items.apply(_classify_order, axis=1)
order_items

# Merging Tables

In [None]:
# Attach the customer attributes to each order (inner join: only orders whose
# customer_id also appears in `customers` are kept).
# The join key is named explicitly: without `on`, merge joins on every column
# name the two frames have in common, so any additional shared column would
# silently change which rows match.
orders_with_customers = orders.merge(customers, on="customer_id", how="inner")
orders_with_customers.head()

In [None]:
# Add the line items to the order/customer rows, matching on order_id (inner join).
orders_customers_items = orders_with_customers.merge(
    order_items,
    on="order_id",
    how="inner",
)
orders_customers_items.head()

In [None]:
# Build the fully joined table in one chain:
# orders -> customers (on customer_id) -> order_items (on order_id), both inner joins.
combined = (
    orders
    .merge(customers, on="customer_id", how="inner")
    .merge(order_items, on="order_id", how="inner")
)
combined

# Grouping and Aggregation

In [None]:
# Sum of `total_price` over all line items.
order_items.loc[:, "total_price"].sum()

In [None]:
# Summed `total_price` per product, largest first.
revenue_by_product = order_items.groupby("product_name")["total_price"].sum()
revenue_by_product.sort_values(ascending=False)

In [None]:
# Summed `quantity` per product, largest first.
quantity_by_product = order_items.groupby("product_name")["quantity"].sum()
quantity_by_product.sort_values(ascending=False)

In [None]:
# Named aggregation per product: each keyword becomes an output column built
# from (source column, aggregation function).
items_by_product = order_items.groupby("product_name")
items_by_product.agg(
    total_revenue=("total_price", "sum"),
    avg_quantity=("quantity", "mean"),
)

In [None]:
# Summed `total_price` per customer; show the first five after sorting largest first.
spend_per_customer = combined.groupby("customer_id")["total_price"].sum()
spend_per_customer.sort_values(ascending=False).head()

# Reshaping Data

In [None]:
# Small working subset for the reshaping examples: three columns, first 12 rows.
sample = combined.loc[:, ["customer_id", "product_name", "total_price"]].head(12)
sample

In [None]:
# Long -> wide: one row per customer_id, one column per product_name, and each
# cell holds the summed total_price for that customer/product pair.
pivot = sample.pivot_table(
    index="customer_id",
    columns="product_name",
    values="total_price",
    aggfunc="sum",
)
pivot

In [None]:
# Wide -> long: turn customer_id back into a regular column, then unpivot the
# product columns into (product_name, value) pairs.
# NOTE(review): the value column is labelled "revenue_name" although it holds
# the summed total_price values — confirm the label is intended.
pivot_flat = pivot.reset_index()
pivot_flat.melt(id_vars="customer_id", var_name="product_name", value_name="revenue_name")

In [None]:
# Revenue per city, counting only rows with a quantity of at least 2, largest first.
(
    combined
    .query("quantity>=2")
    .groupby("city")
    .agg(total_revenue=("total_price", "sum"))
    .sort_values("total_revenue", ascending=False)
)