# Day 7: Celebrity Product Drops Sales Performance Analysis

You are a Product Analyst working on Nike's marketing performance team. Your team wants to evaluate the effectiveness of celebrity product collaborations by analyzing sales data. You will investigate the performance of celebrity product drops to inform future marketing strategies.

In [None]:
import pandas as pd
import numpy as np

# Raw celebrity-collaboration sales records, one dict per sale.
# Missing sale amounts are written as None: the original export used the JSON
# literal `null`, which is not a Python name and raised NameError when the
# cell ran. pandas treats None as a missing value, so isnull() flags these rows.
fct_sales_data = [
    {"sale_id": 1, "sale_date": "2025-01-10", "product_id": 901, "sale_amount": None, "celebrity_id": 101},
    {"sale_id": 2, "sale_date": "2025-01-15", "product_id": 901, "sale_amount": 1500, "celebrity_id": 101},
    {"sale_id": 3, "sale_date": "2025-02-03", "product_id": 902, "sale_amount": 2000.5, "celebrity_id": 102},
    {"sale_id": 4, "sale_date": "2025-03-12", "product_id": 903, "sale_amount": 2500.75, "celebrity_id": 103},
    {"sale_id": 5, "sale_date": "2025-03-20", "product_id": 904, "sale_amount": None, "celebrity_id": 104},
    {"sale_id": 6, "sale_date": "2025-02-28", "product_id": 901, "sale_amount": 1000, "celebrity_id": 101},
    {"sale_id": 7, "sale_date": "2025-03-25", "product_id": 902, "sale_amount": 300, "celebrity_id": 102},
    {"sale_id": 8, "sale_date": "2025-03-30", "product_id": 905, "sale_amount": 1800, "celebrity_id": 105},
    {"sale_id": 9, "sale_date": "2025-01-20", "product_id": 903, "sale_amount": 1200, "celebrity_id": 103},
    {"sale_id": 10, "sale_date": "2025-02-05", "product_id": 906, "sale_amount": 500, "celebrity_id": 106},
    {"sale_id": 11, "sale_date": "2025-03-01", "product_id": 907, "sale_amount": 2200, "celebrity_id": 107},
    {"sale_id": 12, "sale_date": "2025-02-15", "product_id": 908, "sale_amount": 1300, "celebrity_id": 101},
    {"sale_id": 13, "sale_date": "2025-03-15", "product_id": 909, "sale_amount": None, "celebrity_id": 102},
    {"sale_id": 14, "sale_date": "2025-01-25", "product_id": 910, "sale_amount": 900, "celebrity_id": 108},
    {"sale_id": 15, "sale_date": "2025-02-20", "product_id": 905, "sale_amount": 700, "celebrity_id": 105},
    {"sale_id": 16, "sale_date": "2025-03-28", "product_id": 902, "sale_amount": 1500, "celebrity_id": 102},
    {"sale_id": 17, "sale_date": "2024-11-15", "product_id": 901, "sale_amount": 800, "celebrity_id": 101},
    {"sale_id": 18, "sale_date": "2024-07-30", "product_id": 902, "sale_amount": 1000, "celebrity_id": 102},
    {"sale_id": 19, "sale_date": "2025-04-10", "product_id": 905, "sale_amount": 2000, "celebrity_id": 105},
    {"sale_id": 20, "sale_date": "2024-09-05", "product_id": 903, "sale_amount": 1100, "celebrity_id": 103},
]

# Tabular view used by every question below; sale_date is still a string here.
fct_sales = pd.DataFrame(fct_sales_data)


## Question 1

For Q1 2025 (January 1st through March 31st, 2025), can you identify all records of celebrity collaborations from the sales data where the sale_amount is missing? This will help us flag incomplete records that could impact the analysis of Nike's product performance.

In [None]:
# Note: pandas and numpy are already imported as pd and np
# The following tables are loaded as pandas DataFrames with the same names: fct_sales
# Please print your final result or dataframe

# Question 1: flag the Q1 2025 sales whose sale_amount is missing.

# Divider line printed (followed by a blank line) between output sections.
RULE = "=" * 150

# Work on a copy so the shared fct_sales table is never modified.
q1_fct_sales_df = fct_sales.copy()
print(q1_fct_sales_df)
print(RULE, end="\n\n")

# Inspect dtypes and non-null counts. info() writes its report itself and
# returns None, so the surrounding print also emits a "None" line.
print(q1_fct_sales_df.info())
print(RULE, end="\n\n")

# Parse the "YYYY-MM-DD" strings into datetimes so they can be range-filtered.
q1_fct_sales_df["sale_date"] = pd.to_datetime(q1_fct_sales_df["sale_date"], format="%Y-%m-%d")
print(q1_fct_sales_df.info())
print()
print(q1_fct_sales_df.head())
print(RULE, end="\n\n")

# Put the sales in chronological order and renumber the index for readability.
q1_fct_sales_df = q1_fct_sales_df.sort_values(by="sale_date").reset_index(drop=True)
print(q1_fct_sales_df)
print(RULE, end="\n\n")

# Keep only Q1 2025: January 1st through March 31st, 2025, both ends inclusive.
in_q1_2025 = q1_fct_sales_df["sale_date"].between("2025-01-01", "2025-03-31")
Q1_df = q1_fct_sales_df[in_q1_2025]
print(Q1_df)
print(RULE, end="\n\n")

# From here on Q1_df holds only the Q1 2025 rows that have no sale_amount.
Q1_df = Q1_df[Q1_df["sale_amount"].isna()]
print(Q1_df)
print(RULE, end="\n\n")

# Question 1 answer: how many incomplete records there are, and which ones.
print("Question 1 Answer: There are", len(Q1_df), "records with missing sale_amount in Q1 2025.")
print("These records are:")
print(Q1_df)
print(RULE, end="\n\n")

## Question 2

For Q1 2025 (January 1st through March 31st, 2025), can you list the unique combinations of celebrity_id and product_id from the sales table? This will ensure that each collaboration is accurately accounted for in the analysis of Nike's marketing performance.

In [None]:
# Note: pandas and numpy are already imported as pd and np
# The following tables are loaded as pandas DataFrames with the same names: fct_sales
# Please print your final result or dataframe

# Divider line printed between output sections.
RULE = "=" * 150

# ---- Recap of Question 1 (rebuilds the Q1 2025 slice used below) ------------

# Work on a copy so the shared fct_sales table is never modified.
q1_fct_sales_df = fct_sales.copy()
print(q1_fct_sales_df)
print(RULE, end="\n\n")

# Inspect dtypes and non-null counts. info() returns None, so the surrounding
# print also emits a "None" line.
print(q1_fct_sales_df.info())
print(RULE, end="\n\n")

# Parse the "YYYY-MM-DD" strings into datetimes so they can be range-filtered.
q1_fct_sales_df["sale_date"] = pd.to_datetime(q1_fct_sales_df["sale_date"], format="%Y-%m-%d")
print(q1_fct_sales_df.info())
print()
print(q1_fct_sales_df.head())
print(RULE, end="\n\n")

# Put the sales in chronological order and renumber the index for readability.
q1_fct_sales_df = q1_fct_sales_df.sort_values(by="sale_date").reset_index(drop=True)
print(q1_fct_sales_df)
print(RULE, end="\n\n")

# Keep only Q1 2025: January 1st through March 31st, 2025, both ends inclusive.
in_q1_2025 = q1_fct_sales_df["sale_date"].between("2025-01-01", "2025-03-31")
Q1_df = q1_fct_sales_df[in_q1_2025]
print(Q1_df)
print(RULE, end="\n\n")

# Q1 2025 rows with no sale_amount, kept separately so Q1_df stays complete.
Q1_missing_df = Q1_df[Q1_df["sale_amount"].isna()]
print(Q1_missing_df)
print(RULE, end="\n\n")

# Question 1 answer, repeated here for context.
print("Question 1 Answer: There are", len(Q1_missing_df), "records with missing sale_amount in Q1 2025.")
print("These records are:")
print(Q1_missing_df)
print(RULE, end="\n\n")


# ---- Visual break between the recap and Question 2 --------------------------
print()
print(RULE)
print(RULE, end="\n\n")

# ---- Question 2: unique celebrity/product combinations in Q1 2025 ----------

# Show the full Q1 2025 slice (rows with missing amounts included) and its
# schema before grouping.
print(Q1_df)
print(Q1_df.info())
print(RULE, end="\n\n")

# One output row per distinct (celebrity_id, product_id) pair; "count" is the
# number of Q1 2025 sale rows that belong to that pair.
pair_sizes = Q1_df.groupby(["celebrity_id", "product_id"]).size()
Q1_unique_combinations = pair_sizes.rename("count").reset_index()
print(Q1_unique_combinations)


# Question 2 answer: the table of unique combinations.
print("Question 2 Answer: The following table shows the unique combinations of celebrity_id and product_id in Q1 2025.")
print(Q1_unique_combinations)

## Question 3

For Q1 2025 (January 1st through March 31st, 2025), can you rank the unique celebrity collaborations based on their total sales amounts and list the top 3 collaborations in descending order? This will help recommend the most successful partnerships for Nike's future product drop strategies.

In [None]:
# Note: pandas and numpy are already imported as pd and np
# The following tables are loaded as pandas DataFrames with the same names: fct_sales
# Please print your final result or dataframe

# Divider line printed between output sections.
RULE = "=" * 150

# ---- Question 1 (recap) ------------------------------------------------------

# Work on a copy so the shared fct_sales table is never modified.
q1_fct_sales_df = fct_sales.copy()
print(q1_fct_sales_df)
print(RULE, end="\n\n")

# Inspect dtypes and non-null counts. info() returns None, so the surrounding
# print also emits a "None" line.
print(q1_fct_sales_df.info())
print(RULE, end="\n\n")

# Parse the "YYYY-MM-DD" strings into datetimes so they can be range-filtered.
q1_fct_sales_df["sale_date"] = pd.to_datetime(q1_fct_sales_df["sale_date"], format="%Y-%m-%d")
print(q1_fct_sales_df.info())
print()
print(q1_fct_sales_df.head())
print(RULE, end="\n\n")

# Put the sales in chronological order and renumber the index for readability.
q1_fct_sales_df = q1_fct_sales_df.sort_values(by="sale_date").reset_index(drop=True)
print(q1_fct_sales_df)
print(RULE, end="\n\n")

# Keep only Q1 2025: January 1st through March 31st, 2025, both ends inclusive.
in_q1_2025 = q1_fct_sales_df["sale_date"].between("2025-01-01", "2025-03-31")
Q1_df = q1_fct_sales_df[in_q1_2025]
print(Q1_df)
print(RULE, end="\n\n")

# Q1 2025 rows with no sale_amount, kept separately so Q1_df stays complete.
Q1_missing_df = Q1_df[Q1_df["sale_amount"].isna()]
print(Q1_missing_df)
print(RULE, end="\n\n")

# Question 1 answer, repeated here for context.
print("Question 1 Answer: There are", len(Q1_missing_df), "records with missing sale_amount in Q1 2025.")
print("These records are:")
print(Q1_missing_df)
print(RULE, end="\n\n")


# ---- Visual break between Question 1 and Question 2 -------------------------
print()
print(RULE)
print(RULE, end="\n\n")

# ---- Question 2 (recap) ------------------------------------------------------

# Show the full Q1 2025 slice and its schema before grouping.
print(Q1_df)
print(Q1_df.info())
print(RULE, end="\n\n")

# One output row per distinct (celebrity_id, product_id) pair; "count" is the
# number of Q1 2025 sale rows that belong to that pair.
pair_sizes = Q1_df.groupby(["celebrity_id", "product_id"]).size()
Q1_unique_combinations = pair_sizes.rename("count").reset_index()
print(Q1_unique_combinations)
print(RULE, end="\n\n")

# Question 2 answer, repeated here for context.
print("Question 2 Answer: The following table shows the unique combinations of celebrity_id and product_id in Q1 2025.")
print(Q1_unique_combinations)
print(RULE, end="\n\n")


# ---- Visual break between Question 2 and Question 3 -------------------------
print()
print(RULE)
print(RULE, end="\n\n")
# -----------------------------------------------------------------------------

# Question 3: rank Q1 2025 collaborations by total sales and report the top 3.

# Printing Q1 dataframe and its info for verification
print(Q1_df)
print(Q1_df.info())
print("=" * 150)
print()

# Total sales per (celebrity_id, product_id) collaboration. groupby already
# returns the groups in key order, so no additional sort is needed here.
# Missing sale_amount values are skipped by the sum, so a collaboration whose
# Q1 sales are all missing shows a total of 0 rather than NaN.
Q1_sales_collabs = (
    Q1_df.groupby(['celebrity_id', 'product_id'])
    .agg(total_sales_volume=('sale_amount', 'sum'))
    .reset_index()
)
print(Q1_sales_collabs)
print("=" * 150)
print()

# Rank the collaborations by total sales, highest first. The ids act as
# tie-breakers so the row order is reproducible when totals are equal.
Q1_ranked_collabs = Q1_sales_collabs.sort_values(
    by=['total_sales_volume', 'celebrity_id', 'product_id'],
    ascending=[False, True, True],
).reset_index(drop=True)

# Competition ranking (1, 2, 3, 3, 5, ...): equal totals share the same rank.
# Taking head(3) instead would silently drop one of two collaborations tied
# for third place, which does happen in this data set.
Q1_ranked_collabs['sales_rank'] = (
    Q1_ranked_collabs['total_sales_volume']
    .rank(method='min', ascending=False)
    .astype(int)
)
print(Q1_ranked_collabs)
print("=" * 150)
print()

# Keep every collaboration ranked 1 to 3; this may be more than three rows
# when there is a tie at the cut-off.
top_3_collabs = Q1_ranked_collabs[Q1_ranked_collabs['sales_rank'] <= 3]
print(top_3_collabs)
print("=" * 150)
print()

# Answer to Question 3: the top 3 celebrity-product collaborations by total
# sales volume in Q1 2025, including any collaborations tied on rank.
print("Question 3 Answer: The top 3 collaborations based on total sales volume in Q1 2025 are:")
print(top_3_collabs)
if len(top_3_collabs) > 3:
    print("Note: collaborations with equal total sales share a rank, so more than 3 rows are listed.")
print("=" * 150)
print()

Made with ❤️ by [Interview Master](https://www.interviewmaster.ai)