# Day 9: Instagram Stories Daily User Creation Patterns

You are a Product Analyst on the Instagram Stories team investigating story creation patterns. The team wants to understand the distribution of stories created by users daily. You will analyze user storytelling behavior to optimize engagement strategies.

In [None]:
import pandas as pd
import numpy as np

# Raw story-creation records, one dict per (user, day) observation.
# The data is intentionally messy: it contains exact duplicate rows, mixed date
# layouts ("07/15/2024", "2024/10/10"), an unparseable date, inconsistent
# user_id casing/whitespace, and missing values in every column.
# Missing values are spelled `None` (JSON `null` is not a Python name and
# raises NameError when the cell is executed).
stories_data_data = [
    {"user_id": "user_001", "story_date": "2024-07-03", "story_count": 3},
    {"user_id": "user_001", "story_date": "2024-07-03", "story_count": 3},
    {"user_id": "user_001", "story_date": "2024-08-15", "story_count": 5},
    {"user_id": "user_001", "story_date": "2024-09-10", "story_count": 0},
    {"user_id": "user_001", "story_date": "2024-10-05", "story_count": 20},
    {"user_id": "user_001", "story_date": "07/15/2024", "story_count": 2},
    {"user_id": "user_002", "story_date": "2024-07-03", "story_count": 4},
    {"user_id": " user_002", "story_date": "2024-07-04", "story_count": 3},
    {"user_id": "user_002", "story_date": None, "story_count": 6},
    {"user_id": "user_002", "story_date": "2024-12-25", "story_count": 1},
    {"user_id": "user_002", "story_date": "2025-01-15", "story_count": 7},
    {"user_id": "user_002", "story_date": "2025-06-29", "story_count": 10},
    {"user_id": "user_003", "story_date": "2024-07-10", "story_count": 2},
    {"user_id": "user_003", "story_date": "2024-08-20", "story_count": 8},
    {"user_id": "user_003", "story_date": "2024-08-20", "story_count": 8},
    {"user_id": "user_003", "story_date": "2025-03-11", "story_count": 5},
    {"user_id": None, "story_date": "2025-03-12", "story_count": 3},
    {"user_id": "USER_003", "story_date": "2025-04-01", "story_count": 4},
    {"user_id": "user_004", "story_date": "2024-07-15", "story_count": 6},
    {"user_id": "user_004", "story_date": "2024-09-30", "story_count": 7},
    {"user_id": "user_004", "story_date": "2024/10/10", "story_count": 4},
    {"user_id": "user_004", "story_date": "2024-11-11", "story_count": 3},
    {"user_id": "user_004", "story_date": "2025-02-28", "story_count": 12},
    {"user_id": "user_004", "story_date": "2025-03-01", "story_count": 0},
    {"user_id": "user_005", "story_date": "2024-08-01", "story_count": 1},
    {"user_id": "user_005", "story_date": "2024-08-02", "story_count": 2},
    {"user_id": "user_005", "story_date": "2024-08-03", "story_count": 3},
    {"user_id": "user_005", "story_date": "2024-08-04", "story_count": 4},
    {"user_id": "user_005", "story_date": "2024-08-05", "story_count": None},
    {"user_id": "user_005", "story_date": "2024-08-06", "story_count": 5},
    {"user_id": "user_006", "story_date": "2024-09-01", "story_count": 9},
    {"user_id": "user_006", "story_date": "2024-09-02", "story_count": 10},
    {"user_id": "user_006", "story_date": "2024-09-03", "story_count": 9},
    {"user_id": "user_006", "story_date": "2024-09-04", "story_count": 50},
    {"user_id": "user_006", "story_date": "2024-09-05", "story_count": 8},
    {"user_id": "user_006", "story_date": None, "story_count": 7},
    {"user_id": "user_007", "story_date": "2024-10-10", "story_count": 4},
    {"user_id": "user_007", "story_date": "2024-10-11", "story_count": 4},
    {"user_id": "user_007", "story_date": "2024-10-12", "story_count": 4},
    {"user_id": "user_007", "story_date": "2024-10-13", "story_count": 3},
    {"user_id": "user_007", "story_date": "2024-10-14", "story_count": 2},
    {"user_id": "user_007", "story_date": "2024-10-15", "story_count": 1},
    {"user_id": "user_008", "story_date": "2025-01-01", "story_count": 11},
    {"user_id": "user_008", "story_date": "2025-01-02", "story_count": 12},
    {"user_id": "user_008", "story_date": "2025-01-03", "story_count": 13},
    {"user_id": "user_008", "story_date": "2025-01-04", "story_count": 14},
    {"user_id": "user_008", "story_date": "2025-01-05", "story_count": 15},
    {"user_id": "user_008", "story_date": "2025-01-06", "story_count": 0},
    {"user_id": "user_009", "story_date": "2024-12-01", "story_count": 1},
    {"user_id": "user_009", "story_date": "2024-12-02", "story_count": 2},
    {"user_id": "user_009", "story_date": "2024-12-03", "story_count": 3},
    {"user_id": "user_009", "story_date": "2024-12-04", "story_count": 4},
    {"user_id": "user_009", "story_date": "2024-12-05", "story_count": 5},
    {"user_id": "user_009", "story_date": "invalid_date", "story_count": 6},
    {"user_id": "user_010", "story_date": "2025-03-15", "story_count": 7},
    {"user_id": "user_010", "story_date": "2025-03-16", "story_count": 8},
    {"user_id": "user_010", "story_date": "2025-03-17", "story_count": 9},
    {"user_id": "user_010", "story_date": "2025-03-18", "story_count": 10},
    {"user_id": "user_010", "story_date": "2025-03-19", "story_count": 11},
    {"user_id": "user_010", "story_date": "2025-03-20", "story_count": 12},
]
# Columns: user_id (str), story_date (str, not yet parsed), story_count (float
# once loaded, because the missing value becomes NaN).
stories_data = pd.DataFrame(stories_data_data)


## Question 1

Take a look at the data in the story_date column. Correct any data type inconsistencies in that column.

In [None]:
# Note: pandas and numpy are already imported as pd and np
# The following tables are loaded as pandas DataFrames with the same names: stories_data
# Please print your final result or dataframe

################################################################################
print()
print("=" * 150)
print("=" * 150)
print()
################################################################################
# Question 1 of 3
# Take a look at the data in the `story_date` column. Correct any data type inconsistencies in that column.

# Work on a copy so the raw `stories_data` table stays untouched, then display it
stories_df = stories_data.copy()
print(stories_df)
print("=" * 150)
print()

# There are 60 rows in total and all three columns have missing values.
# story_date is stored as text in several layouts ('2024-07-03', '07/15/2024',
# '2024/10/10') plus an unparseable value ('invalid_date'). A single strict
# format='%Y-%m-%d' raises ValueError on the first string that does not match,
# so try each known layout in turn; anything matching none of them becomes NaT.
raw_story_dates = stories_df['story_date']
date_formats = ('%Y-%m-%d', '%m/%d/%Y', '%Y/%m/%d')
parsed_story_dates = pd.to_datetime(raw_story_dates, format=date_formats[0], errors='coerce')
for date_format in date_formats[1:]:
    # Only fill positions that earlier layouts could not parse
    parsed_story_dates = parsed_story_dates.fillna(
        pd.to_datetime(raw_story_dates, format=date_format, errors='coerce')
    )
stories_df['story_date'] = parsed_story_dates
# info() prints its report itself and returns None, so it is not wrapped in print()
stories_df.info()
print("=" * 150)
print()

# Answer to Question 1: story_date is now datetime64; the values that are still
# missing are the original nulls plus the unparseable 'invalid_date' entry.
sd_missing_values = stories_df["story_date"].isnull().sum()
print('The number of missing values on "story_date" column of the data set is:', sd_missing_values)

## Question 2

Calculate the 25th, 50th, and 75th percentiles of the number of stories created per user per day.

In [None]:
# Note: pandas and numpy are already imported as pd and np
# The following tables are loaded as pandas DataFrames with the same names: stories_data
# Please print your final result or dataframe

################################################################################
print()
print("=" * 150)
print("=" * 150)
print()
################################################################################
# Question 1 of 3
# Take a look at the data in the `story_date` column. Correct any data type inconsistencies in that column.

# Work on a copy so the raw `stories_data` table stays untouched, then display it
stories_df = stories_data.copy()
print(stories_df)
print("=" * 150)
print()

# There are 60 rows in total and all three columns have missing values.
# story_date is stored as text in several layouts ('2024-07-03', '07/15/2024',
# '2024/10/10') plus an unparseable value ('invalid_date'). A single strict
# format='%Y-%m-%d' raises ValueError on the first string that does not match,
# so try each known layout in turn; anything matching none of them becomes NaT.
raw_story_dates = stories_df['story_date']
date_formats = ('%Y-%m-%d', '%m/%d/%Y', '%Y/%m/%d')
parsed_story_dates = pd.to_datetime(raw_story_dates, format=date_formats[0], errors='coerce')
for date_format in date_formats[1:]:
    # Only fill positions that earlier layouts could not parse
    parsed_story_dates = parsed_story_dates.fillna(
        pd.to_datetime(raw_story_dates, format=date_format, errors='coerce')
    )
stories_df['story_date'] = parsed_story_dates
# info() prints its report itself and returns None, so it is not wrapped in print()
stories_df.info()
print("=" * 150)
print()

# Answer to Question 1: story_date is now datetime64; the values that are still
# missing are the original nulls plus the unparseable 'invalid_date' entry.
sd_missing_values = stories_df["story_date"].isnull().sum()
print('The number of missing values on "story_date" column of the data set is:', sd_missing_values)

################################################################################
print()
print("=" * 150)
print("=" * 150)
print()
################################################################################
# Question 2 of 3
# Calculate the 25th, 50th, and 75th percentiles of the number of stories created per user per day.

# Printing the dataframe to see the data
print(stories_df)

# Normalize user_id so ' user_002' and 'USER_003' map onto their canonical ids
stories_df['user_id'] = stories_df['user_id'].str.strip().str.lower()
print(stories_df['user_id'].unique())

# Keep rows we can measure on (drop missing user_id or date for this metric)
clean = stories_df.dropna(subset=['user_id', 'story_date']).copy()

# Make sure story_count is numeric (and treat NaN as 0 stories)
clean['story_count'] = pd.to_numeric(clean['story_count'], errors='coerce').fillna(0)

# The table already holds one story_count per user per day, so byte-for-byte
# repeated rows are duplicate records, not extra activity. Drop them before
# summing, otherwise e.g. user_003 on 2024-08-20 is counted as 16 instead of 8.
clean = clean.drop_duplicates(subset=['user_id', 'story_date', 'story_count'])
clean.info()
print(clean)

# Aggregate to one row per (user_id, story_date) with that day's total story count
stories_per_user_per_day = (
    clean
    .groupby(['user_id', 'story_date'])
    .agg(total_story_count=('story_count', 'sum'))
    .reset_index()
    .sort_values(by=['user_id', 'story_date'], ascending=[True, True])
)
print(stories_per_user_per_day)

# Now we can calculate the 25th, 50th, and 75th percentiles of the number of stories created per user per day
percentiles = stories_per_user_per_day['total_story_count'].quantile([0.25, 0.5, 0.75])
percentiles.index = ['25th', '50th', '75th']
print("\nThe 25th, 50th, and 75th percentiles of the number of stories created per user per day are:")
print(percentiles)

# Supplementary view: the same three percentiles computed separately for each user
per_user_percentiles = (
    stories_per_user_per_day
    .groupby('user_id')['total_story_count']
    .quantile([0.25, 0.5, 0.75])
    .unstack()              # columns: 0.25, 0.5, 0.75
    .rename(columns={0.25:'p25', 0.5:'p50', 0.75:'p75'})
    .reset_index()
)
# Printed explicitly so the result also shows up when the cell is run as a script
print(per_user_percentiles.head())

## Question 3

What percentage of users have had at least one day on which they posted more than 10 stories?

In [None]:
# Note: pandas and numpy are already imported as pd and np
# The following tables are loaded as pandas DataFrames with the same names: stories_data
# Please print your final result or dataframe

################################################################################
print()
print("=" * 150)
print("=" * 150)
print()
################################################################################
# Question 1 of 3
# Take a look at the data in the `story_date` column. Correct any data type inconsistencies in that column.

# Work on a copy so the raw `stories_data` table stays untouched, then display it
stories_df = stories_data.copy()
print(stories_df)
print("=" * 150)
print()

# There are 60 rows in total and all three columns have missing values.
# story_date is stored as text in several layouts ('2024-07-03', '07/15/2024',
# '2024/10/10') plus an unparseable value ('invalid_date'). A single strict
# format='%Y-%m-%d' raises ValueError on the first string that does not match,
# so try each known layout in turn; anything matching none of them becomes NaT.
raw_story_dates = stories_df['story_date']
date_formats = ('%Y-%m-%d', '%m/%d/%Y', '%Y/%m/%d')
parsed_story_dates = pd.to_datetime(raw_story_dates, format=date_formats[0], errors='coerce')
for date_format in date_formats[1:]:
    # Only fill positions that earlier layouts could not parse
    parsed_story_dates = parsed_story_dates.fillna(
        pd.to_datetime(raw_story_dates, format=date_format, errors='coerce')
    )
stories_df['story_date'] = parsed_story_dates
# info() prints its report itself and returns None, so it is not wrapped in print()
stories_df.info()
print("=" * 150)
print()

# Answer to Question 1: story_date is now datetime64; the values that are still
# missing are the original nulls plus the unparseable 'invalid_date' entry.
sd_missing_values = stories_df["story_date"].isnull().sum()
print('The number of missing values on "story_date" column of the data set is:', sd_missing_values)

################################################################################
print()
print("=" * 150)
print("=" * 150)
print()
################################################################################
# Question 2 of 3
# Calculate the 25th, 50th, and 75th percentiles of the number of stories created per user per day.

# Printing the dataframe to see the data
print(stories_df)
print("=" * 150)
print()

# Normalize user_id so ' user_002' and 'USER_003' map onto their canonical ids
stories_df['user_id'] = stories_df['user_id'].str.strip().str.lower()
print(stories_df['user_id'].unique())
print("=" * 150)
print()

# Keep rows we can measure on (drop missing user_id or date for this metric)
clean = stories_df.dropna(subset=['user_id', 'story_date']).copy()

# Make sure story_count is numeric (and treat NaN as 0 stories)
clean['story_count'] = pd.to_numeric(clean['story_count'], errors='coerce').fillna(0)

# The table already holds one story_count per user per day, so byte-for-byte
# repeated rows are duplicate records, not extra activity. Drop them before
# summing, otherwise e.g. user_003 on 2024-08-20 is counted as 16 instead of 8
# and would wrongly cross the "more than 10 stories" threshold in Question 3.
clean = clean.drop_duplicates(subset=['user_id', 'story_date', 'story_count'])
clean.info()
print()
print(clean)
print("=" * 150)
print()

# Aggregate to one row per (user_id, story_date) with that day's total story count
stories_per_user_per_day = (
    clean
    .groupby(['user_id', 'story_date'])
    .agg(total_story_count=('story_count', 'sum'))
    .reset_index()
    .sort_values(by=['user_id', 'story_date'], ascending=[True, True])
)
print(stories_per_user_per_day)
print("=" * 150)
print()

# Now we can calculate the 25th, 50th, and 75th percentiles of the number of stories created per user per day
percentiles = stories_per_user_per_day['total_story_count'].quantile([0.25, 0.5, 0.75])
percentiles.index = ['25th', '50th', '75th']
print("\nThe 25th, 50th, and 75th percentiles of the number of stories created per user per day are:")
print(percentiles)
print("=" * 150)
print()

# Supplementary view: the same three percentiles computed separately for each user
per_user_percentiles = (
    stories_per_user_per_day
    .groupby('user_id')['total_story_count']
    .quantile([0.25, 0.5, 0.75])
    .unstack()              # columns: 0.25, 0.5, 0.75
    .rename(columns={0.25:'p25', 0.5:'p50', 0.75:'p75'})
    .reset_index()
)
print(per_user_percentiles.head())
print("=" * 150)
print()

################################################################################
print()
print("=" * 150)
print("=" * 150)
print()
################################################################################
# Question 3 of 3
# What percentage of users have had at least one day on which they posted more than 10 stories?

# Display the dataframe to see the data again
print(stories_per_user_per_day)
print("=" * 150)
print()

# Filter the per-user-per-day totals to days with more than 10 stories, then
# count the distinct users that appear at least once in that subset
heavy_days = stories_per_user_per_day[stories_per_user_per_day['total_story_count'] > 10]
users_with_more_than_10_stories = heavy_days['user_id'].nunique()
print('The number of users who have had at least one day where they posted more than 10 stories on that day is:', users_with_more_than_10_stories)
print()

# Share of all measurable users (those with at least one valid user-day row)
total_users = stories_per_user_per_day['user_id'].nunique()
# Guard against an empty table so the cell reports 0% instead of dividing by zero
percentage = (users_with_more_than_10_stories / total_users) * 100 if total_users else 0.0
print(f"\nThe percentage of users who have had at least one day where they posted more than 10 stories on that day is: {percentage:.2f}%")
print("=" * 150)
print()

Made with ❤️ by [Interview Master](https://www.interviewmaster.ai)