# Import Necessary Modules

In [None]:
import json
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def clean_podcast_data(df):
    """Return a cleaned copy of a podcast streaming-history DataFrame.

    The input frame is not modified. Rows are kept only when ``endTime``
    parses as a timestamp and the play time is strictly positive; exact
    duplicate rows are then removed. Original index labels are preserved.

    Args:
        df: DataFrame with an ``endTime`` column (values parseable by
            ``pd.to_datetime``) and a numeric ``msPlayed`` column
            (milliseconds).

    Returns:
        A new DataFrame in which ``endTime`` is converted to datetimes and
        three derived columns are appended: ``date`` (calendar date),
        ``hour`` (hour of day, integer) and ``minutesPlayed``
        (``msPlayed`` / 60000).
    """
    end_time = pd.to_datetime(df['endTime'], errors='coerce')
    minutes_played = df['msPlayed'] / 60000

    # Decide which rows survive *before* deriving any columns: taking
    # ``.dt.hour`` while NaT values are still present yields a float column,
    # which later breaks integer formatting such as ``f"{hour:02d}"``.
    keep = end_time.notna() & (minutes_played > 0)

    # Explicit copy so the caller's frame is never mutated and the column
    # assignments below do not write into a slice of it.
    cleaned = df.loc[keep].copy()
    cleaned['endTime'] = end_time[keep]
    cleaned['date'] = cleaned['endTime'].dt.date
    cleaned['hour'] = cleaned['endTime'].dt.hour
    cleaned['minutesPlayed'] = minutes_played[keep]

    return cleaned.drop_duplicates()

In [None]:
# Read the raw podcast streaming-history export (JSON) into a DataFrame.
podcast_file_path = 'datasets/StreamingHistory_podcast_0.json'

with open(podcast_file_path, mode='r', encoding='utf-8') as file:
    data = json.load(file)
df = pd.DataFrame(data)

# Preview the first rows of the raw data.
df.head(n=7)


In [None]:
# Replace the raw frame with its cleaned version, then preview the result.
df = clean_podcast_data(df)
df.head(n=7)

# General Statistics

In [None]:
# Headline figures for the cleaned history; durations are converted from
# milliseconds to minutes.
total_podcasts = df['podcastName'].nunique()
total_episodes = len(df)  # number of rows in df
total_time = df['msPlayed'].sum() / (1000 * 60)
avg_ep_length = (df['msPlayed'] / 60000).mean()  # mean minutes played per row

print(
    f"Total listening time: {total_time:.0f} minutes ({total_time/60:.1f} hours)",
    f"Number of different podcasts listened to: {total_podcasts}",
    f"Total episodes listened: {total_episodes}",
    f"Average episode length: {avg_ep_length:.1f} minutes",
    sep="\n",
)

In [None]:
# Overall span covered by the data.
start_date = df['date'].min()
end_date = df['date'].max()
date_range = (end_date - start_date).days + 1

# Walk the distinct listening dates in order and count runs of adjacent
# days. The counters track day-to-day *transitions*, so a run of N
# consecutive days is stored as N - 1 (hence the "+1" when printing).
dates = sorted(df['date'].unique())
max_streak = current_streak = 0
for previous_day, day in zip(dates, dates[1:]):
    if (day - previous_day).days != 1:
        # Gap of more than one day: the run is broken.
        current_streak = 0
        continue
    current_streak += 1
    max_streak = max(max_streak, current_streak)

print(f"Podcast Listening Statistics (from {start_date} to {end_date}):")
print(f"Longest streak of consecutive days listening to podcasts: {max_streak+1} days")

# Most Frequently Listened Podcasts

In [None]:
# Total minutes listened per podcast, largest first.
podcast_time_played = (
    df.groupby('podcastName')['msPlayed']
    .sum()
    .sort_values(ascending=False)
    / 60000
)

# Bar chart of the five podcasts with the most listening time.
top_five = podcast_time_played.head(5)
top_five.plot(kind='bar', figsize=(12, 6))
plt.xlabel('Podcast Name')
plt.ylabel('Time Listened (minutes)')
plt.title('Top 5 Most Listened Podcasts')
plt.xticks(rotation=45, ha='right')  # angle the long names so they stay readable
plt.tight_layout()
plt.show()

In [None]:
# podcast_time_played is sorted in descending order, so its first entry is
# the podcast with the most listening time.
print(
    f"Most listened podcast: {podcast_time_played.index[0]}, at {podcast_time_played.iloc[0]:.2f} minutes"
)

# Timeline of Listening

In [None]:
# Index the frame by timestamp so it can be resampled by calendar period.
# The guard keeps this cell re-runnable: once 'endTime' has been moved from
# the columns into the index, calling set_index again would raise KeyError.
if df.index.name != 'endTime':
    df.set_index('endTime', inplace=True)

# Minutes listened per calendar day.
daily_listening = df.resample('D')['msPlayed'].sum() / (60 * 1000)

# Minutes listened per month ('ME' is the month-end frequency alias).
monthly_listening = daily_listening.resample('ME').sum()

In [None]:
# Line chart of total listening minutes per month.
plt.figure(figsize=(12, 6))
monthly_listening.plot(kind='line')
plt.xlabel('Date')
plt.ylabel('Listening Time (minutes)')
plt.title('Podcast Listening Frequency Over Time')
plt.tight_layout()
plt.show()

In [None]:
# Summary figures derived from the monthly totals.
print(
    f"Total listening time: {monthly_listening.sum():.2f} minutes",
    f"Average monthly listening time: {monthly_listening.mean():.2f} minutes",
    f"Month with most listening: {monthly_listening.idxmax().strftime('%Y-%m')}",
    f"Month with least listening: {monthly_listening.idxmin().strftime('%Y-%m')}",
    sep="\n",
)

# What Time of Day Did I Listen the Most? 

In [None]:
# Histogram of listening time by hour of day, weighted by minutes played.
plt.figure(figsize=(12, 6))
# 'hour' holds whole numbers, so use unit-wide bins centred on each hour
# (discrete=True) over the full day (binrange). A plain bins=24 would split
# the *observed* min-max range into 24 equal slices, which do not line up
# with whole hours or with the tick marks below.
sns.histplot(
    data=df,
    x='hour',
    weights='minutesPlayed',
    discrete=True,
    binrange=(0, 23),
    kde=True,
)
plt.title('Podcast Listening Frequency by Time of Day')
plt.xlabel('Hour of Day (24-hour format)')
plt.ylabel('Total Listening Time (minutes)')
plt.xticks(range(0, 24))
plt.tight_layout()
plt.show()

In [None]:
# Total minutes listened in each hour of the day.
hour_listening = (
    df.groupby('hour')['minutesPlayed']
    .sum()
)

# The hour with the largest total, shown as a one-hour window; the modulo
# wraps 23:00 around to 00:00.
peak_hour = hour_listening.index[hour_listening.argmax()]
peak_time = f"{peak_hour:02d}:00 - {(peak_hour + 1) % 24:02d}:00"

print(f'You most often listen to podcasts between {peak_time}.')

In [None]:
# Share of total listening time that falls in each hour of the day.
total_time = hour_listening.sum()
hour_percentage = (hour_listening / total_time) * 100

# Report the three hours with the largest share.
print("\nTop 3 hours for podcast listening:")
for hour, percentage in hour_percentage.nlargest(3).items():
    print(
        f"{hour:02d}:00 - {(hour + 1) % 24:02d}:00: {percentage:.2f}% ({hour_listening[hour]:.0f} minutes)"
    )

# That's Sufficient for Podcasts Analysis