<a href="https://colab.research.google.com/github/VidhiyaSB/MrBeast-Data-Scraping/blob/main/Mr_Beast_Youtube_Channel_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from googleapiclient.discovery import build
import pandas as pd
from datetime import datetime, timedelta
import time

# Credentials and target channel.
# The API key is read from the YOUTUBE_API_KEY environment variable so a real
# key never has to be written into (and committed with) the notebook. The
# original placeholder is kept as the fallback, so replacing it by hand still
# works exactly as before when the variable is not set.
import os

API_KEY = os.environ.get('YOUTUBE_API_KEY', 'your api key')
CHANNEL_ID = 'UCX6OQ3DkcsbYNE6H8uQQuVA'  # MrBeast's channel ID

# Client used by every helper below to issue YouTube Data API v3 requests.
youtube = build('youtube', 'v3', developerKey=API_KEY)

def get_channel_stats(youtube, channel_id):
    """Fetch headline statistics for a single channel.

    Args:
        youtube: API client exposing ``channels().list(...).execute()``.
        channel_id: ID of the channel to look up.

    Returns:
        dict with the keys ``Channel_name``, ``Subscribers``, ``Views``,
        ``Total_videos`` and ``playlist_id`` (the channel's "uploads"
        playlist). Values are passed through exactly as they appear in the
        API response.

    Raises:
        ValueError: if the response contains no channel for ``channel_id``.
    """
    request = youtube.channels().list(
        part='snippet,contentDetails,statistics',
        id=channel_id
    )
    response = request.execute()

    # Fail with a message that names the offending ID instead of a bare
    # KeyError/IndexError when the lookup matched nothing.
    items = response.get('items') or []
    if not items:
        raise ValueError(f"No channel found for id {channel_id!r}")

    channel = items[0]
    statistics = channel['statistics']

    data = dict(
        Channel_name=channel['snippet']['title'],
        Subscribers=statistics['subscriberCount'],
        Views=statistics['viewCount'],
        Total_videos=statistics['videoCount'],
        playlist_id=channel['contentDetails']['relatedPlaylists']['uploads'],
    )
    return data

def get_video_ids(youtube, playlist_id):
    """Return the video IDs of every item in a playlist.

    Pages through ``playlistItems().list`` 50 items at a time, following
    ``nextPageToken`` until the response no longer carries one.

    Args:
        youtube: API client exposing ``playlistItems().list(...).execute()``.
        playlist_id: ID of the playlist to walk.

    Returns:
        list of video IDs in the order the API returned them.
    """
    collected = []
    page_token = None

    while True:
        params = dict(
            part='contentDetails',
            playlistId=playlist_id,
            maxResults=50,
        )
        # The first request carries no pageToken at all; later ones do.
        if page_token is not None:
            params['pageToken'] = page_token

        page = youtube.playlistItems().list(**params).execute()
        collected.extend(
            entry['contentDetails']['videoId'] for entry in page['items']
        )

        page_token = page.get('nextPageToken')
        if page_token is None:
            break

    return collected

def get_video_details(youtube, video_ids):
    """Fetch title, publish date and engagement counts for each video.

    The IDs are requested in batches of 50 via ``videos().list``.

    Args:
        youtube: API client exposing ``videos().list(...).execute()``.
        video_ids: list of video IDs to look up.

    Returns:
        list of dicts with the keys ``Title``, ``Published_date``, ``Views``,
        ``Likes`` and ``Comments``. Any count missing from a video's
        statistics is reported as 0.
    """
    all_video_stats = []

    for start in range(0, len(video_ids), 50):
        batch = video_ids[start:start + 50]
        request = youtube.videos().list(
            part='snippet,statistics',
            id=','.join(batch)
        )
        response = request.execute()

        for video in response.get('items', []):
            snippet = video['snippet']
            statistics = video.get('statistics', {})
            # All three counts are looked up leniently: previously only
            # viewCount was a hard lookup, so one video without it aborted
            # the whole collection run with a KeyError.
            video_stats = dict(
                Title=snippet['title'],
                Published_date=snippet['publishedAt'],
                Views=statistics.get('viewCount', 0),
                Likes=statistics.get('likeCount', 0),
                Comments=statistics.get('commentCount', 0),
            )
            all_video_stats.append(video_stats)

    return all_video_stats

# Main execution: channel -> uploads playlist -> video IDs -> per-video stats
channel_stats = get_channel_stats(youtube, CHANNEL_ID)
playlist_id = channel_stats['playlist_id']
video_ids = get_video_ids(youtube, playlist_id)
video_details = get_video_details(youtube, video_ids)

# Build the DataFrame, give each column a proper dtype, then save as CSV
videos_df = pd.DataFrame(video_details)
videos_df['Published_date'] = pd.to_datetime(videos_df['Published_date'])
for numeric_column in ('Views', 'Likes', 'Comments'):
    videos_df[numeric_column] = pd.to_numeric(videos_df[numeric_column])

videos_df.to_csv('mrbeast_videos.csv', index=False)
print(f"Data collection complete. {len(videos_df)} videos data saved to mrbeast_videos.csv")

Data collection complete. 818 videos data saved to mrbeast_videos.csv


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timezone

# Load the data
df = pd.read_csv('mrbeast_videos.csv')

# Handle missing values.
# Assign the filled frame back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated in pandas 2.x and
# does not update the parent frame once Copy-on-Write is enabled.
df = df.fillna({'Likes': 0, 'Comments': 0})

# Convert data types
df['Published_date'] = pd.to_datetime(df['Published_date'], utc=True)
df['Views'] = pd.to_numeric(df['Views'])
df['Likes'] = pd.to_numeric(df['Likes'])
df['Comments'] = pd.to_numeric(df['Comments'])

# Handle outliers (using IQR method for Views): values outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are clipped to the nearest bound, not dropped.
Q1 = df['Views'].quantile(0.25)
Q3 = df['Views'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df['Views'] = df['Views'].clip(lower_bound, upper_bound)

# Create derived features
# NOTE: Views has already been clipped above, so this rate is computed from
# the clipped view counts.
df['Engagement_Rate'] = (df['Likes'] + df['Comments']) / df['Views'] * 100

# Use timezone-aware datetime for calculation
current_time = datetime.now(timezone.utc)
df['Days_Since_Published'] = (current_time - df['Published_date']).dt.total_seconds() / (24 * 60 * 60)

# Periods carry no timezone; dropping it explicitly (the column is UTC) gives
# the same months without the "will drop timezone information" UserWarning.
df['Month'] = df['Published_date'].dt.tz_localize(None).dt.to_period('M')
df['Day_of_Week'] = df['Published_date'].dt.day_name()

# Calculate daily view average
df['Daily_Views'] = df['Views'] / df['Days_Since_Published']

print(df.head())
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
df.info()

# Save the cleaned data
df.to_csv('mrbeast_videos_cleaned.csv', index=False)

                                              Title            Published_date  \
0                  Spot The Difference, Win $10,000 2024-09-28 20:00:00+00:00   
1            100 Identical Twins Fight For $250,000 2024-09-28 16:00:00+00:00   
2           Running With Bigger And Bigger Lunchlys 2024-09-20 16:00:00+00:00   
3                    Holding Bigger And Bigger Dogs 2024-09-16 18:00:00+00:00   
4  Men Vs Women Survive The Wilderness For $500,000 2024-09-07 16:00:00+00:00   

         Views    Likes  Comments  Engagement_Rate  Days_Since_Published  \
0   11133780.0   787781      1369         7.087889              0.744579   
1   36758597.0  1562997     11432         4.283159              0.911246   
2  108667534.0  4001712      6698         3.688691              8.911246   
3   55172243.0  3142209      4774         5.703924             12.827913   
4  108936306.0  3438183     49326         3.201420             21.911246   

     Month Day_of_Week   Daily_Views  
0  2024-09    Sat

  df['Month'] = df['Published_date'].dt.to_period('M')


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def _finish_figure(title, filename, xlabel=None, ylabel=None):
    """Title and label the current pyplot figure, save it to ``filename``, then close it."""
    plt.title(title)
    if xlabel is not None:
        plt.xlabel(xlabel)
    if ylabel is not None:
        plt.ylabel(ylabel)
    plt.savefig(filename)
    plt.close()


# Load the cleaned data
df = pd.read_csv('mrbeast_videos_cleaned.csv')
df['Published_date'] = pd.to_datetime(df['Published_date'])

# Basic statistical analysis
print(df.describe())

# Create visualizations (each one is written to its own PNG file)

# 1. View count distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['Views'], kde=True)
_finish_figure('Distribution of Video Views', 'view_distribution.png',
               xlabel='Views', ylabel='Frequency')

# 2. Engagement rate over time
plt.figure(figsize=(12, 6))
plt.scatter(df['Published_date'], df['Engagement_Rate'])
_finish_figure('Engagement Rate Over Time', 'engagement_over_time.png',
               xlabel='Published Date', ylabel='Engagement Rate (%)')

# 3. Correlation heatmap (no axis labels are set for this one)
plt.figure(figsize=(10, 8))
heatmap_columns = ['Views', 'Likes', 'Comments', 'Engagement_Rate', 'Days_Since_Published']
correlation_matrix = df[heatmap_columns].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
_finish_figure('Correlation Heatmap', 'correlation_heatmap.png')

# 4. Average views by day of week, bars ordered from lowest to highest mean
plt.figure(figsize=(10, 6))
mean_views_by_day = df.groupby('Day_of_Week')['Views'].mean()
mean_views_by_day.sort_values().plot(kind='bar')
_finish_figure('Average Views by Day of Week', 'views_by_day.png',
               xlabel='Day of Week', ylabel='Average Views')

# 5. View count vs. likes scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(df['Views'], df['Likes'])
_finish_figure('Views vs. Likes', 'views_vs_likes.png',
               xlabel='Views', ylabel='Likes')

print("EDA complete. Visualizations saved as PNG files.")

              Views         Likes       Comments  Engagement_Rate  \
count  8.180000e+02  8.180000e+02     818.000000       818.000000   
mean   6.345580e+07  2.267952e+06   38667.540342         3.543297   
std    9.605036e+07  4.959554e+06   70573.400869         1.938237   
min    5.163000e+04  0.000000e+00       0.000000         0.175455   
25%    9.971925e+04  2.851750e+03     423.250000         2.394729   
50%    1.024188e+06  3.456100e+04    3067.000000         3.168308   
75%    1.173619e+08  2.713259e+06   55490.500000         4.213561   
max    2.932551e+08  5.202004e+07  781244.000000        27.365657   

       Days_Since_Published   Daily_Views  
count            818.000000  8.180000e+02  
mean            2607.823988  2.503865e+05  
std             1187.805847  1.666443e+06  
min                0.744579  1.349819e+01  
25%             1865.994519  2.747771e+01  
50%             3078.116645  2.945704e+02  
75%             3417.292522  6.885095e+04  
max             4604.63170