In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import re
import html


In [13]:
# Step 1: Send a GET request to the Statista "most viewed YouTube videos" page
# URL of the Statista statistic page whose chart data is scraped below
url = "https://www.statista.com/statistics/249396/top-youtube-videos-views/"
# Browser-like User-Agent header, sent to reduce the chance of the site blocking the request
headers = {"User-Agent": "Mozilla/5.0"}
# requests applies no timeout by default, so an unresponsive server would
# hang this cell forever; bound the wait to 30 seconds.
response = requests.get(url, headers=headers, timeout=30)
# Last expression of the cell: displays the response repr (e.g. <Response [200]>)
response

<Response [200]>

In [14]:
# Report whether the server answered with HTTP 200 (OK)
fetched_ok = response.status_code == 200
if not fetched_ok:
    print(f"Failed to retrieve the page. Status code:{response.status_code}")
else:
    print("Successfully fetched the page!")

Successfully fetched the page!


In [15]:
# Parse the HTML content of the page using Beautiful Soup
soup = BeautifulSoup(response.content, 'html.parser')
# Print the first 2000 characters of the parsed HTML to verify the parse;
# prettify() formats the parsed markup into a more readable, indented structure
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html lang="en" prefix="og: http://ogp.me/ns#">
 <head>
  <meta content="same-origin" name="view-transition"/>
  <meta charset="utf-8"/>
  <link crossorigin="" href="https://cdn.statcdn.com" rel="preconnect"/>
  <title>
   Most viewed YouTube videos worldwide 2025| Statista
  </title>
  <meta content="What is the most viewed video on YouTube? &amp;quot;Baby Shark Dance,&amp;quot; published by Pinkfong Kids' Songs &amp; Stories, has over 13 billion views" name="description"/>
  <meta data-page="statistic" id="gtm_routeName"/>
  <meta data-page="false" id="gtm_automatedTest"/>
  <meta data-page="anonymous" id="gtm_userProductGroup"/>
  <meta data-page="31" id="gtm_accountTypeId"/>
  <meta data-page="en" id="gtm_locale"/>
  <meta data-page="statistic" id="gtm_pageType"/>
  <meta data-page="content" id="gtm_userPhase"/>
  <meta data-page="0" id="gtm_userId"/>
  <meta data-page="" id="gtm_userCancelledStatus"/>
  <meta data-page="31" id="gtm_userProductId"/>
  <meta data-pag

In [16]:
# Step 2: Get the <div> that carries the chart configuration in its
# data-chart-vars attribute (located via data-chart-vars-name="options")
chart_div = soup.find("div", {"data-chart-vars-name": "options"})
if chart_div is None or not chart_div.has_attr("data-chart-vars"):
    # find() returns None when nothing matches; fail with a clear message
    # instead of an opaque "'NoneType' object is not subscriptable".
    raise ValueError(
        "Could not find a <div data-chart-vars-name=\"options\"> with a "
        "data-chart-vars attribute; the page layout may have changed or a "
        "different page was served."
    )
raw_json = chart_div['data-chart-vars']

# Step 3: Unescape HTML entities (e.g. &quot; → ")
cleaned_json = html.unescape(raw_json)

# Step 4: Parse JSON
data = json.loads(cleaned_json)

# Step 5: Extract data
# The x-axis categories are used as the video titles and the "y" value of
# each point in the first series as the matching view count.
titles = data["xAxis"]["categories"]
views = [point["y"] for point in data["series"][0]["data"]]

# Every title needs exactly one view count; catch a mismatch here, where the
# cause is obvious, rather than later when the table is built.
if len(titles) != len(views):
    raise ValueError(
        f"Chart data is inconsistent: {len(titles)} titles but {len(views)} view counts."
    )



In [17]:
# Step 6: Assemble the scraped titles and view counts into a two-column table
column_data = {"Video Title": titles, "Views (Billions)": views}
df = pd.DataFrame(column_data)

# Step 7: Show the resulting table as plain text
print(df)

                                         Video Title  Views (Billions)
0  "Baby Shark Dance" - Pinkfong Kids' Songs & St...             15.47
1     "Despacito"- Luis Fonsi featuring Daddy Yankee              8.63
2               "Johny Johny Yes Papa" - LooLoo Kids              6.99
3           "Bath Song" - Cocomelon - Nursery Rhymes              6.96
4   "Wheels on the Bus" - Cocomelon – Nursery Rhymes              6.95
5  See You Again" - Wiz Khalifa featuring Charlie...              6.52
6                       "Shape of You" - Ed Sheeran"              6.39
7          "Phonics Song with Two Words" - ChuChu TV              6.21
8   "Uptown Funk" - Mark Ronson featuring Bruno Mars              5.44
9                               "Gangnam Style" -PSY              5.41


In [18]:
# Step 8: Export the table to a CSV file
csv_path = 'most_viewed_YouTube_videos.csv'
# index=False keeps the DataFrame's row index out of the written file
df.to_csv(csv_path, index=False)
# Tell the user where the data was written
print("Data saved to 'most_viewed_YouTube_videos.csv'")

Data saved to 'most_viewed_YouTube_videos.csv'
