# Scraping Data from kworb
Kworb.net provides daily Spotify stream counts and also lets us collect the names of thousands of songs, which we can then use to locate those songs on Spotify or YouTube.


In [1]:
import requests
import pandas as pd
import os
from bs4 import BeautifulSoup

Get all the countries available to scrape

In [2]:
# Send a GET request to the kworb Spotify index page.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops early on an HTTP error instead of parsing an error page.
url = 'https://kworb.net/spotify/'
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Locate the country table with a CSS selector
table = soup.select_one('body > div.container > div.subcontainer > table')
if table is None:
    # Fail with a clear message rather than an AttributeError on None below.
    raise RuntimeError(f'No country table found at {url}; the page layout may have changed.')

# Extract all links to daily totals
daily_totals_links = [
    a['href']
    for a in table.find_all('a', href=True)
    if 'daily_totals' in a['href']
]
print(daily_totals_links)

['country/global_daily_totals.html', 'country/us_daily_totals.html', 'country/gb_daily_totals.html', 'country/ad_daily_totals.html', 'country/ar_daily_totals.html', 'country/au_daily_totals.html', 'country/at_daily_totals.html', 'country/by_daily_totals.html', 'country/be_daily_totals.html', 'country/bo_daily_totals.html', 'country/br_daily_totals.html', 'country/bg_daily_totals.html', 'country/ca_daily_totals.html', 'country/cl_daily_totals.html', 'country/co_daily_totals.html', 'country/cr_daily_totals.html', 'country/cy_daily_totals.html', 'country/cz_daily_totals.html', 'country/dk_daily_totals.html', 'country/do_daily_totals.html', 'country/ec_daily_totals.html', 'country/eg_daily_totals.html', 'country/sv_daily_totals.html', 'country/ee_daily_totals.html', 'country/fi_daily_totals.html', 'country/fr_daily_totals.html', 'country/de_daily_totals.html', 'country/gr_daily_totals.html', 'country/gt_daily_totals.html', 'country/hn_daily_totals.html', 'country/hk_daily_totals.html', 'co

In [5]:
# Create the directory if it doesn't exist
# NOTE(review): scrape_and_save_table in this cell writes its CSV files under the
# Data/Raw tree two levels up, not under this directory; nothing visible in this
# notebook reads or writes ./raw_scrapped_data — confirm it is still needed.
os.makedirs('./raw_scrapped_data', exist_ok=True)

# Function to scrape the table and save it
def scrape_and_save_table(link, country, timeout=30):
    """Download one kworb page, extract its main table and save it as a CSV file.

    Parameters
    ----------
    link : str
        Page path relative to ``https://kworb.net/spotify/``, for example
        ``'country/us_daily_totals.html'``.
    country : str
        Name used to build the output file name. A trailing
        ``'_daily_totals.html'`` is removed, so either the full link or an
        already-trimmed name may be passed. Any ``/`` in the name becomes a
        sub-directory below ``../../Data/Raw``.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    str or None
        Path of the CSV file that was written, or ``None`` when the page could
        not be fetched successfully or contains no (non-empty) table.
    """
    page_suffix = '_daily_totals.html'

    country_url = f'https://kworb.net/spotify/{link}'
    # Without a timeout a stalled connection blocks the whole notebook.
    country_response = requests.get(country_url, timeout=timeout)
    if not country_response.ok:
        # Best effort: an error page has no usable table, so skip it quietly.
        return None

    country_soup = BeautifulSoup(country_response.content, 'html.parser')
    country_table = country_soup.select_one('body > div.container > div.subcontainer > table')
    if country_table is None:
        return None

    # Extract table rows; an empty table has no header row to read.
    rows = country_table.find_all('tr')
    if not rows:
        return None

    # Extract headers from the first row and cell text from the remaining rows
    headers = [header.text.strip() for header in rows[0].find_all('th')]
    data = [
        [col.text.strip() for col in row.find_all('td')]
        for row in rows[1:]
    ]

    # Create a DataFrame
    df = pd.DataFrame(data, columns=headers)

    # Trim the page suffix by name instead of a hard-coded character count.
    if country.endswith(page_suffix):
        name = country[:-len(page_suffix)]
    else:
        name = country

    # Build the path from components so it works on every OS and does not rely
    # on backslash escapes inside a string literal; normpath unifies separators.
    path = os.path.normpath(
        os.path.join('..', '..', 'Data', 'Raw', f'{name}_daily_totals.csv')
    )

    # Save DataFrame to CSV
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_csv(path, index=False)
    return path


Go through the links and save each country's table to a CSV file

In [6]:
# Scrape every daily-totals page in turn and report each one once it returns.
for link in daily_totals_links:
    # Keep whatever follows the last '='; a link without '=' is used unchanged.
    country = link.rpartition('=')[2]
    scrape_and_save_table(link, country)
    # Drop the final 18 characters of the name for the progress message.
    trimmed = country[:-18]
    print(f'{trimmed} table has been saved successfully!')

country/global table has been saved successfully!


KeyboardInterrupt: 