Install dependencies

In [None]:
!pip install pandas

Import requirements

In [None]:
import io
import os
from datetime import date, datetime

import pandas as pd
import requests

Function to log a line to file

In [None]:
def log(line: str) -> None:
    """Append a timestamped line to the log file ``data/note2.txt``.

    The ``data`` directory is created if it does not exist yet, so the very
    first log call cannot fail just because the folder is missing.

    Args:
        line: Message to record. A newline is appended automatically.
    """
    path = "data/note2.txt"
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # Local time with UTC offset, e.g. 2020-11-25T13:37:00+01:00.
    timestamp = datetime.now().astimezone().isoformat(timespec="seconds")
    with open(path, "a", encoding="utf-8") as file:
        file.write("{} {}\n".format(timestamp, line))

Function to download files with timeout

In [None]:
def do_download(url: str, timeout: float = 5) -> "requests.Response | None":
    """Issue a GET request for ``url`` and return the response.

    Args:
        url: Address to download.
        timeout: Value forwarded to ``requests.get`` as ``timeout`` (seconds
            to wait for the server before giving up). Defaults to 5.

    Returns:
        The response object, whatever its HTTP status code (no status check
        is performed here), or ``None`` if ``requests`` raised an error.
    """
    try:
        # Give up if the server sends no data (bytes) for `timeout` seconds.
        return requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # RequestException is the base class of HTTPError, ConnectionError
        # and Timeout, so this single handler covers all of them.
        return None

Function to convert chart data to pandas dataframe if available

In [None]:
def process_file(country: str, date: str) -> pd.DataFrame:
    """Download one daily chart and return it as a DataFrame.

    The chart is fetched with ``do_download`` and parsed as CSV, skipping the
    first row of the file. Three columns are added: ``country``, ``date`` and
    ``track_id`` (the part of each ``URL`` value after its last ``/``).

    Every failure (no response, non-OK status, undecodable or unparsable
    content, missing ``URL`` column, non-string URL values) is written to the
    log via ``log`` and yields an empty DataFrame, so that one bad file never
    aborts a long download run.

    Args:
        country: Region code inserted into the chart URL.
        date: Chart date inserted into the chart URL (the caller passes
            ``YYYY-MM-DD`` strings). Note: this parameter name shadows the
            imported ``datetime.date`` inside this function.

    Returns:
        The chart rows with the extra columns, or an empty DataFrame on error.
    """
    url = "https://spotifycharts.com/regional/{}/daily/{}/download".format(country, date)
    print("Downloading file with region {} with date {}".format(country, date))

    response = do_download(url)
    if response is None:
        log("The server did not send a response for region {} with date {}".format(country, date))
        return pd.DataFrame()  # return empty frame

    if not response.ok:
        log("Unable to download chart for region {} with date {}".format(country, date))
        return pd.DataFrame()  # return empty frame

    try:
        data = response.content.decode("utf8")
        df = pd.read_csv(io.StringIO(data), skiprows=1)
        df["country"] = country
        df["date"] = date
        # Build the whole column at once instead of assigning row by row.
        # A non-string URL (e.g. NaN) still raises AttributeError here.
        df["track_id"] = [track_url.rsplit("/", 1)[-1] for track_url in df["URL"]]
        return df
    except (pd.errors.ParserError, pd.errors.EmptyDataError, UnicodeDecodeError):
        log("Unable to parse file for region {} with date {}".format(country, date))
    except KeyError:
        # The content parsed as CSV but is not a chart (no URL column).
        log("Unable to find URL column for region {} with date {}".format(country, date))
    except AttributeError:
        log("Unable to rsplit url for region {} with date {}".format(country, date))
    return pd.DataFrame()  # return empty frame

Define date range and regions of interest

In [None]:
# First and last chart date to download.
dateFrom = date(year=2017, month=1, day=1)
dateTo = date(year=2020, month=11, day=25)

# Region codes inserted into the chart URL: "global" plus two-letter codes.
regions = [
    "global", "us", "gb", "ad", "ar", "at", "au", "be", "bg", "bo", "br",
    "ca", "ch", "cl", "co", "cr", "cy", "cz", "de", "dk", "do", "ec",
    "ee", "es", "fi", "fr", "gr", "gt", "hk", "hn", "hu", "id", "ie",
    "il", "in", "is", "it", "jp", "lt", "lu", "lv", "mx", "my", "ni",
    "nl", "no", "nz", "pa", "pe", "ph", "pl", "pt", "py", "ro", "ru",
    "se", "sg", "sk", "sv", "th", "tr", "tw", "ua", "uy", "vn", "za",
]


Main loop

In [None]:
# Create the output folder up front so that to_csv cannot fail after all the
# downloads for a region have already been done.
os.makedirs("data", exist_ok=True)

# One entry per calendar day from dateFrom to dateTo.
dateRange = pd.date_range(dateFrom, dateTo)

for region in regions:
    # Collect the daily frames and concatenate once per region; concatenating
    # inside the loop would re-copy the accumulated data on every iteration.
    frames = []

    for single_date in dateRange:
        frame = process_file(region, single_date.strftime("%Y-%m-%d"))
        if not frame.empty:  # failed downloads come back as empty frames
            frames.append(frame)

    # pd.concat raises on an empty list, so fall back to an empty frame.
    if frames:
        bigdata = pd.concat(frames, ignore_index=True, sort=False)
    else:
        bigdata = pd.DataFrame()

    print("Region {} size {}".format(region, bigdata.shape[0]))
    bigdata.to_csv("data/chart_{}.csv".format(region))