In [4]:
import time
import re
import os
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

# Country display name -> code sent as the `country` URL parameter.
country_dict = {
    "Sweden": "SE",
}

# Chart dates to request; each is sent as the `date` URL parameter.
dates = [
    '2024-12-30',
]

# Category display name -> id sent as the `category` URL parameter.
categories = {
    'Education': '6017',
}

# URL prefix; the country, category and date parameters are appended per request.
base_url = 'https://app.sensortower.com/top-charts?os=ios&device=iphone&'

# Key order of every scraped record.
columns = [
    "Rank",
    "Country",
    "Name",
    "Publisher",
    "Category",
    "Rating",
    "Price",
    "Rating_count",
    "Date",
]

In [5]:
def extract_price_value(price_string):
    """Return the first numeric amount found in ``price_string`` as a float.

    Recognises plain integers and decimals ("299", "2.99") as well as amounts
    written with comma thousands separators ("1,299.00"); any surrounding
    currency text is ignored.

    Args:
        price_string: Text that may contain a price.

    Returns:
        The parsed amount as a float, or 0.0 when no number is present
        (for example "Free" or an empty string).
    """
    # The comma-grouped alternative is tried first so "1,299.00" is read as
    # 1299.0 instead of stopping at the first comma; anything that is not a
    # well-formed group falls through to the plain integer/decimal form.
    match = re.search(
        r'\b\d{1,3}(?:,\d{3})+(?:\.\d+)?\b|\b\d+(?:\.\d+)?\b',
        price_string,
    )
    if match is None:
        return 0.0
    # float() does not accept grouping commas, so drop them before converting.
    return float(match.group().replace(',', ''))

def extract_rating_value(rating_string):
    """Return the first decimal number in ``rating_string`` as a float.

    Args:
        rating_string: Text that may contain a rating such as "4.78", or
            ``None`` (Selenium's ``get_attribute`` returns ``None`` when the
            requested attribute is absent).

    Returns:
        The first ``digits.digits`` number found, or 0.0 when the input is
        ``None``, empty, or contains no decimal number.
    """
    # Guard first: re.search raises TypeError on None.
    if not rating_string:
        return 0.0
    match = re.search(r"(\d+\.\d+)", rating_string)
    return float(match.group(1)) if match else 0.0

In [21]:
def extract_data_for_country(country, category, date):
    """Scrape one top-chart page and return one record per usable table row.

    Opens a Chrome session on ``base_url`` plus the country/category/date
    query parameters, reads the first ``<table>`` on the page, and parses the
    third cell of every body row (presumably the paid-apps column -- confirm
    against the page layout).

    Args:
        country: Key of ``country_dict``; also stored as the record's "Country".
        category: Key of ``categories``; also stored as the record's "Category".
        date: Value sent as the ``date`` URL parameter and stored as "Date".

    Returns:
        A list of dicts keyed by ``columns``. "Rank" is the 1-based position
        of the row in the table. Rows with fewer than three cells, or whose
        third cell holds fewer than five lines of text, are skipped.

    Raises:
        KeyError: If ``country`` or ``category`` is not a known key.
    """
    # Build the URL first so an unknown country/category fails before a
    # browser is launched.
    url = base_url + f'country={country_dict[country]}&category={categories[category]}&date={date}'
    driver = webdriver.Chrome()
    # Everything after the launch runs under try/finally so that a failed
    # element lookup or parse cannot leave the browser process running.
    try:
        driver.maximize_window()
        driver.get(url)
        time.sleep(3)  # fixed pause before reading the table; presumably lets the page render
        rows = (
            driver.find_element(By.TAG_NAME, 'table')
            .find_element(By.TAG_NAME, 'tbody')
            .find_elements(By.TAG_NAME, 'tr')
        )
        records = []
        for idx, row in enumerate(rows):
            cells = row.find_elements(By.TAG_NAME, 'td')
            if len(cells) < 3:
                # No third cell to read; skip like other incomplete rows.
                continue
            app_cell = (
                cells[2]
                .find_element(By.TAG_NAME, 'span')
                .find_element(By.TAG_NAME, 'div')
                .find_element(By.TAG_NAME, 'div')
            )
            fields = app_cell.text.split('\n')

            if len(fields) < 5:
                continue

            rating_label = (
                app_cell.find_elements(By.TAG_NAME, 'div')[4]
                .find_element(By.TAG_NAME, 'span')
                .get_attribute('aria-label')
            )
            values = [
                idx + 1,
                country,
                fields[0],
                fields[1],
                category,
                extract_rating_value(rating_string=rating_label),
                extract_price_value(fields[4]),
                # The count is rendered in parentheses with comma grouping.
                int(fields[2].strip("()").replace(",", "")),
                date,
            ]
            records.append(dict(zip(columns, values)))
        return records
    finally:
        driver.quit()

In [22]:
# Scrape the Sweden / Education chart for the last configured date and
# report how many records were parsed.
data = extract_data_for_country("Sweden", "Education", dates[-1])
print(len(data))

[{'Rank': 1, 'Country': 'Sweden', 'Name': 'Ta Körkort', 'Publisher': 'Teoriappar Sverige AB', 'Category': 'Education', 'Rating': 4.78, 'Price': 299.0, 'Rating_count': 28182, 'Date': '2024-12-30'}, {'Rank': 2, 'Country': 'Sweden', 'Name': 'Ta AM-Körkort', 'Publisher': 'Teoriappar Sverige AB', 'Category': 'Education', 'Rating': 4.78, 'Price': 199.0, 'Rating_count': 3108, 'Date': '2024-12-30'}, {'Rank': 3, 'Country': 'Sweden', 'Name': 'Jägarexamen', 'Publisher': 'Boboshi AB', 'Category': 'Education', 'Rating': 4.75, 'Price': 299.0, 'Rating_count': 3974, 'Date': '2024-12-30'}, {'Rank': 4, 'Country': 'Sweden', 'Name': 'Babblarna', 'Publisher': 'Filimundus AB', 'Category': 'Education', 'Rating': 3.14, 'Price': 39.0, 'Rating_count': 51, 'Date': '2024-12-30'}, {'Rank': 5, 'Country': 'Sweden', 'Name': 'iKörkort', 'Publisher': 'Boboshi AB', 'Category': 'Education', 'Rating': 4.77, 'Price': 199.0, 'Rating_count': 3353, 'Date': '2024-12-30'}, {'Rank': 6, 'Country': 'Sweden', 'Name': 'AnkiMobile Fl