In [118]:

import time
import re
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

In [120]:
# Countries to scrape: display name -> two-letter country code.
# The key is stored as the "Country" value of each scraped record; the value is
# substituted into the `country=` query parameter of the top-charts URL.
# The crawl loop iterates over every key present here, so only the active
# (uncommented) entries are scraped.
# NOTE(review): the codes look like ISO 3166-1 alpha-2 codes - confirm against
# the site's expected values before enabling more entries.
country_dict = {
    "United States of America": "US",
    "Australia": "AU",
    "Canada": "CA",
    "China": "CN",
    "France": "FR",
    "Germany": "DE",
    "United Kingdom": "GB",
    "Italy": "IT",
    "Japan": "JP",
    "South Korea": "KR",
    # Disabled entries below: uncomment a line to include that country in the crawl.
    # "Russia": "RU",
    # "Algeria": "DZ",
    # "Angola": "AO",
    # "Argentina": "AR",
    # "Austria": "AT",
    # "Azerbaijan": "AZ",
    # "Bahrain": "BH",
    # "Barbados": "BB",
    # "Belarus": "BY",
    # "Belgium": "BE",
    # "Bermuda": "BM",
    # "Brazil": "BR",
    # "Bulgaria": "BG",
    # "Chile": "CL",
    # "Colombia": "CO",
    # "Costa Rica": "CR",
    # "Croatia": "HR",
    # "Cyprus": "CY",
    # "Czech Republic": "CZ",
    # "Denmark": "DK",
    # "Dominican Republic": "DO",
    # "Ecuador": "EC",
    # "Egypt": "EG",
    # "El Salvador": "SV",
    # "Finland": "FI",
    # "Ghana": "GH",
    # "Greece": "GR",
    # "Guatemala": "GT",
    # "Hong Kong": "HK",
    # "Hungary": "HU",
    # "India": "IN",
    # "Indonesia": "ID",
    # "Ireland": "IE",
    # "Israel": "IL",
    # "Kazakhstan": "KZ",
    # "Kenya": "KE",
    # "Kuwait": "KW",
    # "Lebanon": "LB",
    # "Lithuania": "LT",
    # "Luxembourg": "LU",
    # "Macau": "MO",
    # "Madagascar": "MG",
    # "Malaysia": "MY",
    # "Malta": "MT",
    # "Mexico": "MX",
    # "Netherlands": "NL",
    # "New Zealand": "NZ",
    # "Nigeria": "NG",
    # "Norway": "NO",
    # "Oman": "OM",
    # "Pakistan": "PK",
    # "Panama": "PA",
    # "Peru": "PE",
    # "Philippines": "PH",
    # "Poland": "PL",
    # "Portugal": "PT",
    # "Qatar": "QA",
    # "Romania": "RO",
    # "Saudi Arabia": "SA",
    # "Serbia": "RS",
    # "Singapore": "SG",
    # "Slovakia": "SK",
    # "Slovenia": "SI",
    # "South Africa": "ZA",
    # "Spain": "ES",
    # "Sri Lanka": "LK",
    # "Sweden": "SE",
    # "Switzerland": "CH",
    # "Taiwan": "TW",
    # "Thailand": "TH",
    # "Tunisia": "TN",
    # "Turkey": "TR",
    # "Ukraine": "UA",
    # "United Arab Emirates": "AE",
    # "Uruguay": "UY",
    # "Uzbekistan": "UZ",
    # "Venezuela": "VE",
    # "Vietnam": "VN",
    # "Bolivia": "BO",
    # "Cambodia": "KH",
    # "Estonia": "EE",
    # "Latvia": "LV",
    # "Nicaragua": "NI",
    # "Paraguay": "PY",
    # "Afghanistan": "AF",
    # "Georgia": "GE",
    # "Iraq": "IQ",
    # "Libya": "LY",
    # "Morocco": "MA",
    # "Mozambique": "MZ",
    # "Myanmar": "MM",
    # "Yemen": "YE"
}


# Chart dates to scrape; each value is substituted verbatim into the `date=`
# query parameter and stored as the "Date" value of every record.
dates = ['2024-12-30', '2024-12-31']

# Category display name -> identifier sent in the `category=` query parameter.
categories = {
    'Entertainment': '6016',
    'Education': '6017'
}

# URL prefix; the country/category/date query parameters are appended to it.
base_url = 'https://app.sensortower.com/top-charts?os=ios&device=iphone&'

# Field names of one scraped record, in the order the values are assembled.
columns = ["Rank", "Country", "Name", "Publisher", "Category", "Rating", "Price", "Rating_count", "Date"]

# Subset of `columns` that holds numeric values.
# This list used to contain "Number_of_Ratings", which is not a column name
# (the column is "Rating_count"), so selecting these columns from the scraped
# frame would have raised a KeyError.
numeric_columns = ["Rank", "Rating", "Price", "Rating_count"]
# Original (misspelled) name kept as an alias so existing references still work.
neumeric_columns = numeric_columns


In [121]:
def extract_price_value(price_string):
    """Return the first number found in ``price_string`` as a float.

    Accepts plain integers/decimals ("$0.99", "3") and numbers written with
    comma thousands separators ("$1,299.99", "1,200"); the separators are
    removed before conversion. A comma that is not followed by exactly three
    digits is not treated as a separator, so "0,99" still yields 0.0 exactly
    as it did before thousands-separator support was added.

    Args:
        price_string: Text containing a price, or ``None``.

    Returns:
        The parsed price as a float, or 0.0 when the input is empty/``None``
        or contains no number (e.g. "Free").
    """
    if not price_string:
        # Missing text is treated like "no price found" instead of crashing re.search.
        return 0.0

    # First alternative: digit groups joined by thousands commas, optional decimals.
    # Second alternative: a plain integer or decimal number.
    match = re.search(
        r'\b\d{1,3}(?:,\d{3})+(?:\.\d+)?\b|\b\d+(?:\.\d+)?\b',
        price_string,
    )
    if match is None:
        return 0.0
    return float(match.group().replace(',', ''))

def extract_rating_value(rating_string):
    """Return the first decimal number found in ``rating_string`` as a float.

    Args:
        rating_string: Text expected to contain a rating such as "4.5", or
            ``None`` (which is what Selenium's ``get_attribute`` returns when
            the attribute is absent).

    Returns:
        The first ``digits.digits`` number as a float, or 0.0 when the input
        is empty/``None`` or contains no such number.
    """
    if not rating_string:
        # A missing attribute value is reported as "no rating" rather than
        # raising TypeError inside re.search.
        return 0.0

    # NOTE(review): only numbers with a decimal point are recognised, so a
    # label carrying a whole-number rating (e.g. "5") yields 0.0 - confirm the
    # scraped label always includes a decimal part.
    match = re.search(r"(\d+\.\d+)", rating_string)
    if match is None:
        return 0.0
    return float(match.group(1))

In [122]:
def extract_data_for_country(country, category, date, wait_seconds=1):
    """Scrape one top-charts page and return its rows as a list of records.

    Opens a new Chrome session on the URL built from ``base_url`` and the
    country/category/date parameters, reads every ``<tr>`` of the first
    table's ``<tbody>``, and extracts name, publisher, rating, price and
    rating count from the third ``<td>`` of each row. The browser is always
    closed, even when a lookup or parsing step fails.

    Args:
        country: Key of ``country_dict``; stored as the "Country" value.
        category: Key of ``categories``; stored as the "Category" value.
        date: Date string placed in the ``date=`` query parameter and stored
            as the "Date" value.
        wait_seconds: Seconds to sleep after navigation before reading the
            table. Defaults to 1.

    Returns:
        A list of dicts keyed by ``columns``, one per table row, with "Rank"
        being the 1-based row position.

    Raises:
        KeyError: If ``country`` or ``category`` is not a configured key.
        IndexError, ValueError: If a row's cells or text lines do not have the
            expected layout.
        Selenium exceptions from the element lookups propagate unchanged.
    """
    # Built before launching the browser so an unknown key fails fast.
    url = base_url + f'country={country_dict[country]}&category={categories[category]}&date={date}'
    driver = webdriver.Chrome()
    try:
        driver.maximize_window()
        driver.get(url)
        # Fixed pause to give the page time to render the table.
        time.sleep(wait_seconds)
        rows = (
            driver.find_element(By.TAG_NAME, 'table')
            .find_element(By.TAG_NAME, 'tbody')
            .find_elements(By.TAG_NAME, 'tr')
        )
        records = []
        for idx, row in enumerate(rows):
            cells = row.find_elements(By.TAG_NAME, 'td')
            # Third cell of the row; the app details live in its span > div > div.
            paid_app_data = (
                cells[2]
                .find_element(By.TAG_NAME, 'span')
                .find_element(By.TAG_NAME, 'div')
                .find_element(By.TAG_NAME, 'div')
            )
            # Visible text lines: [0] name, [1] publisher, [2] "(rating count)", [4] price.
            text_lines = paid_app_data.text.split('\n')

            # The rating is read from the aria-label of the span inside the fifth nested div.
            rating_label = (
                paid_app_data.find_elements(By.TAG_NAME, 'div')[4]
                .find_element(By.TAG_NAME, 'span')
                .get_attribute('aria-label')
            )
            values = [
                idx + 1,
                country,
                text_lines[0],
                text_lines[1],
                category,
                extract_rating_value(rating_string=rating_label),
                extract_price_value(text_lines[4]),
                int(text_lines[2].strip("()").replace(",", "")),
                date,
            ]
            records.append(dict(zip(columns, values)))
        return records
    finally:
        # Always release the browser process, including on scraping errors.
        driver.quit()

In [123]:
# Crawl every configured (country, category, date) combination, one page per
# call, and pool the scraped rows into a single flat list of records.
data = []
for country in country_dict:
    for category in categories:
        for date in dates:
            data += extract_data_for_country(country=country, category=category, date=date)
# extract_data_for_country(country='Algeria', category=list(categories.keys())[-1], date=dates[0])

In [124]:
# Build one table from all scraped records (one row per record).
df = pd.DataFrame(data=data)
# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,Rank,Country,Name,Publisher,Category,Rating,Price,Rating_count,Date
0,1,United States of America,Merge Watermelon for watch,活鹏 麦,Entertainment,3.43,0.99,14,2024-12-30
1,2,United States of America,Countdown App,Ryan Boyling,Entertainment,3.05,0.99,35317,2024-12-30
2,3,United States of America,Strange Planet Stickers,"KED, LLC",Entertainment,4.5,1.99,287,2024-12-30
3,4,United States of America,Pocket God,Bolt Creative,Entertainment,4.28,0.99,2634,2024-12-30
4,5,United States of America,Number Puzzle Games 4 Watch,长金 胡,Entertainment,0.0,0.99,0,2024-12-30


In [126]:
# Persist the scraped table; the row index is written as an unnamed first column.
df.to_csv(path_or_buf='1_10.csv', index=True)