# Data Collection
- Download reviews posted since January 1, 2020.
- Sleep between calls to minimize request failures.

In [3]:
from datetime import datetime
from app_store_scraper import AppStore
import re, pandas as pd, os

In [25]:
# Apps to scrape, grouped by domain.
# Each entry is "<app name>:id<numeric id>"; the loop that consumes this
# mapping splits on ":" and strips the "id" prefix to get the integer id.
domains_apps = {
    "investing": [
        "acorn:id883324671",
        "robinhood:id938003185",
    ],
    "mentalhealth": [
        "calm:id571800810",
        "headspace:id493145008",
    ],
    "dating": [
        "tinder:id547702041",
        "bumble:id930441707",
    ],
    "ridehailing": [
        "lyft:id529379082",
        "uber:id368677368",
    ],
}

In [31]:
# Cut-off passed as `after=` when requesting reviews: midnight, 1 January 2020.
# NOTE: the name is misspelled ("REVIWES") but is kept because other cells use it.
REVIWES_AFTER_DT = datetime(year=2020, month=1, day=1)

def fetch_reviews(app_name, app_id, app_domain):
    """Download reviews for one app, save them to CSV and return them.

    Reviews are requested from the "us" store with ``after=REVIWES_AFTER_DT``.
    Each review is tagged with ``Domain``, ``Name`` and ``AppId`` fields and
    the result is written to ``./data/<app_domain>/<app_name>.csv``.

    Args:
        app_name: App name passed to ``AppStore``; also used as the CSV
            file name.
        app_id: App id passed to ``AppStore``.
        app_domain: Domain label stored on every review and used as the
            output sub-directory name.

    Returns:
        A ``pandas.DataFrame`` with the saved reviews, or ``None`` when no
        reviews were found or an error occurred (errors are printed, not
        raised, so one failing app does not stop the remaining downloads).
    """
    try:
        app_info = AppStore(country="us", app_name=app_name, app_id=app_id)
        # sleep=2: pause between calls to minimize request failures.
        app_info.review(after=REVIWES_AFTER_DT, sleep=2)
        app_reviews = app_info.reviews

        if not app_reviews:
            print("Found no reviews for app: ", app_name)
            return None

        # Tag every review so files from different apps can be combined later.
        all_reviews = []
        for review in app_reviews:
            review["Domain"] = app_domain
            review["Name"] = app_name
            review["AppId"] = app_id
            all_reviews.append(review)

        all_reviews_df = pd.DataFrame(all_reviews)

        output_dir = f"./data/{app_domain}"
        os.makedirs(output_dir, exist_ok=True)

        output_file = f"{output_dir}/{app_name}.csv"
        all_reviews_df.to_csv(output_file, index=False, header=True)
        print("saved to ", output_file)

        # Return the frame that was just built; the previous `return df`
        # referred to a name that is not defined inside this function.
        return all_reviews_df

    except ValueError as ve:
        print("ValueError:", app_name)
        print("Error message:", str(ve))
    except Exception as e:
        # Deliberately broad: keep the batch going when a single app fails.
        print("Generic Error:", app_name)
        print("Error message:", str(e))

    return None

In [None]:
# Maps each "<name>:id<digits>" entry to the DataFrame of its reviews.
# Apps for which fetch_reviews returned None are left out.
apps_df = {}

for domain, apps in domains_apps.items():
    for app in apps:
        # Entries look like "<name>:id<digits>"; split off the numeric id.
        app_name, _, raw_id = app.partition(":")
        app_id = int(raw_id.replace("id", ""))
        df = fetch_reviews(app_name, app_id, domain)
        if df is not None:
            apps_df[app] = df
            # len(df) is the number of review rows fetched for this app.
            print("\n\n domain:", domain, " app:", app_name, " #reviews: ", len(df))