In [None]:
"""Scrape data from NomadList into local CSV.
Usage:
    python scrape.py --cities Austin 'Chiang Mai' Taipei Auckland Ubud 'Buenos Aires' 'Mexico City'
"""
import argparse
import logging
import os
import re
import string
import typing as T
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from tabulate import tabulate
from tqdm import tqdm
import requests
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def load_to_record(city, timeout=30):
    """Fetch one city's NomadList page and return its score table as a dict.

    The page URL is ``https://nomadlist.com/<slug>``, where the slug is the
    city name lower-cased with each run of whitespace replaced by a hyphen.

    Args:
        city: City name, e.g. ``"Chiang Mai"`` or ``"Chiang-Mai"``.
        timeout: Seconds to wait for the HTTP response before giving up.
            Without a timeout ``requests`` can block indefinitely.

    Returns:
        A dict mapping the text of each ``td.key`` cell to the text of the
        ``td.value`` cell at the same position inside the first
        ``div.tab-ranking`` element, plus a ``"city"`` entry holding the
        ``city`` argument unchanged.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: No response arrived within ``timeout`` seconds.
        ValueError: The page contains no ``div.tab-ranking`` element.
    """
    slug = re.sub(r"\s+", "-", city.strip().lower())
    url = f"https://nomadlist.com/{slug}"

    response = requests.get(
        url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    nomad_scores = soup.find("div", attrs={"class": "tab-ranking"})
    if nomad_scores is None:
        # Fail with a message that names the page instead of a bare IndexError.
        raise ValueError(f"No 'tab-ranking' section found at {url}")

    keys = [
        cell.getText()
        for cell in nomad_scores.find_all("td", attrs={"class": "key"})
    ]
    values = [
        cell.getText()
        for cell in nomad_scores.find_all("td", attrs={"class": "value"})
    ]

    # Keys and values are paired purely by position in the document.
    record = dict(zip(keys, values))
    record["city"] = city
    return record


def load_to_df(cities):
    """Fetch every city in ``cities`` and return a cleaned, sorted DataFrame.

    Each city is fetched with ``load_to_record``; cities whose fetch raises
    are logged and skipped. Column names are reduced to printable ASCII
    (dropping emojis and ``*``), the frame is restricted to a fixed set of
    columns, numeric columns are parsed to floats, and descriptive ratings
    are mapped to integers.

    Args:
        cities: Iterable of city names.

    Returns:
        A DataFrame with one row per successfully fetched city and the
        columns listed in ``top_cols``, sorted by ``"Total score"``
        descending. A column that no page provided is still present (filled
        as missing), so an empty ``cities`` yields an empty frame with the
        expected columns rather than an error.
    """
    top_cols = [
        "city",
        "Total score",
        "Cost",
        "Quality of life score",
        "Internet",
        "Safety",
        "Fun",
        "Walkability",
        "Nightlife",
        "Friendly to foreigners",
        "Freedom of speech",
        "English speaking",
        "Food safety",
        "Places to work from",
    ]
    # Columns that are parsed as numbers (or kept verbatim, for "city")
    # instead of being translated through ``rating_map``.
    skip_list = ["Total score", "Cost", "Internet", "city"]
    rating_map = {
        "Great": 5,
        "Good": 4,
        "Okay": 3,
        "Bad": 2,
        "Terrible": 1,
    }
    missing = float("nan")

    def skip_fail(city):
        # Best-effort fetch: one failing city must not abort the whole run.
        try:
            print("FETCHING")
            return load_to_record(city)
        except Exception as exc:
            logger.exception("Failed to fetch city: %s, %s", city, exc)
            return None

    def strip_emojis(s):
        printable = "".join(ch for ch in s if ch in string.printable)
        return printable.strip().replace("*", "")

    def extract_numeric(value):
        # First number in the text (thousands separators removed), or NaN.
        if not isinstance(value, str):
            return missing
        match_ = re.search(r"[-+]?\d*\.\d+|\d+", value.replace(",", ""))
        return float(match_.group()) if match_ else missing

    def parse_total_score(value):
        # Only the part before the first "/" is the score itself.
        if isinstance(value, str):
            value = value.split("/")[0]
        return extract_numeric(value)

    def rating_to_score(value):
        # A missing rating counts as 0; an unrecognised label stays NaN.
        if not isinstance(value, str):
            return 0
        return rating_map.get(value, missing)

    records = [rec for rec in map(skip_fail, cities) if rec is not None]

    df = pd.DataFrame(records)
    df.columns = [strip_emojis(col) for col in df.columns]

    # reindex (rather than df[top_cols]) returns an independent frame and
    # tolerates columns that were absent from every fetched page.
    df = df.reindex(columns=top_cols)

    df["Total score"] = df["Total score"].apply(parse_total_score)
    df["Cost"] = df["Cost"].apply(extract_numeric)
    df["Internet"] = df["Internet"].apply(extract_numeric)

    for col in top_cols:
        if col not in skip_list:
            df[col] = df[col].map(rating_to_score)

    return df.sort_values(by=["Total score"], ascending=False)


def get_parser():
    """Build the command-line parser for the scraper.

    Returns:
        An ``argparse.ArgumentParser`` with a single optional ``--cities``
        option that accepts one or more city names.
    """
    cli = argparse.ArgumentParser(
        description="Fetch data from NomadList and write as CSV"
    )
    cli.add_argument(
        "--cities",
        help="Cities to fetch data on",
        nargs="+",
    )
    return cli


def main(cities: T.Optional[T.List[str]] = None):
    """Scrape ``cities``, write the result to ``nomadlist.csv`` and print it.

    Args:
        cities: City names to fetch. The parameter keeps a default so the
            signature stays callable without arguments, but a list must be
            supplied.

    Raises:
        ValueError: ``cities`` was not provided.
    """
    if cities is None:
        # Fail early with a clear message instead of deep inside the fetch.
        raise ValueError("main() requires a list of city names")

    cache_file = "nomadlist.csv"

    logger.info("Fetching contents for first time '%s'", cache_file)
    df = load_to_df(cities)
    df.to_csv(cache_file, index=False)

    print(tabulate(df, headers="keys", tablefmt="psql"))

# Cities scraped when no --cities option is given on the command line.
# Multi-word names are written with hyphens, matching the URL slug format.
asean_cities = [
    "Jakarta",
    "Bangkok",
    "Ho-Chi-Minh-City",
    "Hanoi",
    "Manila",
    "Singapore",
    "Kuala-Lumpur",
    "Yangon",
    "Surabaya",
    "Bandung",
    "Medan",
    "Phnom-Penh",
    "Chiang-Mai",
    "Davao",
    "Cebu",
    "Vientiane",
    "Naypyidaw",
    "Makassar",
    "Pattaya",
    "George-Town",
    "Da-Nang",
    "Ubud",
    "Baguio",
    "Ipoh",
]


# Guarded so that importing this module does not start a scrape.
if __name__ == "__main__":
    # parse_known_args tolerates unrecognised argv entries instead of exiting.
    args, _unknown = get_parser().parse_known_args()
    main(args.cities or asean_cities)

INFO:__main__:Fetching contents for first time 'nomadlist.csv'


FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
FETCHING
+----+------------------+---------------+--------+-------------------------+------------+----------+-------+---------------+-------------+--------------------------+---------------------+--------------------+---------------+-----------------------+
|    | city             |   Total score |   Cost |   Quality of life score |   Internet |   Safety |   Fun |   Walkability |   Nightlife |   Friendly to foreigners |   Freedom of speech |   English speaking |   Food safety |   Places to work from |
|----+------------------+---------------+--------+-------------------------+------------+----------+-------+---------------+-------------+--------------------------+---------------------+--------------------+---------------+-----------------------|
|  1 | Bangkok          |          4.