# Scraping NUMBEO for Cost of Living Data
https://www.numbeo.com/property-investment/

Here is a starter notebook for scraping some cost of living information from NUMBEO.

In [203]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from pprint import pprint
import pickle

# Lookup sheet mapping each account region/area to the URL slug used in the
# NUMBEO page addresses built further down.
df = pd.read_excel(io='../data/COST OF LIVING.xlsx')
df.head(n=2)

Unnamed: 0,REGION_ACCOUNT,REGION_URL,AREA_ACCOUNT,AREA_URL
0,I/ MEDAN,Medan,ACEH,Banda-Aceh-Indonesia
1,I/ MEDAN,Medan,BATAM,Batam


### Cost of Living for Each Area
Only 15 areas are available on the portal.

In [212]:
# Areas that have a URL slug, i.e. the ones the scraping loop will request.
df.loc[df['AREA_URL'].notna(), 'AREA_ACCOUNT']

0            ACEH
1           BATAM
4       PEKANBARU
8          PADANG
9       PALEMBANG
10         BEKASI
11          BOGOR
17      TANGERANG
23     YOGYAKARTA
24       DENPASAR
26         MALANG
29     BALIKPAPAN
30    BANJARMASIN
31      PONTIANAK
33       MAKASSAR
Name: AREA_ACCOUNT, dtype: object

In [215]:
# Scrape the NUMBEO property-investment table for every area that has a URL slug.
# Each feature ends up as area_dict[area][feature] = [value, (left, right)], where
# left/right are the texts of the 'barTextLeft'/'barTextRight' elements ('' when
# absent). A row with only a label cell and a value cell stores the bare value string.
area_dict = {}

for _, area_row in df[~df.AREA_URL.isna()].iterrows():
    area_url = area_row['AREA_URL']
    area = area_row['AREA_ACCOUNT']

    url_string = f'https://www.numbeo.com/property-investment/in/{area_url}'

    try:
        # Without a timeout (seconds) one stalled connection would hang the whole cell.
        init_html_doc = requests.get(url_string, timeout=30)
    except requests.RequestException as err:
        # Report and move on so one unreachable page does not discard the other areas.
        print(f'Skipping {area}: could not fetch {url_string} ({err})')
        continue

    soup = BeautifulSoup(init_html_doc.content, 'lxml')
    table = soup.find(attrs={'class': 'data_wide_table'})

    if table is None:
        # No data table on the page for this slug; leave the area out of the result.
        continue

    area_dict[area] = {}
    for table_row in table.find_all('tr'):
        cells = table_row.find_all('td')
        if len(cells) < 2:
            # Rows without both a label cell and a value cell carry nothing to store.
            continue

        key = cells[0].text.strip().replace('\xa0', '').replace('Rp', '')
        value = cells[1].text.strip().replace('\xa0', '').replace('Rp', '')
        area_dict[area][key] = value

        # Cells after the value are searched for the range-bar texts; a missing
        # element yields '' instead of relying on a catch-all exception handler.
        for range_cell in cells[2:]:
            left_node = range_cell.find(attrs={'class': 'barTextLeft'})
            right_node = range_cell.find(attrs={'class': 'barTextRight'})
            left = left_node.text.replace('\n', '') if left_node is not None else ''
            right = right_node.text.replace('\n', '') if right_node is not None else ''
            area_dict[area][key] = [value, (left, right)]

### Cost of Living for Each Region

There are only 8 regions available on the portal.
If no area-level data is available, we can always use the regional data instead.

In [200]:
# One row per distinct (region account, region URL slug) pair.
df.loc[:, ['REGION_ACCOUNT', 'REGION_URL']].drop_duplicates(keep='first')

Unnamed: 0,REGION_ACCOUNT,REGION_URL
0,I/ MEDAN,Medan
6,II/ PALEMBANG,Palembang-Indonesia
10,III/ JAKARTA,Jakarta
18,IV/ BANDUNG,Bandung
21,V/ SEMARANG,Semarang
24,VI/ SURABAYA,Surabaya
29,VII/ BANJARMASIN,Banjarmasin-Indonesia
32,VIII/ MAKASSAR,Makassar-Indonesia


In [216]:
# Scrape the NUMBEO property-investment table for every distinct region.
# Each feature ends up as region_dict[region][feature] = [value, (left, right)], where
# left/right are the texts of the 'barTextLeft'/'barTextRight' elements ('' when
# absent). A row with only a label cell and a value cell stores the bare value string.
region_dict = {}

for _, region_row in df[['REGION_ACCOUNT','REGION_URL']].drop_duplicates().iterrows():
    region_url = region_row['REGION_URL']
    region = region_row['REGION_ACCOUNT']

    url_string = f'https://www.numbeo.com/property-investment/in/{region_url}'

    try:
        # Without a timeout (seconds) one stalled connection would hang the whole cell.
        init_html_doc = requests.get(url_string, timeout=30)
    except requests.RequestException as err:
        # Report and move on so one unreachable page does not discard the other regions.
        print(f'Skipping {region}: could not fetch {url_string} ({err})')
        continue

    soup = BeautifulSoup(init_html_doc.content, 'lxml')
    table = soup.find(attrs={'class': 'data_wide_table'})

    if table is None:
        # No data table on the page for this slug; leave the region out of the result.
        continue

    region_dict[region] = {}
    for table_row in table.find_all('tr'):
        cells = table_row.find_all('td')
        if len(cells) < 2:
            # Rows without both a label cell and a value cell carry nothing to store.
            continue

        key = cells[0].text.strip().replace('\xa0', '').replace('Rp', '')
        value = cells[1].text.strip().replace('\xa0', '').replace('Rp', '')
        region_dict[region][key] = value

        # Cells after the value are searched for the range-bar texts; a missing
        # element yields '' instead of relying on a catch-all exception handler.
        for range_cell in cells[2:]:
            left_node = range_cell.find(attrs={'class': 'barTextLeft'})
            right_node = range_cell.find(attrs={'class': 'barTextRight'})
            left = left_node.text.replace('\n', '') if left_node is not None else ''
            right = right_node.text.replace('\n', '') if right_node is not None else ''
            region_dict[region][key] = [value, (left, right)]

### Format of `area_dict` / `region_dict`

```text
AREA1: 
    {
    FEATURE1: [MEAN VALUE, (MIN, MAX)],
    FEATURE2: [MEAN VALUE, (MIN, MAX)]
     },
AREA2: 
    {
    FEATURE1: [MEAN VALUE, (MIN, MAX)],
    FEATURE2: [MEAN VALUE, (MIN, MAX)]
     }
```

In [219]:
# Pretty-print the scraped regional figures (default 80-column layout).
pprint(region_dict, width=80)

{'I/ MEDAN': {'Apartment (1 bedroom) Outside of Centre': ['1,090,000.00',
                                                          ('700,000.00',
                                                           '1,500,000.00')],
              'Apartment (1 bedroom) in City Centre': ['1,800,000.00',
                                                       ('1,000,000.00',
                                                        '3,000,000.00')],
              'Apartment (3 bedrooms) Outside of Centre': ['3,000,000.00',
                                                           ('1,500,000.00',
                                                            '5,000,000.00')],
              'Apartment (3 bedrooms) in City Centre': ['6,875,000.00',
                                                        ('3,000,000.00',
                                                         '10,000,000.00')],
              'Average Monthly Net Salary (After Tax)': ['2,998,000.00',
                                    

In [218]:
# Pretty-print the scraped area figures (default 80-column layout).
pprint(area_dict, width=80)

{'ACEH': {'Apartment (1 bedroom) Outside of Centre': ['?', ('', '')],
          'Apartment (1 bedroom) in City Centre': ['?', ('', '')],
          'Apartment (3 bedrooms) Outside of Centre': ['?', ('', '')],
          'Apartment (3 bedrooms) in City Centre': ['?', ('', '')],
          'Average Monthly Net Salary (After Tax)': ['2,950,000.00', ('', '')],
          'Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate': ['?',
                                                                                         ('',
                                                                                          '')],
          'Price per Square Meter to Buy Apartment Outside of Centre': ['?',
                                                                        ('',
                                                                         '')],
          'Price per Square Meter to Buy Apartment in City Centre': ['?',
                                                     

### Saving to pickle files

In [220]:
def _dump_pickle(payload, pickle_path):
    """Serialise ``payload`` to ``pickle_path`` using the highest pickle protocol."""
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(payload, out_file, protocol=pickle.HIGHEST_PROTOCOL)


# Regional data is written first, then the per-area data.
_dump_pickle(region_dict, '../data/region_dict.pickle')
_dump_pickle(area_dict, '../data/area_dict.pickle')