In [9]:
from selenium import webdriver
import re
import pandas as pd
import time
import os

In [10]:
indicators = {
    'NY.GDP.MKTP.CD': 'GDP_current_USD',
    'NY.GDP.PCAP.CD': 'GDP_per_capita',
    'FP.CPI.TOTL.ZG': 'Inflation',
    'SL.UEM.TOTL.ZS': 'Unemployment',
    'SP.POP.TOTL': 'Population',
    'NE.EXP.GNFS.ZS': 'Exports_pct_GDP'
}

countries = {
    'Afghanistan': 'AF',
    'Bangladesh': 'BD',
    'Bhutan': 'BT',
    'India': 'IN',
    'Maldives': 'MV',
    'Nepal': 'NP',
    'Pakistan': 'PK',
    'Sri Lanka': 'LK'
}

# Years to scrape
years_to_scrape = [2018, 2019]

In [29]:
def scrape_multiple_years(indicator_code, country_code, country_name, indicator_name, years):
    """
    Scrape data for multiple years from a single page
    Returns: list of dicts with data
    """
    
    print(f"  {country_name:12} - {indicator_name:20}...", end=" ", flush=True)
    
    driver = None
    
    try:
        driver = webdriver.Safari()
        
        url = f"https://data.worldbank.org/indicator/{indicator_code}?locations={country_code}"
        driver.get(url)
        time.sleep(6)
        
        html = driver.page_source
        
        driver.quit()
        driver = None
        
        results = []
        
        for year in years:
            # Pattern: "YEAR",["^ ","^2","atom","^3",VALUE,"^21",true]
            pattern = f'"{year}",\\["[^"]*","[^"]*","[^"]*","[^"]*",([0-9.eE+\\-]+)'
            
            matches = re.findall(pattern, html)
            
            if matches:
                value = float(matches[0])
                results.append({
                    'country': country_name,
                    'country_code': country_code,
                    'indicator_code': indicator_code,
                    'indicator_name': indicator_name,
                    'year': year,
                    'value': value
                })
        
        # Print results
        if results:
            year_values = [f"{r['year']}:‚úÖ" for r in results]
            print(f"{', '.join(year_values)}")
        else:
            print("‚ö†Ô∏è No data")
        
        return results
            
    except Exception as e:
        print(f"‚ùå Error: {str(e)[:40]}")
        return []
    
    finally:
        if driver is not None:
            try:
                driver.quit()
            except:
                pass
        time.sleep(1)

print("‚úÖ Scraping function defined")

‚úÖ Scraping function defined


# Main Scraping

In [None]:
print("="*80)
print("WORLD BANK 2018-2019 DATA SCRAPER")
print("="*80)
print(f"\n‚è±Ô∏è  Estimated time: ~5 minutes")
print(f"üì¶ Total pages to visit: {len(indicators) * len(countries)}\n")

all_data = []
total = len(indicators) * len(countries)
current = 0

for indicator_code, indicator_name in indicators.items():
    
    print(f"\n{'‚îÄ'*80}")
    print(f"üìà {indicator_name}")
    print('‚îÄ'*80)
    
    for country_name, country_code in countries.items():
        
        current += 1
        print(f"[{current}/{total}] ", end="")
        
        data = scrape_multiple_years(
            indicator_code,
            country_code,
            country_name,
            indicator_name,
            years_to_scrape
        )
        
        if data:
            all_data.extend(data)
        
        time.sleep(2)
    
    time.sleep(3)

print("\n" + "="*80)
print("üéâ SCRAPING COMPLETE!")
print("="*80)

WORLD BANK 2018-2019 DATA SCRAPER

‚è±Ô∏è  Estimated time: ~5 minutes
üì¶ Total pages to visit: 48


‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üìà GDP_current_USD
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
[1/48]   Afghanistan  - GDP_current_USD     ... 2018:‚úÖ, 2019:‚úÖ
[2/48]   Bangladesh   - GDP_current_USD     ... 2018:‚úÖ, 2019:‚úÖ
[3/48]   Bhutan       - GDP_current_USD     ... 2018:‚úÖ, 2019:‚úÖ
[4/48]   India        - GDP_current_USD     ... 2018:‚úÖ, 2019:‚úÖ
[5/48]   Maldives     - GDP_current_USD     ... 

In [16]:
df = pd.DataFrame(all_data)
df.head()

Unnamed: 0,country,country_code,indicator_code,indicator_name,year,value,source
0,Afghanistan,AF,NY.GDP.MKTP.CD,GDP_current_USD,2018,18053220000.0,World Bank (Scraped)
1,Afghanistan,AF,NY.GDP.MKTP.CD,GDP_current_USD,2019,18799440000.0,World Bank (Scraped)
2,Bangladesh,BD,NY.GDP.MKTP.CD,GDP_current_USD,2018,321362800000.0,World Bank (Scraped)
3,Bangladesh,BD,NY.GDP.MKTP.CD,GDP_current_USD,2019,351231700000.0,World Bank (Scraped)
4,Bhutan,BT,NY.GDP.MKTP.CD,GDP_current_USD,2018,2583336000.0,World Bank (Scraped)


In [18]:
df.shape

(94, 7)

In [19]:
df.describe()

Unnamed: 0,year,value
count,94.0,94.0
mean,2018.5,76552590000.0
std,0.502681,405233700000.0
min,2018.0,-0.1333734
25%,2018.0,5.572619
50%,2018.5,493.9699
75%,2019.0,34660800.0
max,2019.0,2835606000000.0


In [20]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 94 entries, 0 to 93
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   country         94 non-null     str    
 1   country_code    94 non-null     str    
 2   indicator_code  94 non-null     str    
 3   indicator_name  94 non-null     str    
 4   year            94 non-null     int64  
 5   value           94 non-null     float64
 6   source          94 non-null     str    
dtypes: float64(1), int64(1), str(5)
memory usage: 5.3 KB


In [22]:
df.isna().sum()

country           0
country_code      0
indicator_code    0
indicator_name    0
year              0
value             0
source            0
dtype: int64

In [23]:
df.dropna()

Unnamed: 0,country,country_code,indicator_code,indicator_name,year,value,source
0,Afghanistan,AF,NY.GDP.MKTP.CD,GDP_current_USD,2018,1.805322e+10,World Bank (Scraped)
1,Afghanistan,AF,NY.GDP.MKTP.CD,GDP_current_USD,2019,1.879944e+10,World Bank (Scraped)
2,Bangladesh,BD,NY.GDP.MKTP.CD,GDP_current_USD,2018,3.213628e+11,World Bank (Scraped)
3,Bangladesh,BD,NY.GDP.MKTP.CD,GDP_current_USD,2019,3.512317e+11,World Bank (Scraped)
4,Bhutan,BT,NY.GDP.MKTP.CD,GDP_current_USD,2018,2.583336e+09,World Bank (Scraped)
...,...,...,...,...,...,...,...
89,Nepal,NP,NE.EXP.GNFS.ZS,Exports_pct_GDP,2019,7.779936e+00,World Bank (Scraped)
90,Pakistan,PK,NE.EXP.GNFS.ZS,Exports_pct_GDP,2018,8.581800e+00,World Bank (Scraped)
91,Pakistan,PK,NE.EXP.GNFS.ZS,Exports_pct_GDP,2019,9.390864e+00,World Bank (Scraped)
92,Sri Lanka,LK,NE.EXP.GNFS.ZS,Exports_pct_GDP,2018,2.144625e+01,World Bank (Scraped)


In [25]:
df.duplicated().sum()

np.int64(0)

In [26]:
df.drop_duplicates()

Unnamed: 0,country,country_code,indicator_code,indicator_name,year,value,source
0,Afghanistan,AF,NY.GDP.MKTP.CD,GDP_current_USD,2018,1.805322e+10,World Bank (Scraped)
1,Afghanistan,AF,NY.GDP.MKTP.CD,GDP_current_USD,2019,1.879944e+10,World Bank (Scraped)
2,Bangladesh,BD,NY.GDP.MKTP.CD,GDP_current_USD,2018,3.213628e+11,World Bank (Scraped)
3,Bangladesh,BD,NY.GDP.MKTP.CD,GDP_current_USD,2019,3.512317e+11,World Bank (Scraped)
4,Bhutan,BT,NY.GDP.MKTP.CD,GDP_current_USD,2018,2.583336e+09,World Bank (Scraped)
...,...,...,...,...,...,...,...
89,Nepal,NP,NE.EXP.GNFS.ZS,Exports_pct_GDP,2019,7.779936e+00,World Bank (Scraped)
90,Pakistan,PK,NE.EXP.GNFS.ZS,Exports_pct_GDP,2018,8.581800e+00,World Bank (Scraped)
91,Pakistan,PK,NE.EXP.GNFS.ZS,Exports_pct_GDP,2019,9.390864e+00,World Bank (Scraped)
92,Sri Lanka,LK,NE.EXP.GNFS.ZS,Exports_pct_GDP,2018,2.144625e+01,World Bank (Scraped)


In [28]:
df.to_csv('../data/processed/south_asia_economic_metric_2018_to_2019.csv')
print('Clean WDI South Asia')

Clean WDI South Asia
