In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
def get_econ(url_string):
    # get wikipedia page
    response = requests.get(
        url=url_string,
    )
    # init bs object
    soup = BeautifulSoup(response.content, 'html.parser')
    countries = []
    first = 0


    if ('developed' in url_string.lower()):
        table = soup.find('table',{'class':'wikitable'}).find_all('a')
        first = 9
    if ('emerging' in url_string.lower()):
        table = soup.find('table',{'class':'wikitable sortable'}).find_all('a')
        first = 16
    if ('frontier' in url_string.lower()):
        table = soup.find('table',{'class':'sortable wikitable'}).find_all('a')
        first = 16
        
    for link in table:
        countries.append(link.get('title'))

    
    return countries[first:]

In [3]:
pages = ["https://en.wikipedia.org/wiki/Developed_market",
        "https://en.wikipedia.org/wiki/Emerging_market",
    "https://en.wikipedia.org/wiki/Frontier_markets",
    # "least":"https://en.wikipedia.org/wiki/Least_developed_countries" # no countries in this so we don't scrap
        ]

In [4]:
developed = pd.DataFrame(get_econ(pages[0]), columns=['country']).dropna()
developed.insert(1, 'economic_class', 1)

In [5]:
emerging = pd.DataFrame(get_econ(pages[1]), columns=['country'])
emerging.insert(1, 'economic_class', 2)

In [6]:
frontier = pd.DataFrame(get_econ(pages[2]), columns=['country']).dropna()
frontier.insert(1, 'economic_class', 3)

In [7]:
total = pd.concat([developed, emerging, frontier])

In [8]:
total.info()

<class 'pandas.core.frame.DataFrame'>
Index: 108 entries, 0 to 46
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   country         108 non-null    object
 1   economic_class  108 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 2.5+ KB


In [9]:
billionaires = pd.read_csv('data/old/billionaires_subset.csv', parse_dates=['birthDate'])

In [10]:
billionaires.head()

Unnamed: 0,rank,finalWorth,category,personName,age,country,source,industries,status,gender,birthDate,gdp_country,life_expectancy_country,total_tax_rate_country
0,1,211000,Fashion & Retail,Bernard Arnault & family,74.0,France,LVMH,Fashion & Retail,U,M,1949-03-05,"$2,715,518,274,227.00",82.5,60.7
1,2,180000,Automotive,Elon Musk,51.0,United States,"Tesla, SpaceX",Automotive,D,M,1971-06-28,"$21,427,700,000,000.00",78.5,36.6
2,3,114000,Technology,Jeff Bezos,59.0,United States,Amazon,Technology,D,M,1964-01-12,"$21,427,700,000,000.00",78.5,36.6
3,4,107000,Technology,Larry Ellison,78.0,United States,Oracle,Technology,U,M,1944-08-17,"$21,427,700,000,000.00",78.5,36.6
4,5,106000,Finance & Investments,Warren Buffett,92.0,United States,Berkshire Hathaway,Finance & Investments,D,M,1930-08-30,"$21,427,700,000,000.00",78.5,36.6


In [11]:
df = pd.merge(billionaires, total, left_on='country', right_on='country')

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2652 entries, 0 to 2651
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   rank                     2652 non-null   int64         
 1   finalWorth               2652 non-null   int64         
 2   category                 2652 non-null   object        
 3   personName               2652 non-null   object        
 4   age                      2599 non-null   float64       
 5   country                  2652 non-null   object        
 6   source                   2652 non-null   object        
 7   industries               2652 non-null   object        
 8   status                   2652 non-null   object        
 9   gender                   2652 non-null   object        
 10  birthDate                2588 non-null   datetime64[ns]
 11  gdp_country              2541 non-null   object        
 12  life_expectancy_country  2541 non-

In [13]:
edu = pd.read_csv('data/old/edu_numbered.csv')

In [14]:
df = pd.merge(df, edu, how='left', left_on='personName', right_on='Name')
df = df.rename(columns={'Education': 'education'})
# not dropping NA values for now

In [15]:
df = df.drop(columns=['rank', 'Name'])

In [16]:
# df['Education'] = df['Education'].str.split(' ').str[0]

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2657 entries, 0 to 2656
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   finalWorth               2657 non-null   int64         
 1   category                 2657 non-null   object        
 2   personName               2657 non-null   object        
 3   age                      2604 non-null   float64       
 4   country                  2657 non-null   object        
 5   source                   2657 non-null   object        
 6   industries               2657 non-null   object        
 7   status                   2657 non-null   object        
 8   gender                   2657 non-null   object        
 9   birthDate                2593 non-null   datetime64[ns]
 10  gdp_country              2545 non-null   object        
 11  life_expectancy_country  2545 non-null   float64       
 12  total_tax_rate_country   2545 non-

In [18]:
df.head(3)

Unnamed: 0,finalWorth,category,personName,age,country,source,industries,status,gender,birthDate,gdp_country,life_expectancy_country,total_tax_rate_country,economic_class,education
0,211000,Fashion & Retail,Bernard Arnault & family,74.0,France,LVMH,Fashion & Retail,U,M,1949-03-05,"$2,715,518,274,227.00",82.5,60.7,1,3.0
1,80500,Fashion & Retail,Francoise Bettencourt Meyers & family,69.0,France,L'Oréal,Fashion & Retail,U,F,1953-07-10,"$2,715,518,274,227.00",82.5,60.7,1,
2,40100,Fashion & Retail,François Pinault & family,86.0,France,Luxury goods,Fashion & Retail,D,M,1936-08-21,"$2,715,518,274,227.00",82.5,60.7,1,0.0


In [19]:
df['education'].unique()

array(['3', nan, '0', '2', '4', '1', '5'], dtype=object)

In [20]:
df.to_csv('billionaires.csv', encoding='utf-8',index=False)