In [1]:
# import dependencies
from pathlib import Path
import pandas as pd


In [2]:
# read the forbes_2022_billionaires.csv file into a dataframe
data = Path('Resources/forbes_2022_billionaires.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,rank,personName,age,finalWorth,year,month,category,source,country,state,...,organization,selfMade,gender,birthDate,title,philanthropyScore,residenceMsa,numberOfSiblings,bio,about
0,1,Elon Musk,50.0,219000,2022,4,Automotive,"Tesla, SpaceX",United States,Texas,...,Tesla,True,M,6/28/1971,CEO,1.0,,,Elon Musk is working to revolutionize transpor...,Musk was accepted to a graduate program at Sta...
1,2,Jeff Bezos,58.0,171000,2022,4,Technology,Amazon,United States,Washington,...,Amazon,True,M,1/12/1964,Entrepreneur,1.0,"Seattle-Tacoma-Bellevue, WA",,Jeff Bezos founded e-commerce giant Amazon in ...,"Growing up, Jeff Bezos worked summers on his g..."
2,3,Bernard Arnault & family,73.0,158000,2022,4,Fashion & Retail,LVMH,France,,...,LVMH Moët Hennessy Louis Vuitton,False,M,3/5/1949,Chairman and CEO,,,,Bernard Arnault oversees the LVMH empire of so...,"Arnault apparently wooed his wife, Helene Merc..."
3,4,Bill Gates,66.0,129000,2022,4,Technology,Microsoft,United States,Washington,...,Bill & Melinda Gates Foundation,True,M,10/28/1955,Cofounder,4.0,"Seattle-Tacoma-Bellevue, WA",,Bill Gates turned his fortune from software fi...,"When Gates was a kid, he spent so much time re..."
4,5,Warren Buffett,91.0,118000,2022,4,Finance & Investments,Berkshire Hathaway,United States,Nebraska,...,Berkshire Hathaway,True,M,8/30/1930,CEO,5.0,"Omaha, NE",,"Known as the ""Oracle of Omaha,"" Warren Buffett...","Buffett still lives in the same Omaha, Nebrask..."


In [3]:
# print column names
df.columns


Index(['rank', 'personName', 'age', 'finalWorth', 'year', 'month', 'category',
       'source', 'country', 'state', 'city', 'countryOfCitizenship',
       'organization', 'selfMade', 'gender', 'birthDate', 'title',
       'philanthropyScore', 'residenceMsa', 'numberOfSiblings', 'bio',
       'about'],
      dtype='object')

In [4]:
# drop unnecessary columns ['year', 'month', 'countryOfCitizenship', 'source', 'state', 'city', 'organization', 'philanthropyScore', 'residenceMsa', 'numberOfSiblings', 'bio', 'about', 'title']
cols = ['year', 'month', 'countryOfCitizenship', 'source', 'state', 'city', 'organization', 'philanthropyScore', 'residenceMsa', 'numberOfSiblings', 'bio',
       'about', 'title', 'birthDate', 'selfMade']

df = df.drop(columns=cols)
df.head()

Unnamed: 0,rank,personName,age,finalWorth,category,country,gender
0,1,Elon Musk,50.0,219000,Automotive,United States,M
1,2,Jeff Bezos,58.0,171000,Technology,United States,M
2,3,Bernard Arnault & family,73.0,158000,Fashion & Retail,France,M
3,4,Bill Gates,66.0,129000,Technology,United States,M
4,5,Warren Buffett,91.0,118000,Finance & Investments,United States,M


In [5]:
# Check data types
df.dtypes


rank            int64
personName     object
age           float64
finalWorth      int64
category       object
country        object
gender         object
dtype: object

In [6]:
# count columns
df.count()

rank          2668
personName    2668
age           2582
finalWorth    2668
category      2668
country       2668
gender        2652
dtype: int64

In [7]:
# check for null values
for column in df.columns:
    print(f"Column {column} has {df[column].isnull().sum()} null values")


Column rank has 0 null values
Column personName has 0 null values
Column age has 86 null values
Column finalWorth has 0 null values
Column category has 0 null values
Column country has 0 null values
Column gender has 16 null values


In [8]:
# drop NaN values
df_cleaned = df.dropna()
df_cleaned


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender
0,1,Elon Musk,50.0,219000,Automotive,United States,M
1,2,Jeff Bezos,58.0,171000,Technology,United States,M
2,3,Bernard Arnault & family,73.0,158000,Fashion & Retail,France,M
3,4,Bill Gates,66.0,129000,Technology,United States,M
4,5,Warren Buffett,91.0,118000,Finance & Investments,United States,M
...,...,...,...,...,...,...,...
2663,2578,Zhang Yuqiang,66.0,1000,Manufacturing,China,M
2664,2578,Zhou Ruxin,59.0,1000,Technology,China,M
2665,2578,Wen Zhou & family,57.0,1000,Manufacturing,China,M
2666,2578,Zhou Yifeng & family,43.0,1000,Energy,China,F


In [9]:
# check for duplicates
df_cleaned.drop_duplicates()

# Note: when comparing drop_duplicates dataframe with the above dataframe,
# there are the same number of rows - there are no duplicates in the data.


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender
0,1,Elon Musk,50.0,219000,Automotive,United States,M
1,2,Jeff Bezos,58.0,171000,Technology,United States,M
2,3,Bernard Arnault & family,73.0,158000,Fashion & Retail,France,M
3,4,Bill Gates,66.0,129000,Technology,United States,M
4,5,Warren Buffett,91.0,118000,Finance & Investments,United States,M
...,...,...,...,...,...,...,...
2663,2578,Zhang Yuqiang,66.0,1000,Manufacturing,China,M
2664,2578,Zhou Ruxin,59.0,1000,Technology,China,M
2665,2578,Wen Zhou & family,57.0,1000,Manufacturing,China,M
2666,2578,Zhou Yifeng & family,43.0,1000,Energy,China,F


In [10]:
copy_df_cleaned = df_cleaned.copy()
copy_df_cleaned

Unnamed: 0,rank,personName,age,finalWorth,category,country,gender
0,1,Elon Musk,50.0,219000,Automotive,United States,M
1,2,Jeff Bezos,58.0,171000,Technology,United States,M
2,3,Bernard Arnault & family,73.0,158000,Fashion & Retail,France,M
3,4,Bill Gates,66.0,129000,Technology,United States,M
4,5,Warren Buffett,91.0,118000,Finance & Investments,United States,M
...,...,...,...,...,...,...,...
2663,2578,Zhang Yuqiang,66.0,1000,Manufacturing,China,M
2664,2578,Zhou Ruxin,59.0,1000,Technology,China,M
2665,2578,Wen Zhou & family,57.0,1000,Manufacturing,China,M
2666,2578,Zhou Yifeng & family,43.0,1000,Energy,China,F


In [11]:
copy_df_cleaned["finalWorth>4799"] = ["1" if value > 4799 else "0" for value in df_cleaned["finalWorth"]]
copy_df_cleaned

Unnamed: 0,rank,personName,age,finalWorth,category,country,gender,finalWorth>4799
0,1,Elon Musk,50.0,219000,Automotive,United States,M,1
1,2,Jeff Bezos,58.0,171000,Technology,United States,M,1
2,3,Bernard Arnault & family,73.0,158000,Fashion & Retail,France,M,1
3,4,Bill Gates,66.0,129000,Technology,United States,M,1
4,5,Warren Buffett,91.0,118000,Finance & Investments,United States,M,1
...,...,...,...,...,...,...,...,...
2663,2578,Zhang Yuqiang,66.0,1000,Manufacturing,China,M,0
2664,2578,Zhou Ruxin,59.0,1000,Technology,China,M,0
2665,2578,Wen Zhou & family,57.0,1000,Manufacturing,China,M,0
2666,2578,Zhou Yifeng & family,43.0,1000,Energy,China,F,0


In [12]:
# save this version of the data for the Tableau visualizations
copy_df_cleaned.to_csv('Resources/finalWorth_labeled_cleaned_billionaire.csv', index=False)

In [13]:
# Encoding gender to numerical binary values in cleaned dataframe. This will split the gender column into seperate columns on for female and one for male
# 0=false and 1=true 
df_cleaned = pd.get_dummies(df_cleaned, columns=["gender"])
df_cleaned

Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M
0,1,Elon Musk,50.0,219000,Automotive,United States,0,1
1,2,Jeff Bezos,58.0,171000,Technology,United States,0,1
2,3,Bernard Arnault & family,73.0,158000,Fashion & Retail,France,0,1
3,4,Bill Gates,66.0,129000,Technology,United States,0,1
4,5,Warren Buffett,91.0,118000,Finance & Investments,United States,0,1
...,...,...,...,...,...,...,...,...
2663,2578,Zhang Yuqiang,66.0,1000,Manufacturing,China,0,1
2664,2578,Zhou Ruxin,59.0,1000,Technology,China,0,1
2665,2578,Wen Zhou & family,57.0,1000,Manufacturing,China,0,1
2666,2578,Zhou Yifeng & family,43.0,1000,Energy,China,1,0


In [14]:
# Convert country column into numerical values
# Since there are more than two countries we can't use a numerical binary value to encode the country column
# Instead we have to create a custom country dictionary, and assign a specific numerical value for each unique country
# Process below is turning the country column into a list, remove duplicates, assign numerical values, and then encode the country column in the cleaned_df

# Convert Country Column to list 
country_list = df_cleaned["country"].values.tolist()

# Remove duplicate values from country list
country_list = list(set(country_list))
country_list

['Oman',
 'New Zealand',
 'Algeria',
 'Chile',
 'Argentina',
 'Guernsey',
 'Nepal',
 'Indonesia',
 'Ireland',
 'Japan',
 'South Africa',
 'Israel',
 'Liechtenstein',
 'Thailand',
 'British Virgin Islands',
 'Turks and Caicos Islands',
 'Hungary',
 'Hong Kong',
 'United Arab Emirates',
 'Eswatini (Swaziland)',
 'Malaysia',
 'Kazakhstan',
 'Italy',
 'Brazil',
 'Switzerland',
 'Canada',
 'Ukraine',
 'Uruguay',
 'Czechia',
 'Colombia',
 'Monaco',
 'Norway',
 'Mexico',
 'Netherlands',
 'China',
 'United Kingdom',
 'Tanzania',
 'Sweden',
 'Turkey',
 'Cambodia',
 'Egypt',
 'Qatar',
 'Vietnam',
 'Andorra',
 'Slovakia',
 'Cayman Islands',
 'Nigeria',
 'Philippines',
 'Greece',
 'United States',
 'Peru',
 'Bahrain',
 'Singapore',
 'Spain',
 'Australia',
 'Romania',
 'India',
 'Bahamas',
 'Portugal',
 'Russia',
 'Morocco',
 'Finland',
 'Lebanon',
 'Taiwan',
 'South Korea',
 'Poland',
 'Germany',
 'Georgia',
 'Bermuda',
 'Austria',
 'Denmark',
 'France',
 'Belgium']

In [15]:
# Enumerate list and convert to dictionary
country_dict = {key: i for i, key in enumerate(country_list)}
country_dict

{'Oman': 0,
 'New Zealand': 1,
 'Algeria': 2,
 'Chile': 3,
 'Argentina': 4,
 'Guernsey': 5,
 'Nepal': 6,
 'Indonesia': 7,
 'Ireland': 8,
 'Japan': 9,
 'South Africa': 10,
 'Israel': 11,
 'Liechtenstein': 12,
 'Thailand': 13,
 'British Virgin Islands': 14,
 'Turks and Caicos Islands': 15,
 'Hungary': 16,
 'Hong Kong': 17,
 'United Arab Emirates': 18,
 'Eswatini (Swaziland)': 19,
 'Malaysia': 20,
 'Kazakhstan': 21,
 'Italy': 22,
 'Brazil': 23,
 'Switzerland': 24,
 'Canada': 25,
 'Ukraine': 26,
 'Uruguay': 27,
 'Czechia': 28,
 'Colombia': 29,
 'Monaco': 30,
 'Norway': 31,
 'Mexico': 32,
 'Netherlands': 33,
 'China': 34,
 'United Kingdom': 35,
 'Tanzania': 36,
 'Sweden': 37,
 'Turkey': 38,
 'Cambodia': 39,
 'Egypt': 40,
 'Qatar': 41,
 'Vietnam': 42,
 'Andorra': 43,
 'Slovakia': 44,
 'Cayman Islands': 45,
 'Nigeria': 46,
 'Philippines': 47,
 'Greece': 48,
 'United States': 49,
 'Peru': 50,
 'Bahrain': 51,
 'Singapore': 52,
 'Spain': 53,
 'Australia': 54,
 'Romania': 55,
 'India': 56,
 'Baha

In [16]:
# Custom numerical coding for country 
df_cleaned["country"] = df_cleaned["country"].apply(lambda x: country_dict[x])
df_cleaned

Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M
0,1,Elon Musk,50.0,219000,Automotive,49,0,1
1,2,Jeff Bezos,58.0,171000,Technology,49,0,1
2,3,Bernard Arnault & family,73.0,158000,Fashion & Retail,71,0,1
3,4,Bill Gates,66.0,129000,Technology,49,0,1
4,5,Warren Buffett,91.0,118000,Finance & Investments,49,0,1
...,...,...,...,...,...,...,...,...
2663,2578,Zhang Yuqiang,66.0,1000,Manufacturing,34,0,1
2664,2578,Zhou Ruxin,59.0,1000,Technology,34,0,1
2665,2578,Wen Zhou & family,57.0,1000,Manufacturing,34,0,1
2666,2578,Zhou Yifeng & family,43.0,1000,Energy,34,1,0


In [17]:
# Convert category column to list 
category_list = df_cleaned["category"].values.tolist()

# Remove duplicate values from category list
category_list = list(set(category_list))

# Enumerate list and convert to dictionary
category_dict = {key: i for i, key in enumerate(category_list)}
category_dict

{'Finance & Investments': 0,
 'Media & Entertainment': 1,
 'Technology': 2,
 'Manufacturing': 3,
 'Diversified': 4,
 'Logistics': 5,
 'Automotive': 6,
 'Healthcare': 7,
 'Service': 8,
 'Sports': 9,
 'Food & Beverage': 10,
 'Gambling & Casinos': 11,
 'Real Estate': 12,
 'Energy': 13,
 'Metals & Mining': 14,
 'Fashion & Retail': 15,
 'Construction & Engineering': 16,
 'Telecom': 17}

In [18]:
# Custom numerical coding for country 
df_cleaned["category"] = df_cleaned["category"].apply(lambda x: category_dict[x])
df_cleaned

Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M
0,1,Elon Musk,50.0,219000,6,49,0,1
1,2,Jeff Bezos,58.0,171000,2,49,0,1
2,3,Bernard Arnault & family,73.0,158000,15,71,0,1
3,4,Bill Gates,66.0,129000,2,49,0,1
4,5,Warren Buffett,91.0,118000,0,49,0,1
...,...,...,...,...,...,...,...,...
2663,2578,Zhang Yuqiang,66.0,1000,3,34,0,1
2664,2578,Zhou Ruxin,59.0,1000,2,34,0,1
2665,2578,Wen Zhou & family,57.0,1000,3,34,0,1
2666,2578,Zhou Yifeng & family,43.0,1000,13,34,1,0


In [19]:
# Convert age from float dtype to int
df_cleaned['age'] = df_cleaned['age'].astype(int)
df_cleaned

Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M
0,1,Elon Musk,50,219000,6,49,0,1
1,2,Jeff Bezos,58,171000,2,49,0,1
2,3,Bernard Arnault & family,73,158000,15,71,0,1
3,4,Bill Gates,66,129000,2,49,0,1
4,5,Warren Buffett,91,118000,0,49,0,1
...,...,...,...,...,...,...,...,...
2663,2578,Zhang Yuqiang,66,1000,3,34,0,1
2664,2578,Zhou Ruxin,59,1000,2,34,0,1
2665,2578,Wen Zhou & family,57,1000,3,34,0,1
2666,2578,Zhou Yifeng & family,43,1000,13,34,1,0


In [20]:
df_cleaned.dtypes

rank           int64
personName    object
age            int64
finalWorth     int64
category       int64
country        int64
gender_F       uint8
gender_M       uint8
dtype: object

In [21]:
# saving cleaned data to csv

file_path = "Resources/cleaned_billionaire.csv"
df_cleaned.to_csv(file_path, index=False)

