In [1]:
# import dependencies
from pathlib import Path
import pandas as pd


In [2]:
# read the forbes_2022_billionaires.csv file into a dataframe
data = Path('Resources/forbes_2022_billionaires.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,rank,personName,age,finalWorth,year,month,category,source,country,state,...,organization,selfMade,gender,birthDate,title,philanthropyScore,residenceMsa,numberOfSiblings,bio,about
0,1,Elon Musk,50.0,219000,2022,4,Automotive,"Tesla, SpaceX",United States,Texas,...,Tesla,True,M,6/28/1971,CEO,1.0,,,Elon Musk is working to revolutionize transpor...,Musk was accepted to a graduate program at Sta...
1,2,Jeff Bezos,58.0,171000,2022,4,Technology,Amazon,United States,Washington,...,Amazon,True,M,1/12/1964,Entrepreneur,1.0,"Seattle-Tacoma-Bellevue, WA",,Jeff Bezos founded e-commerce giant Amazon in ...,"Growing up, Jeff Bezos worked summers on his g..."
2,3,Bernard Arnault & family,73.0,158000,2022,4,Fashion & Retail,LVMH,France,,...,LVMH Moët Hennessy Louis Vuitton,False,M,3/5/1949,Chairman and CEO,,,,Bernard Arnault oversees the LVMH empire of so...,"Arnault apparently wooed his wife, Helene Merc..."
3,4,Bill Gates,66.0,129000,2022,4,Technology,Microsoft,United States,Washington,...,Bill & Melinda Gates Foundation,True,M,10/28/1955,Cofounder,4.0,"Seattle-Tacoma-Bellevue, WA",,Bill Gates turned his fortune from software fi...,"When Gates was a kid, he spent so much time re..."
4,5,Warren Buffett,91.0,118000,2022,4,Finance & Investments,Berkshire Hathaway,United States,Nebraska,...,Berkshire Hathaway,True,M,8/30/1930,CEO,5.0,"Omaha, NE",,"Known as the ""Oracle of Omaha,"" Warren Buffett...","Buffett still lives in the same Omaha, Nebrask..."


In [3]:
# print column names
df.columns


Index(['rank', 'personName', 'age', 'finalWorth', 'year', 'month', 'category',
       'source', 'country', 'state', 'city', 'countryOfCitizenship',
       'organization', 'selfMade', 'gender', 'birthDate', 'title',
       'philanthropyScore', 'residenceMsa', 'numberOfSiblings', 'bio',
       'about'],
      dtype='object')

In [4]:
# drop unnecessary columns ['year', 'month', 'countryOfCitizenship', 'source', 'state', 'city', 'organization', 'philanthropyScore', 'residenceMsa', 'numberOfSiblings', 'bio', 'about', 'title']
cols = ['year', 'month', 'countryOfCitizenship', 'source', 'state', 'city', 'organization', 'philanthropyScore', 'residenceMsa', 'numberOfSiblings', 'bio',
       'about', 'title', 'birthDate']

df = df.drop(columns=cols)
df.head()

Unnamed: 0,rank,personName,age,finalWorth,category,country,selfMade,gender
0,1,Elon Musk,50.0,219000,Automotive,United States,True,M
1,2,Jeff Bezos,58.0,171000,Technology,United States,True,M
2,3,Bernard Arnault & family,73.0,158000,Fashion & Retail,France,False,M
3,4,Bill Gates,66.0,129000,Technology,United States,True,M
4,5,Warren Buffett,91.0,118000,Finance & Investments,United States,True,M


In [5]:
# Check data types
df.dtypes


rank            int64
personName     object
age           float64
finalWorth      int64
category       object
country        object
selfMade         bool
gender         object
dtype: object

In [6]:
# count columns
df.count()

rank          2668
personName    2668
age           2582
finalWorth    2668
category      2668
country       2668
selfMade      2668
gender        2652
dtype: int64

In [7]:
# check for null values
for column in df.columns:
    print(f"Column {column} has {df[column].isnull().sum()} null values")


Column rank has 0 null values
Column personName has 0 null values
Column age has 86 null values
Column finalWorth has 0 null values
Column category has 0 null values
Column country has 0 null values
Column selfMade has 0 null values
Column gender has 16 null values


In [8]:
# drop NaN values
df_cleaned = df.dropna()
df_cleaned


Unnamed: 0,rank,personName,age,finalWorth,category,country,selfMade,gender
0,1,Elon Musk,50.0,219000,Automotive,United States,True,M
1,2,Jeff Bezos,58.0,171000,Technology,United States,True,M
2,3,Bernard Arnault & family,73.0,158000,Fashion & Retail,France,False,M
3,4,Bill Gates,66.0,129000,Technology,United States,True,M
4,5,Warren Buffett,91.0,118000,Finance & Investments,United States,True,M
...,...,...,...,...,...,...,...,...
2663,2578,Zhang Yuqiang,66.0,1000,Manufacturing,China,True,M
2664,2578,Zhou Ruxin,59.0,1000,Technology,China,True,M
2665,2578,Wen Zhou & family,57.0,1000,Manufacturing,China,True,M
2666,2578,Zhou Yifeng & family,43.0,1000,Energy,China,True,F


In [9]:
# check for duplicates
df_cleaned.drop_duplicates()

# Note: when comparing drop_duplicates dataframe with the above dataframe,
# there are the same number of rows - there are no duplicates in the data.


Unnamed: 0,rank,personName,age,finalWorth,category,country,selfMade,gender
0,1,Elon Musk,50.0,219000,Automotive,United States,True,M
1,2,Jeff Bezos,58.0,171000,Technology,United States,True,M
2,3,Bernard Arnault & family,73.0,158000,Fashion & Retail,France,False,M
3,4,Bill Gates,66.0,129000,Technology,United States,True,M
4,5,Warren Buffett,91.0,118000,Finance & Investments,United States,True,M
...,...,...,...,...,...,...,...,...
2663,2578,Zhang Yuqiang,66.0,1000,Manufacturing,China,True,M
2664,2578,Zhou Ruxin,59.0,1000,Technology,China,True,M
2665,2578,Wen Zhou & family,57.0,1000,Manufacturing,China,True,M
2666,2578,Zhou Yifeng & family,43.0,1000,Energy,China,True,F


In [10]:
# Encoding gender to numerical binary values in cleaned dataframe. This will split the gender column into seperate columns on for female and one for male
# 0=false and 1=true 
df_cleaned = pd.get_dummies(df_cleaned, columns=["gender"])
df_cleaned

Unnamed: 0,rank,personName,age,finalWorth,category,country,selfMade,gender_F,gender_M
0,1,Elon Musk,50.0,219000,Automotive,United States,True,0,1
1,2,Jeff Bezos,58.0,171000,Technology,United States,True,0,1
2,3,Bernard Arnault & family,73.0,158000,Fashion & Retail,France,False,0,1
3,4,Bill Gates,66.0,129000,Technology,United States,True,0,1
4,5,Warren Buffett,91.0,118000,Finance & Investments,United States,True,0,1
...,...,...,...,...,...,...,...,...,...
2663,2578,Zhang Yuqiang,66.0,1000,Manufacturing,China,True,0,1
2664,2578,Zhou Ruxin,59.0,1000,Technology,China,True,0,1
2665,2578,Wen Zhou & family,57.0,1000,Manufacturing,China,True,0,1
2666,2578,Zhou Yifeng & family,43.0,1000,Energy,China,True,1,0


In [11]:
# Convert country column into numerical values
# Since there are more than two countries we can't use a numerical binary value to encode the country column
# Instead we have to create a custom country dictionary, and assign a specific numerical value for each unique country
# Process below is turning the country column into a list, remove duplicates, assign numerical values, and then encode the country column in the cleaned_df

# Convert Country Column to list 
country_list = df_cleaned["country"].values.tolist()

# Remove duplicate values from country list
country_list = list(set(country_list))
country_list

['United States',
 'Netherlands',
 'Belgium',
 'Nepal',
 'Qatar',
 'United Kingdom',
 'Thailand',
 'Kazakhstan',
 'Nigeria',
 'Uruguay',
 'Turks and Caicos Islands',
 'Egypt',
 'Greece',
 'Hungary',
 'Finland',
 'Philippines',
 'Poland',
 'France',
 'Guernsey',
 'India',
 'Tanzania',
 'Chile',
 'Eswatini (Swaziland)',
 'Georgia',
 'Monaco',
 'Russia',
 'Taiwan',
 'Lebanon',
 'Colombia',
 'Austria',
 'Liechtenstein',
 'Portugal',
 'Vietnam',
 'Bermuda',
 'Spain',
 'Norway',
 'Malaysia',
 'United Arab Emirates',
 'British Virgin Islands',
 'South Korea',
 'Cambodia',
 'Germany',
 'Sweden',
 'Bahrain',
 'Indonesia',
 'Switzerland',
 'Australia',
 'Ukraine',
 'Andorra',
 'Oman',
 'Bahamas',
 'Mexico',
 'Italy',
 'Brazil',
 'Israel',
 'Morocco',
 'Slovakia',
 'Peru',
 'New Zealand',
 'Denmark',
 'South Africa',
 'Japan',
 'Algeria',
 'Singapore',
 'Argentina',
 'Czechia',
 'Cayman Islands',
 'Turkey',
 'Romania',
 'China',
 'Hong Kong',
 'Canada',
 'Ireland']

In [12]:
# Enumerate list and convert to dictionary
country_dict = {key: i for i, key in enumerate(country_list)}
country_dict

{'United States': 0,
 'Netherlands': 1,
 'Belgium': 2,
 'Nepal': 3,
 'Qatar': 4,
 'United Kingdom': 5,
 'Thailand': 6,
 'Kazakhstan': 7,
 'Nigeria': 8,
 'Uruguay': 9,
 'Turks and Caicos Islands': 10,
 'Egypt': 11,
 'Greece': 12,
 'Hungary': 13,
 'Finland': 14,
 'Philippines': 15,
 'Poland': 16,
 'France': 17,
 'Guernsey': 18,
 'India': 19,
 'Tanzania': 20,
 'Chile': 21,
 'Eswatini (Swaziland)': 22,
 'Georgia': 23,
 'Monaco': 24,
 'Russia': 25,
 'Taiwan': 26,
 'Lebanon': 27,
 'Colombia': 28,
 'Austria': 29,
 'Liechtenstein': 30,
 'Portugal': 31,
 'Vietnam': 32,
 'Bermuda': 33,
 'Spain': 34,
 'Norway': 35,
 'Malaysia': 36,
 'United Arab Emirates': 37,
 'British Virgin Islands': 38,
 'South Korea': 39,
 'Cambodia': 40,
 'Germany': 41,
 'Sweden': 42,
 'Bahrain': 43,
 'Indonesia': 44,
 'Switzerland': 45,
 'Australia': 46,
 'Ukraine': 47,
 'Andorra': 48,
 'Oman': 49,
 'Bahamas': 50,
 'Mexico': 51,
 'Italy': 52,
 'Brazil': 53,
 'Israel': 54,
 'Morocco': 55,
 'Slovakia': 56,
 'Peru': 57,
 'New

In [13]:
# Custom numerical coding for country 
df_cleaned["country"] = df_cleaned["country"].apply(lambda x: country_dict[x])
df_cleaned

Unnamed: 0,rank,personName,age,finalWorth,category,country,selfMade,gender_F,gender_M
0,1,Elon Musk,50.0,219000,Automotive,0,True,0,1
1,2,Jeff Bezos,58.0,171000,Technology,0,True,0,1
2,3,Bernard Arnault & family,73.0,158000,Fashion & Retail,17,False,0,1
3,4,Bill Gates,66.0,129000,Technology,0,True,0,1
4,5,Warren Buffett,91.0,118000,Finance & Investments,0,True,0,1
...,...,...,...,...,...,...,...,...,...
2663,2578,Zhang Yuqiang,66.0,1000,Manufacturing,69,True,0,1
2664,2578,Zhou Ruxin,59.0,1000,Technology,69,True,0,1
2665,2578,Wen Zhou & family,57.0,1000,Manufacturing,69,True,0,1
2666,2578,Zhou Yifeng & family,43.0,1000,Energy,69,True,1,0


In [14]:
# Convert category column to list 
category_list = df_cleaned["category"].values.tolist()

# Remove duplicate values from category list
category_list = list(set(category_list))

# Enumerate list and convert to dictionary
category_dict = {key: i for i, key in enumerate(category_list)}
category_dict

{'Technology': 0,
 'Construction & Engineering': 1,
 'Healthcare': 2,
 'Food & Beverage': 3,
 'Service': 4,
 'Real Estate': 5,
 'Gambling & Casinos': 6,
 'Finance & Investments': 7,
 'Automotive': 8,
 'Logistics': 9,
 'Manufacturing': 10,
 'Fashion & Retail': 11,
 'Telecom': 12,
 'Energy': 13,
 'Media & Entertainment': 14,
 'Sports': 15,
 'Metals & Mining': 16,
 'Diversified': 17}

In [15]:
# Custom numerical coding for country 
df_cleaned["category"] = df_cleaned["category"].apply(lambda x: category_dict[x])
df_cleaned

Unnamed: 0,rank,personName,age,finalWorth,category,country,selfMade,gender_F,gender_M
0,1,Elon Musk,50.0,219000,8,0,True,0,1
1,2,Jeff Bezos,58.0,171000,0,0,True,0,1
2,3,Bernard Arnault & family,73.0,158000,11,17,False,0,1
3,4,Bill Gates,66.0,129000,0,0,True,0,1
4,5,Warren Buffett,91.0,118000,7,0,True,0,1
...,...,...,...,...,...,...,...,...,...
2663,2578,Zhang Yuqiang,66.0,1000,10,69,True,0,1
2664,2578,Zhou Ruxin,59.0,1000,0,69,True,0,1
2665,2578,Wen Zhou & family,57.0,1000,10,69,True,0,1
2666,2578,Zhou Yifeng & family,43.0,1000,13,69,True,1,0


In [16]:
# Convert age from float dtype to int
df_cleaned['age'] = df_cleaned['age'].astype(int)
df_cleaned

Unnamed: 0,rank,personName,age,finalWorth,category,country,selfMade,gender_F,gender_M
0,1,Elon Musk,50,219000,8,0,True,0,1
1,2,Jeff Bezos,58,171000,0,0,True,0,1
2,3,Bernard Arnault & family,73,158000,11,17,False,0,1
3,4,Bill Gates,66,129000,0,0,True,0,1
4,5,Warren Buffett,91,118000,7,0,True,0,1
...,...,...,...,...,...,...,...,...,...
2663,2578,Zhang Yuqiang,66,1000,10,69,True,0,1
2664,2578,Zhou Ruxin,59,1000,0,69,True,0,1
2665,2578,Wen Zhou & family,57,1000,10,69,True,0,1
2666,2578,Zhou Yifeng & family,43,1000,13,69,True,1,0


In [17]:
df_cleaned.dtypes

rank           int64
personName    object
age            int64
finalWorth     int64
category       int64
country        int64
selfMade        bool
gender_F       uint8
gender_M       uint8
dtype: object

In [18]:
# saving cleaned data to csv

file_path = "Resources/cleaned_billionaire.csv"
df_cleaned.to_csv(file_path, index=False)

