In [42]:
# import dependencies
from pathlib import Path
import pandas as pd


In [43]:
# read the forbes_2022_billionaires.csv file into a dataframe
data = Path('Resources/forbes_2018_billionaires.csv')
df = pd.read_csv(data)
df.head()


Unnamed: 0,Rank,Name,Age,Source,Industry,Gender,Continent,Country,Headquarters,State,Net Worth,Title
0,1.0,Jeff Bezos,54.0,Amazon,Technology,Male,North America,United States,WA,Washington,112.0,"CEO and Founder, Amazon.com"
1,2.0,Bill Gates,62.0,Microsoft,Technology,Male,North America,United States,WA,Washington,90.0,"Cofounder, Bill & Melinda Gates Foundation"
2,3.0,Warren Buffett,87.0,Berkshire Hathaway,Finance and Investments,Male,North America,United States,NE,Nebraska,84.0,"CEO, Berkshire Hathaway"
3,4.0,Bernard Arnault,69.0,LVMH,Fashion & Retail,Male,Europe,France,,,72.0,"Chairman and CEO, LVMH Moet Hennessy Louis Vui..."
4,5.0,Mark Zuckerberg,33.0,Facebook,Technology,Male,North America,United States,CA,California,71.0,"Cofounder, Chairman and CEO, Facebook"


In [44]:
# print column names
df.columns


Index(['Rank', 'Name', 'Age', 'Source', 'Industry', 'Gender', 'Continent',
       'Country', 'Headquarters', 'State', 'Net Worth', 'Title'],
      dtype='object')

In [45]:
# drop unnecessary columns ['year', 'month', 'countryOfCitizenship', 'source', 'state', 'city', 'organization', 'philanthropyScore', 'residenceMsa', 'numberOfSiblings', 'bio', 'about', 'title']
cols = ['Continent', 'Source', 'State', 'Headquarters', 'Title']

df = df.drop(columns=cols)
df.head()


Unnamed: 0,Rank,Name,Age,Industry,Gender,Country,Net Worth
0,1.0,Jeff Bezos,54.0,Technology,Male,United States,112.0
1,2.0,Bill Gates,62.0,Technology,Male,United States,90.0
2,3.0,Warren Buffett,87.0,Finance and Investments,Male,United States,84.0
3,4.0,Bernard Arnault,69.0,Fashion & Retail,Male,France,72.0
4,5.0,Mark Zuckerberg,33.0,Technology,Male,United States,71.0


In [46]:
# Rename columns
df = df.rename(columns={"Rank":"rank", "Name":"personName", "Age":"age", "Industry":"category", "Gender":"gender", "Country":"country", "Net Worth":"finalWorth"})
df.head()


Unnamed: 0,rank,personName,age,category,gender,country,finalWorth
0,1.0,Jeff Bezos,54.0,Technology,Male,United States,112.0
1,2.0,Bill Gates,62.0,Technology,Male,United States,90.0
2,3.0,Warren Buffett,87.0,Finance and Investments,Male,United States,84.0
3,4.0,Bernard Arnault,69.0,Fashion & Retail,Male,France,72.0
4,5.0,Mark Zuckerberg,33.0,Technology,Male,United States,71.0


In [47]:
# Reorder columns
df = df[["rank", "personName", "age", "finalWorth", "category", "country","gender"]]
df.head()


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender
0,1.0,Jeff Bezos,54.0,112.0,Technology,United States,Male
1,2.0,Bill Gates,62.0,90.0,Technology,United States,Male
2,3.0,Warren Buffett,87.0,84.0,Finance and Investments,United States,Male
3,4.0,Bernard Arnault,69.0,72.0,Fashion & Retail,France,Male
4,5.0,Mark Zuckerberg,33.0,71.0,Technology,United States,Male


In [48]:
# Replace values in the 'category' column: Change 'and' to '&'
df["category"] = [x.replace("and","&") for x in df["category"]]

df.head()


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender
0,1.0,Jeff Bezos,54.0,112.0,Technology,United States,Male
1,2.0,Bill Gates,62.0,90.0,Technology,United States,Male
2,3.0,Warren Buffett,87.0,84.0,Finance & Investments,United States,Male
3,4.0,Bernard Arnault,69.0,72.0,Fashion & Retail,France,Male
4,5.0,Mark Zuckerberg,33.0,71.0,Technology,United States,Male


In [49]:
# Drop NaN values
df_cleaned = df.dropna()
df_cleaned

Unnamed: 0,rank,personName,age,finalWorth,category,country,gender
0,1.0,Jeff Bezos,54.0,112.0,Technology,United States,Male
1,2.0,Bill Gates,62.0,90.0,Technology,United States,Male
2,3.0,Warren Buffett,87.0,84.0,Finance & Investments,United States,Male
3,4.0,Bernard Arnault,69.0,72.0,Fashion & Retail,France,Male
4,5.0,Mark Zuckerberg,33.0,71.0,Technology,United States,Male
...,...,...,...,...,...,...,...
2203,2124.0,Zhao Xiaoqiang,50.0,1.0,Fashion & Retail,China,Male
2204,2124.0,Zhou Liangzhang,55.0,1.0,Manufacturing,China,Male
2205,2124.0,Zhu Xingming,51.0,1.0,Manufacturing,China,Male
2206,2124.0,Zhuo Jun,52.0,1.0,Manufacturing,Hong Kong,Female


In [50]:
# Check data types
df_cleaned.dtypes


rank          float64
personName     object
age           float64
finalWorth    float64
category       object
country        object
gender         object
dtype: object

In [51]:
# Change rank and finalWorth into int64 data types
df_cleaned[["rank", "finalWorth"]] = df_cleaned[["rank", "finalWorth"]].astype('int64')
df_cleaned.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[["rank", "finalWorth"]] = df_cleaned[["rank", "finalWorth"]].astype('int64')


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender
0,1,Jeff Bezos,54.0,112,Technology,United States,Male
1,2,Bill Gates,62.0,90,Technology,United States,Male
2,3,Warren Buffett,87.0,84,Finance & Investments,United States,Male
3,4,Bernard Arnault,69.0,72,Fashion & Retail,France,Male
4,5,Mark Zuckerberg,33.0,71,Technology,United States,Male


In [52]:
# x1000 for each value in finalWorth to standardize the values between the 2018 and 2022 data
df_cleaned["finalWorth"] = [x * 1000 for x in df_cleaned["finalWorth"]]
df_cleaned.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned["finalWorth"] = [x * 1000 for x in df_cleaned["finalWorth"]]


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender
0,1,Jeff Bezos,54.0,112000,Technology,United States,Male
1,2,Bill Gates,62.0,90000,Technology,United States,Male
2,3,Warren Buffett,87.0,84000,Finance & Investments,United States,Male
3,4,Bernard Arnault,69.0,72000,Fashion & Retail,France,Male
4,5,Mark Zuckerberg,33.0,71000,Technology,United States,Male


In [53]:
df_cleaned["gender"] = [x[0] if x == "Male" else x[0] for x in df_cleaned["gender"]]
df_cleaned.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned["gender"] = [x[0] if x == "Male" else x[0] for x in df_cleaned["gender"]]


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender
0,1,Jeff Bezos,54.0,112000,Technology,United States,M
1,2,Bill Gates,62.0,90000,Technology,United States,M
2,3,Warren Buffett,87.0,84000,Finance & Investments,United States,M
3,4,Bernard Arnault,69.0,72000,Fashion & Retail,France,M
4,5,Mark Zuckerberg,33.0,71000,Technology,United States,M


In [54]:
# check for duplicates
df_cleaned.drop_duplicates()

# Note: when comparing drop_duplicates dataframe with the above dataframe,
# there are the same number of rows - there are no duplicates in the data.



Unnamed: 0,rank,personName,age,finalWorth,category,country,gender
0,1,Jeff Bezos,54.0,112000,Technology,United States,M
1,2,Bill Gates,62.0,90000,Technology,United States,M
2,3,Warren Buffett,87.0,84000,Finance & Investments,United States,M
3,4,Bernard Arnault,69.0,72000,Fashion & Retail,France,M
4,5,Mark Zuckerberg,33.0,71000,Technology,United States,M
...,...,...,...,...,...,...,...
2203,2124,Zhao Xiaoqiang,50.0,1000,Fashion & Retail,China,M
2204,2124,Zhou Liangzhang,55.0,1000,Manufacturing,China,M
2205,2124,Zhu Xingming,51.0,1000,Manufacturing,China,M
2206,2124,Zhuo Jun,52.0,1000,Manufacturing,Hong Kong,F


In [55]:
# Encoding gender to numerical binary values in cleaned dataframe. This will split the gender column into seperate columns on for female and one for male
# 0=false and 1=true 
df_cleaned = pd.get_dummies(df_cleaned, columns=["gender"])
df_cleaned

Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M
0,1,Jeff Bezos,54.0,112000,Technology,United States,0,1
1,2,Bill Gates,62.0,90000,Technology,United States,0,1
2,3,Warren Buffett,87.0,84000,Finance & Investments,United States,0,1
3,4,Bernard Arnault,69.0,72000,Fashion & Retail,France,0,1
4,5,Mark Zuckerberg,33.0,71000,Technology,United States,0,1
...,...,...,...,...,...,...,...,...
2203,2124,Zhao Xiaoqiang,50.0,1000,Fashion & Retail,China,0,1
2204,2124,Zhou Liangzhang,55.0,1000,Manufacturing,China,0,1
2205,2124,Zhu Xingming,51.0,1000,Manufacturing,China,0,1
2206,2124,Zhuo Jun,52.0,1000,Manufacturing,Hong Kong,1,0


In [56]:
# Convert country column into numerical values
# Since there are more than two countries we can't use a numerical binary value to encode the country column
# Instead we have to create a custom country dictionary, and assign a specific numerical value for each unique country
# Process below is turning the country column into a list, remove duplicates, assign numerical values, and then encode the country column in the cleaned_df

# Convert Country Column to list 
country_list = df_cleaned["country"].values.tolist()

# Remove duplicate values from country list
country_list = list(set(country_list))
country_list


['Austria',
 'Hong Kong',
 'Ireland',
 'United Kingdom',
 'Chile',
 'Cyprus',
 'Portugal',
 'Liechtenstein',
 'Malaysia',
 'Kazakhstan',
 'Finland',
 'Iceland',
 'Czech Republic',
 'Turkey',
 'Guernsey',
 'Algeria',
 'Peru',
 'Brazil',
 'Sweden',
 'Romania',
 'United States',
 'Belgium',
 'Lebanon',
 'Tanzania',
 'Taiwan',
 'Morocco',
 'China',
 'Japan',
 'Angola',
 'Poland',
 'Israel',
 'South Africa',
 'Venezuela',
 'Monaco',
 'Ukraine',
 'Russia',
 'France',
 'Mexico',
 'Spain',
 'Georgia',
 'Nepal',
 'Kuwait',
 'Nigeria',
 'Colombia',
 'India',
 'Philippines',
 'Macau',
 'Zimbabwe',
 'Italy',
 'Canada',
 'St. Kitts and Nevis',
 'Slovakia',
 'Greece',
 'South Korea',
 'Thailand',
 'United Arab Emirates',
 'Netherlands',
 'Denmark',
 'Argentina',
 'Vietnam',
 'Qatar',
 'Hungary',
 'Germany',
 'Indonesia',
 'Swaziland',
 'Australia',
 'Oman',
 'Norway',
 'Egypt',
 'Singapore',
 'Switzerland',
 'New Zealand']

In [57]:
# Enumerate list and convert to dictionary
country_dict = {key: i for i, key in enumerate(country_list)}
country_dict

{'Austria': 0,
 'Hong Kong': 1,
 'Ireland': 2,
 'United Kingdom': 3,
 'Chile': 4,
 'Cyprus': 5,
 'Portugal': 6,
 'Liechtenstein': 7,
 'Malaysia': 8,
 'Kazakhstan': 9,
 'Finland': 10,
 'Iceland': 11,
 'Czech Republic': 12,
 'Turkey': 13,
 'Guernsey': 14,
 'Algeria': 15,
 'Peru': 16,
 'Brazil': 17,
 'Sweden': 18,
 'Romania': 19,
 'United States': 20,
 'Belgium': 21,
 'Lebanon': 22,
 'Tanzania': 23,
 'Taiwan': 24,
 'Morocco': 25,
 'China': 26,
 'Japan': 27,
 'Angola': 28,
 'Poland': 29,
 'Israel': 30,
 'South Africa': 31,
 'Venezuela': 32,
 'Monaco': 33,
 'Ukraine': 34,
 'Russia': 35,
 'France': 36,
 'Mexico': 37,
 'Spain': 38,
 'Georgia': 39,
 'Nepal': 40,
 'Kuwait': 41,
 'Nigeria': 42,
 'Colombia': 43,
 'India': 44,
 'Philippines': 45,
 'Macau': 46,
 'Zimbabwe': 47,
 'Italy': 48,
 'Canada': 49,
 'St. Kitts and Nevis': 50,
 'Slovakia': 51,
 'Greece': 52,
 'South Korea': 53,
 'Thailand': 54,
 'United Arab Emirates': 55,
 'Netherlands': 56,
 'Denmark': 57,
 'Argentina': 58,
 'Vietnam': 59,

In [58]:
# Custom numerical coding for country 
df_cleaned["country"] = df_cleaned["country"].apply(lambda x: country_dict[x])
df_cleaned

Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M
0,1,Jeff Bezos,54.0,112000,Technology,20,0,1
1,2,Bill Gates,62.0,90000,Technology,20,0,1
2,3,Warren Buffett,87.0,84000,Finance & Investments,20,0,1
3,4,Bernard Arnault,69.0,72000,Fashion & Retail,36,0,1
4,5,Mark Zuckerberg,33.0,71000,Technology,20,0,1
...,...,...,...,...,...,...,...,...
2203,2124,Zhao Xiaoqiang,50.0,1000,Fashion & Retail,26,0,1
2204,2124,Zhou Liangzhang,55.0,1000,Manufacturing,26,0,1
2205,2124,Zhu Xingming,51.0,1000,Manufacturing,26,0,1
2206,2124,Zhuo Jun,52.0,1000,Manufacturing,1,1,0


In [59]:
# Convert category column to list 
category_list = df_cleaned["category"].values.tolist()

# Remove duplicate values from category list
category_list = list(set(category_list))

# Enumerate list and convert to dictionary
category_dict = {key: i for i, key in enumerate(category_list)}
category_dict



{'Metals & Mining': 0,
 'Healthcare': 1,
 'Energy': 2,
 'Construction & Engineering': 3,
 'Gambling & Casinos': 4,
 'Media & Entertainment': 5,
 'Logistics': 6,
 'Automotive': 7,
 'Technology': 8,
 'Sports': 9,
 'Finance & Investments': 10,
 'Service': 11,
 'Manufacturing': 12,
 'Diversified': 13,
 'Real Estate': 14,
 'Philanthropy/NGO': 15,
 'Food & Beverage': 16,
 'Fashion & Retail': 17,
 'Telecom': 18}

In [61]:
# Custom numerical coding for country 
df_cleaned["category"] = df_cleaned["category"].apply(lambda x: category_dict[x])
df_cleaned

Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M
0,1,Jeff Bezos,54.0,112000,8,20,0,1
1,2,Bill Gates,62.0,90000,8,20,0,1
2,3,Warren Buffett,87.0,84000,10,20,0,1
3,4,Bernard Arnault,69.0,72000,17,36,0,1
4,5,Mark Zuckerberg,33.0,71000,8,20,0,1
...,...,...,...,...,...,...,...,...
2203,2124,Zhao Xiaoqiang,50.0,1000,17,26,0,1
2204,2124,Zhou Liangzhang,55.0,1000,12,26,0,1
2205,2124,Zhu Xingming,51.0,1000,12,26,0,1
2206,2124,Zhuo Jun,52.0,1000,12,1,1,0


In [65]:
# Convert age from float dtype to int
df_cleaned['age'] = df_cleaned['age'].astype(int)
df_cleaned

Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M
0,1,Jeff Bezos,54,112000,8,20,0,1
1,2,Bill Gates,62,90000,8,20,0,1
2,3,Warren Buffett,87,84000,10,20,0,1
3,4,Bernard Arnault,69,72000,17,36,0,1
4,5,Mark Zuckerberg,33,71000,8,20,0,1
...,...,...,...,...,...,...,...,...
2203,2124,Zhao Xiaoqiang,50,1000,17,26,0,1
2204,2124,Zhou Liangzhang,55,1000,12,26,0,1
2205,2124,Zhu Xingming,51,1000,12,26,0,1
2206,2124,Zhuo Jun,52,1000,12,1,1,0


In [66]:
df_cleaned.dtypes

rank           int64
personName    object
age            int64
finalWorth     int64
category       int64
country        int64
gender_F       uint8
gender_M       uint8
dtype: object

In [67]:
# saving cleaned data to csv
file_path = "Resources/cleaned_2018_billionaire.csv"
df_cleaned.to_csv(file_path, index=False)
