In [1]:
# import dependencies
from pathlib import Path
import pandas as pd


In [2]:
# read the forbes_2022_billionaires.csv file into a dataframe
data = Path('Resources/forbes_2018_billionaires.csv')
df = pd.read_csv(data)
df.head()


Unnamed: 0,Rank,Name,Age,Source,Industry,Gender,Continent,Country,Headquarters,State,Net Worth,Title
0,1.0,Jeff Bezos,54.0,Amazon,Technology,Male,North America,United States,WA,Washington,112.0,"CEO and Founder, Amazon.com"
1,2.0,Bill Gates,62.0,Microsoft,Technology,Male,North America,United States,WA,Washington,90.0,"Cofounder, Bill & Melinda Gates Foundation"
2,3.0,Warren Buffett,87.0,Berkshire Hathaway,Finance and Investments,Male,North America,United States,NE,Nebraska,84.0,"CEO, Berkshire Hathaway"
3,4.0,Bernard Arnault,69.0,LVMH,Fashion & Retail,Male,Europe,France,,,72.0,"Chairman and CEO, LVMH Moet Hennessy Louis Vui..."
4,5.0,Mark Zuckerberg,33.0,Facebook,Technology,Male,North America,United States,CA,California,71.0,"Cofounder, Chairman and CEO, Facebook"


In [3]:
# print column names
df.columns


Index(['Rank', 'Name', 'Age', 'Source', 'Industry', 'Gender', 'Continent',
       'Country', 'Headquarters', 'State', 'Net Worth', 'Title'],
      dtype='object')

In [4]:
# drop unnecessary columns ['year', 'month', 'countryOfCitizenship', 'source', 'state', 'city', 'organization', 'philanthropyScore', 'residenceMsa', 'numberOfSiblings', 'bio', 'about', 'title']
cols = ['Continent', 'Source', 'State', 'Headquarters', 'Title']

df = df.drop(columns=cols)
df.head()


Unnamed: 0,Rank,Name,Age,Industry,Gender,Country,Net Worth
0,1.0,Jeff Bezos,54.0,Technology,Male,United States,112.0
1,2.0,Bill Gates,62.0,Technology,Male,United States,90.0
2,3.0,Warren Buffett,87.0,Finance and Investments,Male,United States,84.0
3,4.0,Bernard Arnault,69.0,Fashion & Retail,Male,France,72.0
4,5.0,Mark Zuckerberg,33.0,Technology,Male,United States,71.0


In [5]:
# Rename columns
df = df.rename(columns={"Rank":"rank", "Name":"personName", "Age":"age", "Industry":"category", "Gender":"gender", "Country":"country", "Net Worth":"finalWorth"})
df.head()


Unnamed: 0,rank,personName,age,category,gender,country,finalWorth
0,1.0,Jeff Bezos,54.0,Technology,Male,United States,112.0
1,2.0,Bill Gates,62.0,Technology,Male,United States,90.0
2,3.0,Warren Buffett,87.0,Finance and Investments,Male,United States,84.0
3,4.0,Bernard Arnault,69.0,Fashion & Retail,Male,France,72.0
4,5.0,Mark Zuckerberg,33.0,Technology,Male,United States,71.0


In [6]:
# Reorder columns
df = df[["rank", "personName", "age", "finalWorth", "category", "country","gender"]]
df.head()


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender
0,1.0,Jeff Bezos,54.0,112.0,Technology,United States,Male
1,2.0,Bill Gates,62.0,90.0,Technology,United States,Male
2,3.0,Warren Buffett,87.0,84.0,Finance and Investments,United States,Male
3,4.0,Bernard Arnault,69.0,72.0,Fashion & Retail,France,Male
4,5.0,Mark Zuckerberg,33.0,71.0,Technology,United States,Male


In [7]:
# Replace values in the 'category' column: Change 'and' to '&'
df["category"] = [x.replace("and","&") for x in df["category"]]

df.head()


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender
0,1.0,Jeff Bezos,54.0,112.0,Technology,United States,Male
1,2.0,Bill Gates,62.0,90.0,Technology,United States,Male
2,3.0,Warren Buffett,87.0,84.0,Finance & Investments,United States,Male
3,4.0,Bernard Arnault,69.0,72.0,Fashion & Retail,France,Male
4,5.0,Mark Zuckerberg,33.0,71.0,Technology,United States,Male


In [8]:
# Drop NaN values
df_cleaned = df.dropna()
df_cleaned

Unnamed: 0,rank,personName,age,finalWorth,category,country,gender
0,1.0,Jeff Bezos,54.0,112.0,Technology,United States,Male
1,2.0,Bill Gates,62.0,90.0,Technology,United States,Male
2,3.0,Warren Buffett,87.0,84.0,Finance & Investments,United States,Male
3,4.0,Bernard Arnault,69.0,72.0,Fashion & Retail,France,Male
4,5.0,Mark Zuckerberg,33.0,71.0,Technology,United States,Male
...,...,...,...,...,...,...,...
2203,2124.0,Zhao Xiaoqiang,50.0,1.0,Fashion & Retail,China,Male
2204,2124.0,Zhou Liangzhang,55.0,1.0,Manufacturing,China,Male
2205,2124.0,Zhu Xingming,51.0,1.0,Manufacturing,China,Male
2206,2124.0,Zhuo Jun,52.0,1.0,Manufacturing,Hong Kong,Female


In [9]:
# Check data types
df_cleaned.dtypes


rank          float64
personName     object
age           float64
finalWorth    float64
category       object
country        object
gender         object
dtype: object

In [10]:
# Change rank and finalWorth into int64 data types
df_cleaned[["rank", "finalWorth"]] = df_cleaned[["rank", "finalWorth"]].astype('int64')
df_cleaned.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[["rank", "finalWorth"]] = df_cleaned[["rank", "finalWorth"]].astype('int64')


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender
0,1,Jeff Bezos,54.0,112,Technology,United States,Male
1,2,Bill Gates,62.0,90,Technology,United States,Male
2,3,Warren Buffett,87.0,84,Finance & Investments,United States,Male
3,4,Bernard Arnault,69.0,72,Fashion & Retail,France,Male
4,5,Mark Zuckerberg,33.0,71,Technology,United States,Male


In [11]:
# x1000 for each value in finalWorth to standardize the values between the 2018 and 2022 data
df_cleaned["finalWorth"] = [x * 1000 for x in df_cleaned["finalWorth"]]
df_cleaned.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned["finalWorth"] = [x * 1000 for x in df_cleaned["finalWorth"]]


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender
0,1,Jeff Bezos,54.0,112000,Technology,United States,Male
1,2,Bill Gates,62.0,90000,Technology,United States,Male
2,3,Warren Buffett,87.0,84000,Finance & Investments,United States,Male
3,4,Bernard Arnault,69.0,72000,Fashion & Retail,France,Male
4,5,Mark Zuckerberg,33.0,71000,Technology,United States,Male


In [12]:
df_cleaned["gender"] = [x[0] if x == "Male" else x[0] for x in df_cleaned["gender"]]
df_cleaned.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned["gender"] = [x[0] if x == "Male" else x[0] for x in df_cleaned["gender"]]


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender
0,1,Jeff Bezos,54.0,112000,Technology,United States,M
1,2,Bill Gates,62.0,90000,Technology,United States,M
2,3,Warren Buffett,87.0,84000,Finance & Investments,United States,M
3,4,Bernard Arnault,69.0,72000,Fashion & Retail,France,M
4,5,Mark Zuckerberg,33.0,71000,Technology,United States,M


In [13]:
# check for duplicates
df_cleaned.drop_duplicates()

# Note: when comparing drop_duplicates dataframe with the above dataframe,
# there are the same number of rows - there are no duplicates in the data.


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender
0,1,Jeff Bezos,54.0,112000,Technology,United States,M
1,2,Bill Gates,62.0,90000,Technology,United States,M
2,3,Warren Buffett,87.0,84000,Finance & Investments,United States,M
3,4,Bernard Arnault,69.0,72000,Fashion & Retail,France,M
4,5,Mark Zuckerberg,33.0,71000,Technology,United States,M
...,...,...,...,...,...,...,...
2203,2124,Zhao Xiaoqiang,50.0,1000,Fashion & Retail,China,M
2204,2124,Zhou Liangzhang,55.0,1000,Manufacturing,China,M
2205,2124,Zhu Xingming,51.0,1000,Manufacturing,China,M
2206,2124,Zhuo Jun,52.0,1000,Manufacturing,Hong Kong,F


In [14]:
# saving cleaned data to csv

file_path = "Resources/cleaned_2018_billionaire.csv"
df_cleaned.to_csv(file_path, index=False)
