In [1]:
# import dependencies
from pathlib import Path
import pandas as pd


In [2]:
# Load the Forbes 2022 billionaires CSV (path relative to the notebook) into a DataFrame.
data = Path('Resources/forbes_2022_billionaires.csv')
df = pd.read_csv(filepath_or_buffer=data)
# Preview the first five rows to sanity-check the load.
df.head(5)

Unnamed: 0,rank,personName,age,finalWorth,year,month,category,source,country,state,...,organization,selfMade,gender,birthDate,title,philanthropyScore,residenceMsa,numberOfSiblings,bio,about
0,1,Elon Musk,50.0,219000,2022,4,Automotive,"Tesla, SpaceX",United States,Texas,...,Tesla,True,M,6/28/1971,CEO,1.0,,,Elon Musk is working to revolutionize transpor...,Musk was accepted to a graduate program at Sta...
1,2,Jeff Bezos,58.0,171000,2022,4,Technology,Amazon,United States,Washington,...,Amazon,True,M,1/12/1964,Entrepreneur,1.0,"Seattle-Tacoma-Bellevue, WA",,Jeff Bezos founded e-commerce giant Amazon in ...,"Growing up, Jeff Bezos worked summers on his g..."
2,3,Bernard Arnault & family,73.0,158000,2022,4,Fashion & Retail,LVMH,France,,...,LVMH Moët Hennessy Louis Vuitton,False,M,3/5/1949,Chairman and CEO,,,,Bernard Arnault oversees the LVMH empire of so...,"Arnault apparently wooed his wife, Helene Merc..."
3,4,Bill Gates,66.0,129000,2022,4,Technology,Microsoft,United States,Washington,...,Bill & Melinda Gates Foundation,True,M,10/28/1955,Cofounder,4.0,"Seattle-Tacoma-Bellevue, WA",,Bill Gates turned his fortune from software fi...,"When Gates was a kid, he spent so much time re..."
4,5,Warren Buffett,91.0,118000,2022,4,Finance & Investments,Berkshire Hathaway,United States,Nebraska,...,Berkshire Hathaway,True,M,8/30/1930,CEO,5.0,"Omaha, NE",,"Known as the ""Oracle of Omaha,"" Warren Buffett...","Buffett still lives in the same Omaha, Nebrask..."


In [3]:
# Display the column labels of df (a pandas Index) to see which fields the file provides.
df.columns


Index(['rank', 'personName', 'age', 'finalWorth', 'year', 'month', 'category',
       'source', 'country', 'state', 'city', 'countryOfCitizenship',
       'organization', 'selfMade', 'gender', 'birthDate', 'title',
       'philanthropyScore', 'residenceMsa', 'numberOfSiblings', 'bio',
       'about'],
      dtype='object')

In [4]:
# Drop the columns that are not needed for the analysis: 'year', 'month',
# 'countryOfCitizenship', 'source', 'state', 'city', 'organization',
# 'philanthropyScore', 'residenceMsa', 'numberOfSiblings', 'bio', 'about',
# 'title' and 'birthDate'.
cols = [
    'year', 'month', 'countryOfCitizenship', 'source', 'state', 'city',
    'organization', 'philanthropyScore', 'residenceMsa', 'numberOfSiblings',
    'bio', 'about', 'title', 'birthDate',
]

# df is rebound to the trimmed frame, so on a second run of this cell the
# columns are already gone. errors='ignore' drops only the labels that still
# exist, making a re-run a no-op instead of raising KeyError.
# NOTE: a misspelled name in cols is skipped silently as well - check the
# preview below to confirm the expected columns were removed.
df = df.drop(columns=cols, errors='ignore')
df.head()

Unnamed: 0,rank,personName,age,finalWorth,category,country,selfMade,gender
0,1,Elon Musk,50.0,219000,Automotive,United States,True,M
1,2,Jeff Bezos,58.0,171000,Technology,United States,True,M
2,3,Bernard Arnault & family,73.0,158000,Fashion & Retail,France,False,M
3,4,Bill Gates,66.0,129000,Technology,United States,True,M
4,5,Warren Buffett,91.0,118000,Finance & Investments,United States,True,M


In [5]:
# Display the dtype pandas inferred for each column still present in df.
df.dtypes


rank            int64
personName     object
age           float64
finalWorth      int64
category       object
country        object
selfMade         bool
gender         object
dtype: object

In [6]:
# Count the non-null values in each column (DataFrame.count skips missing
# entries), so a column whose total is below the row count contains missing data.
df.count()

rank          2668
personName    2668
age           2582
finalWorth    2668
category      2668
country       2668
selfMade      2668
gender        2652
dtype: int64

In [7]:
# Report how many missing values each column contains.
null_counts = df.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"Column {column} has {n_missing} null values")


Column rank has 0 null values
Column personName has 0 null values
Column age has 86 null values
Column finalWorth has 0 null values
Column category has 0 null values
Column country has 0 null values
Column selfMade has 0 null values
Column gender has 16 null values


In [8]:
# Build a new frame without the rows that have a missing value in any column;
# df itself is left unchanged.
df_cleaned = df.dropna(axis='index', how='any')
df_cleaned


Unnamed: 0,rank,personName,age,finalWorth,category,country,selfMade,gender
0,1,Elon Musk,50.0,219000,Automotive,United States,True,M
1,2,Jeff Bezos,58.0,171000,Technology,United States,True,M
2,3,Bernard Arnault & family,73.0,158000,Fashion & Retail,France,False,M
3,4,Bill Gates,66.0,129000,Technology,United States,True,M
4,5,Warren Buffett,91.0,118000,Finance & Investments,United States,True,M
...,...,...,...,...,...,...,...,...
2663,2578,Zhang Yuqiang,66.0,1000,Manufacturing,China,True,M
2664,2578,Zhou Ruxin,59.0,1000,Technology,China,True,M
2665,2578,Wen Zhou & family,57.0,1000,Manufacturing,China,True,M
2666,2578,Zhou Yifeng & family,43.0,1000,Energy,China,True,F


In [9]:
# check for duplicates
# duplicated() flags every row that is an exact copy of an earlier row (the
# same rule drop_duplicates() uses by default), so a sum of zero means the
# frame has no duplicate rows. The assert halts the notebook if that ever
# stops being true, because the next step saves df_cleaned as-is.
n_duplicates = int(df_cleaned.duplicated().sum())
assert n_duplicates == 0, (
    f"{n_duplicates} duplicated rows found in df_cleaned; "
    "apply drop_duplicates() before saving"
)

# Display the de-duplicated view for visual confirmation (result is not stored).
df_cleaned.drop_duplicates()

# Note: when comparing drop_duplicates dataframe with the above dataframe,
# there are the same number of rows - there are no duplicates in the data.


Unnamed: 0,rank,personName,age,finalWorth,category,country,selfMade,gender
0,1,Elon Musk,50.0,219000,Automotive,United States,True,M
1,2,Jeff Bezos,58.0,171000,Technology,United States,True,M
2,3,Bernard Arnault & family,73.0,158000,Fashion & Retail,France,False,M
3,4,Bill Gates,66.0,129000,Technology,United States,True,M
4,5,Warren Buffett,91.0,118000,Finance & Investments,United States,True,M
...,...,...,...,...,...,...,...,...
2663,2578,Zhang Yuqiang,66.0,1000,Manufacturing,China,True,M
2664,2578,Zhou Ruxin,59.0,1000,Technology,China,True,M
2665,2578,Wen Zhou & family,57.0,1000,Manufacturing,China,True,M
2666,2578,Zhou Yifeng & family,43.0,1000,Energy,China,True,F


In [10]:
# Write the cleaned frame to CSV; index=False keeps the row index out of the file.
file_path = "Resources/cleaned_billionaire.csv"
df_cleaned.to_csv(path_or_buf=file_path, index=False)

