In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, fname) for fname in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# --- Data handling ---
import numpy as np
import pandas as pd

# --- Static plotting ---
import matplotlib.pyplot as plt
# The two style sheets are applied one after the other, in this order.
plt.style.use('fivethirtyeight')
plt.style.use('dark_background')
import seaborn as sns

# --- Notebook widgets and rich display ---
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed
from IPython.display import display

# --- Interactive plotting ---
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Silence all warnings for the rest of the session (set after every import has run).
import warnings
warnings.filterwarnings('ignore')

**Import Data**

In [None]:
# Load the semicolon-delimited CSV from the Kaggle input directory.
data = pd.read_csv(
    "/kaggle/input/500-richest-people-2021/500 richest people 2021.csv",
    delimiter=";",
)
display(data.head())
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would render an extra "None" in the output.
data.info()

As we can see, this data needs a lot of cleaning: there are null values, null rows and, more importantly, special characters in the "Total Net Worth" column, which will cause problems when we try to plot the values.

__Data Cleaning__

In [None]:
# Clean the raw frame. Each step returns a new frame and tolerates labels that
# are already gone, so re-running this cell on cleaned data does not raise.

# Drop the unnamed columns at the end.
data = data.drop(
    columns=["Unnamed: 7", "Unnamed: 8", "Unnamed: 9", "Unnamed: 10"],
    errors="ignore",
)
# Drop the null rows at the end.
data = data.drop(index=[499, 500, 501, 502], errors="ignore")
# Replace null values with 0.
data = data.fillna(0)
# Remove dollar signs and every space from all string cells. Multi-word values
# lose their spaces too, which is why a later cell matches "UnitedStates".
# The pattern is a raw string: "\$" is not a valid escape in a normal literal.
data = data.replace([r"\$", " "], "", regex=True)
# In some rows the country "Canada" is misspelled as "Canda".
data = data.replace("Canda", "Canada", regex=True)
# Remove the billion symbol "B", then convert the column from string to float;
# values that still cannot be parsed become NaN (errors="coerce").
data["Total Net Worth"] = data["Total Net Worth"].replace("B", "", regex=True)
data["Total Net Worth"] = pd.to_numeric(data["Total Net Worth"], errors="coerce")
display(data.head())

# EDA

Now that we have cleaned the data, let's plot the generalised **Total Net Worth** for all the people on the list.

In [None]:
# Total Net Worth
# Left subplot: net worth against rank (line). Right subplot: spread of net worth (violin).
# make_subplots already returns a Figure, so it is used directly.
TNW = make_subplots(
    rows=1,
    cols=2,
    column_widths=[0.75, 0.25],
    horizontal_spacing=0.1,
    specs=[[{}, {}]],
    # One title per subplot; the distribution plot is a separate seaborn figure below.
    subplot_titles=("Line Plot", "Violin Plot"),
)
TNW.add_trace(
    go.Scatter(
        x=data["Rank"],
        y=data["Total Net Worth"],
        mode="lines",
        name="Line Plot",
        marker=dict(color='cyan'),
    ),
    row=1, col=1,
)
TNW.add_trace(
    go.Violin(
        y=data['Total Net Worth'],
        box_visible=True,
        line_color='black',
        meanline_visible=True,
        fillcolor='cyan',
        opacity=0.6,
        name=" ",
    ),
    row=1, col=2,
)
# Only the line plot has rank on its x-axis; the violin trace is given no x data,
# so its subplot gets no "Rank" title.
TNW.update_xaxes(title_text="Rank", row=1, col=1)
TNW.update_yaxes(title_text="Net Worth ($B)", row=1, col=1)
TNW.update_yaxes(title_text="Net Worth ($B)", row=1, col=2)

TNW.update_layout(
    title_text="Total Net Worth Of Billionaires",
    paper_bgcolor='grey',
    margin=dict(l=20, r=10, b=50, t=50),
    height=500,
    width=1200,
)
TNW.show()

# Distribution plot
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is
# the replacement suggested by seaborn (cut=3 matches distplot's KDE extent).
plt.figure(figsize=(20, 6))
sns.histplot(data["Total Net Worth"], kde=True, stat="density", kde_kws=dict(cut=3), color="cyan")
plt.title("Distribution Plot")
plt.show()

# Share of all rows whose net worth is strictly below 50 (NaN values count as "not below").
below_50_count = int((data["Total Net Worth"] < 50).sum())
print("Percentage of people having net worth below $50B: {}%".format(round((below_50_count / len(data)) * 100, 2)))

The three plots tell us the same story, that most people on the list (94.59%) have a net worth less than $50B.

This means that out of the 500 people on the list, 474 have a net worth below $50B. So it takes a lot of hard work to get over that mark, and only a few (26 people) have been able to do it.

**Namely, only the following people have a net worth above $50B:** 

**How cool is it to be among the richest 5% in the world🤑🤯**



In [None]:
# Show every row whose net worth is at least 50 (in $B).
at_least_50b = data["Total Net Worth"].ge(50)
display(data.loc[at_least_50b])

**Let's plot more specific data now**

First we will plot the 15 richest people in the world with their total net worth and country:

In [None]:
# 15 richest people in the world
print("This barplot shows the top 15 richest people in the world\n")

# First 15 rows of the cleaned frame.
# NOTE(review): assumes the file is ordered by rank - confirm.
top15 = data.head(15)

plt.figure(figsize=(20, 8))
# Bars are coloured by the palette only; the frame passed as `data` is the one
# the columns actually come from.
A = sns.barplot(data=top15, x="Name", y="Total Net Worth", palette="Wistia_r", saturation=0.7)
# Overlay a line through the bar tops.
plt.plot(top15["Name"], top15["Total Net Worth"], color="lightyellow")
plt.ylabel("Net worth in ($Billion)")
plt.xticks(rotation=30)
plt.title("15 richest people in 2021")
plt.tight_layout()
plt.show()

**Which industry has the most Billionaires??**

1. The bar plot tells us that working in the **Technology** industry gives you the best chance of becoming rich. Up to **16.6%** of the people in the top 500 list work in the Technology industry, followed by **11.2%** in the Industrial sector. 

2. The joint plot can be used to quickly visualize and analyze the relationship between two variables and describe their individual distributions on the same plot. Along with the scatter plot of the industry wise net worth, this plot also displays the bar plot for the same on the right, along with the distribution plot of the net worths on the top.

In [None]:
# Industry Wise
# Count people per industry once and reuse the result everywhere below.
industry_counts = data["Industry"].value_counts()
display(industry_counts)
print("\n\nClassifying billionaires on the basis of the Industry they work in: \n\n")

# Two-column frame (Industry, Count) so the bar plot reads both axes from one source.
industry_df = industry_counts.rename_axis("Industry").reset_index(name="Count")
total_people = len(data)

plt.figure(figsize=(20, 7))
B = sns.barplot(data=industry_df, x="Industry", y="Count", palette="flare_r", saturation=1)
B.spines['bottom'].set_linewidth(1.5)
for side in ['right', 'top', 'left']:
    B.spines[side].set_visible(False)
plt.xlabel("Industry")
plt.xticks(rotation=10)
plt.ylabel("Count")
plt.title("Richest people Industry Wise")

# Write each industry's share above its bar. The share is computed from the
# actual number of rows rather than a fixed total, and bars are paired with
# counts by position (both follow the value_counts order).
for bar, count in zip(B.patches, industry_counts):
    share = round(count / total_people * 100, 2)
    plt.text(
        bar.get_x() + (bar.get_width() / 2 - 0.2),
        bar.get_height() + 2,
        "{}%".format(share),
        fontname='monospace', fontsize=18, color='violet',
    )
plt.show()

# Joint Plot
print("                                        Joint Plot: \n")
sns.jointplot(data=data, y="Industry", x="Total Net Worth", color="red", height=8, ratio=3)
plt.show()

**Which country has the most billionaires??**

1. The bar plot displays the top 10 countries with the most billionaires. It is no surprise to see the United States on top with 157 of the 500, followed by China with 77 billionaires.

2. The pie plot clearly shows that the USA dominates the list, with Americans making up 41% of the billionaires from the top 10 countries.

In [None]:
# Country Wise
# Ten most frequent countries, computed once and reused by both charts.
top_countries = data["Country"].value_counts().head(10)
display(top_countries)

# Two-column frame (Country, Count) for the bar plot.
country_df = top_countries.rename_axis("Country").reset_index(name="Count")

plt.figure(figsize=(20, 7))
C = sns.barplot(data=country_df, x="Country", y="Count", color="#00FFFF")
C.spines['bottom'].set_linewidth(1.5)
for side in ['right', 'top', 'left']:
    C.spines[side].set_visible(False)
plt.xlabel("Country")
plt.ylabel("Count")
plt.title("Countries with most billionaires")

# Text on bars: pair each bar with its count by position instead of looking the
# count up with an integer key on a string-indexed Series.
for bar, count in zip(C.patches, top_countries):
    plt.text(
        bar.get_x() + (bar.get_width() / 2 - 0.1),
        bar.get_height() + 2,
        count,
        fontname='monospace', fontsize=18, color='#98FB98',
    )
plt.show()

# Pie chart of the same counts; every wedge is pulled out by the same offset,
# sized to however many countries are actually present.
plt.pie(
    top_countries.values,
    radius=2,
    labels=top_countries.index,
    rotatelabels=True,
    shadow=True,
    autopct='%1.1f%%',
    pctdistance=0.8,
    explode=[0.05] * len(top_countries),
)
plt.title("Top 10 countries", color="black", fontweight="bold")
plt.legend(bbox_to_anchor=(1.5, 1.5))
plt.show()

# **Country Wise Analysis**

I have made an interactive widget to view the overall statistics of any country: just choose the country you want from the dropdown menu and the data will appear. 

> **NOTE: Because Kaggle does not support real-time updates in its notebooks, you will have to use the "Copy and edit Notebook" option and run the notebook yourself in your own kernel for the interactive widget to work. Or you can just download the code and run it.**

In [None]:
def f(x):
    """Display the current dropdown selection and return it unchanged.

    The returned value is what a later cell reads back through `P.result`.
    """
    display(x)
    return x

# Dropdown listing every distinct country in the data, preselected on India.
country_dropdown = widgets.Dropdown(
    options=data["Country"].unique(),
    value='India',
    description='Country: ',
    disabled=False,
)
P = interactive(f, x=country_dropdown)
print("Select a country to view data:")
display(P)

**1. The bar plot will display the 20 richest people from the selected country along with their world rank and industry they work in.**

**2. The line plot displays the trend of the world rank for the billionaires of that respective country.**

**3. The violin plot displays the overall spread of the net worth for the billionaires of that country.**

In [None]:
# Country chosen in the dropdown of the previous cell.
nation = P.result

# Filter once; every table and plot below works from this frame.
country_rows = data[data["Country"] == nation]

print("Richest people in {}".format(nation))
display(country_rows.reset_index())

# First 20 rows for the selected country, re-indexed from 0.
top20 = country_rows.head(20).reset_index()

plt.figure(figsize=(15, 7))
b = sns.barplot(data=top20, y='Name', x='Total Net Worth', color='#088F8F')
plt.xticks(rotation=45)
plt.xlabel("Net worth in $B")
plt.suptitle("Richest people in {}".format(nation))

b.spines['left'].set_linewidth(3)
for side in ['right', 'top', 'bottom']:
    b.spines[side].set_visible(False)

# Write text on barplots: industry to the right of each bar, world rank at its
# start. Bars and rows are paired by position.
for bar, industry, world_rank in zip(b.patches, top20["Industry"], top20["Rank"]):
    plt.text(bar.get_width() + 1, (bar.get_y() + bar.get_height() - 0.2), industry,
             fontname='monospace', fontsize=14, color='#088F8F')
    plt.text(0.5, (bar.get_y() + bar.get_height() - 0.1), "WR:{}".format(world_rank),
             fontname='monospace', fontsize=14, color='black')
plt.show()

# World rank of the same people, as a line with a marker per person.
plt.figure(figsize=(18, 7))
sns.lineplot(data=top20, y="Rank", x="Name", color="#088F8F")
plt.scatter(top20["Name"], top20["Rank"], lw=5, color="red", label="World Ranking")
plt.suptitle("Richest people in {}".format(nation))
plt.xticks(rotation=90)
plt.ylabel("World Ranking")
plt.legend()
plt.show()

# Spread of net worth over all rows of the selected country (not just the first 20).
sns.catplot(kind="violin", data=country_rows, y="Total Net Worth", palette="flare_r")
plt.title("Violin Plot for {}".format(nation))
plt.show()

# round() raises on NaN, which the mean is when the selection has no numeric
# net worth at all; report "n/a" in that case instead of failing.
avg_net_worth = country_rows["Total Net Worth"].mean()
avg_text = round(avg_net_worth) if pd.notna(avg_net_worth) else "n/a"
print("Average Net worth ($B) of {}: {}".format(nation, avg_text))

# Classification based on **Country** and **Industry**

Here, I have made another interactive widget that will enable the user to classify the data based on "Country" or "Industry".

In [None]:
def f(x):
    """Display the current dropdown selection and return it unchanged.

    The returned value is what the next cell reads back through `P.result`.
    """
    display(x)
    return x

# Dropdown offering the two columns the data can be classified by.
basis_dropdown = widgets.Dropdown(
    options=["Industry", "Country"],
    value="Industry",
    description='Basis: ',
    disabled=False,
)
P = interactive(f, x=basis_dropdown)
print("Select a basis for classification:")
display(P)

**1. The first plot shows the scatter plot of the net worth for each industry/country.**

**2. The second plot displays the average net worth of each country/industry along with the highest point marked in red.**

In [None]:
# Column chosen in the dropdown of the previous cell ("Industry" or "Country").
choice = P.result

# Swarm plot and point plot of net worth for each country/industry.
# Tick rotation and the grid are set on each figure's own axes so that both
# plots get them, not only the one created last.
for plot_kind in ("swarm", "point"):
    grid = sns.catplot(kind=plot_kind, data=data, y="Total Net Worth", x=choice, height=5, aspect=4)
    grid.ax.tick_params(axis="x", labelrotation=45)
    grid.ax.grid(True)
plt.show()

# Average net worth for each country/industry, rounded to whole billions.
# sort=False keeps the groups in order of first appearance; groups whose mean
# is NaN are dropped because they cannot be rounded or plotted.
avg_rank = (
    data.groupby(choice, sort=False)["Total Net Worth"]
    .mean()
    .dropna()
    .round()
    .astype(int)
)

# Highest rounded average and every group that reaches it (ties are possible
# after rounding).
highest = avg_rank.max()
res = avg_rank.index[avg_rank == highest].tolist()

print("The average net worth of each {} is: \n".format(choice))

# Plot the stats
plt.figure(figsize=(18, 6))
plt.plot(avg_rank.index.tolist(), avg_rank.tolist(), color="#088F8F")
# One y value per highlighted group, so x and y have equal length even on a tie.
plt.scatter(res, [highest] * len(res), color="red", lw=5, label="Highest: {}".format(res))
plt.title("Average Net Worth {} Wise".format(choice))
plt.xticks(rotation=90)
plt.xlabel("{}".format(choice))
plt.ylabel("Average Net Worth ($B) ")

plt.gca().spines['bottom'].set_linewidth(3)
for side in ['right', 'top', 'left']:
    plt.gca().spines[side].set_visible(False)
plt.legend()
plt.grid(True)
plt.show()

**Now we will plot the data for only the top countries and industries for better understanding.**

In [None]:
def _rows_for(column, values):
    """Return the rows of `data` whose `column` equals each of `values`, stacked in the order given."""
    return pd.concat([data[data[column] == value] for value in values])


def _plot_top_groups(frame, group_col, hue_col, title):
    """Draw the swarm, violin and pair plots of `frame` grouped by `group_col`.

    frame     -- subset of the data to plot
    group_col -- column on the x-axis of the categorical plots and the pair-plot hue
    hue_col   -- column used to colour the swarm-plot points
    title     -- figure title placed on the swarm plot
    """
    sns.catplot(kind="swarm", data=frame, y="Total Net Worth", x=group_col,
                height=5, aspect=4, palette="terrain", hue=hue_col)
    plt.suptitle(title)
    sns.catplot(kind="violin", data=frame, y="Total Net Worth", x=group_col,
                height=5, aspect=4, palette="gist_earth_r")
    sns.pairplot(data=frame, hue=group_col, height=6, aspect=2, palette="twilight_r")
    # NOTE(review): these pyplot calls act on the current axes, which at this
    # point belongs to the pair plot rather than the categorical plots - confirm
    # that is intended.
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.show()


# Create new dataframe for top countries (country names carry no spaces after cleaning).
# NOTE(review): China, second by count according to the country section of this
# notebook, is not part of this selection - confirm the intended list.
new_df1 = _rows_for("Country", ["Spain", "France", "UnitedStates", "India", "Mexico", "Japan"])

# Create new dataframe for top industries
new_df2 = _rows_for("Industry", ["Technology", "Consumer", "Retail", "Diversified", "Industrial"])

print("\n\nTOP COUNTRIES: \n\n")
# Visualize data for top countries
_plot_top_groups(new_df1, group_col="Country", hue_col="Industry", title="TOP 6 COUNTRIES")

print("\n\nTOP INDUSTRIES: \n\n")
# Visualize data for top Industries
_plot_top_groups(new_df2, group_col="Industry", hue_col="Country", title="TOP 5 INDUSTRIES")

# Conclusion
1. To have the highest chance of being a billionaire, you have to be an American working in the Technology industry xD

-----------------------------------------------------------------------------------------------------------

**This notebook was basically to show different kinds of visualizations and plots using Seaborn, matplotlib, and Plotly libraries.**

**If you liked the notebook or found it useful, please do upvote.**

**If you have any suggestions or doubts, feel free to comment below.**

# Thank You!