In [17]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
# Read the Pantheon project database CSV into the working frame `df`.
DATABASE_CSV = "../input/pantheon-project/database.csv"
df = pd.read_csv(DATABASE_CSV)

In [19]:
# Print the column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

In [20]:
# Peek at the first five rows to get a feel for the columns.
df.head(n=5)

In [21]:
# Preview the frame keyed by birth year. The result is only displayed;
# `df` itself keeps its original index.
df.set_index(keys="birth_year")

In [22]:
# Preview the frame keyed by industry. The result is only displayed;
# `df` itself keeps its original index.
df.set_index(keys="industry")

In [23]:
# Display only the rows whose continent is Europe.
is_european = df["continent"].eq("Europe")
df.loc[is_european]

In [24]:
# Show the rows in chronological order of birth year (display only; `df` is
# not modified). The ordering is computed on a numeric view of the column so
# that it stays chronological even if birth_year is still stored as text at
# this point -- a plain sort on text would be lexicographic (e.g. "1000"
# before "999"). Values that cannot be parsed as numbers are placed last.
birth_year_order = (
    pd.to_numeric(df["birth_year"], errors="coerce")
      .sort_values()
      .index
)
df.loc[birth_year_order]

In [25]:
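# Disabled export of a `dfex` frame (no `dfex` is defined in the cells above).
# Note: the target name './dfex.cv' was probably meant to be './dfex.csv'.
# dfex.to_csv(r'./dfex.cv', index=False)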

In [26]:
# Convert birth_year to a numeric column: entries that cannot be parsed become
# NaN and are then replaced by 0.
# NOTE(review): the 0 acts as a sentinel for "unknown" and will show up as a
# real year 0 in any plot of birth_year -- confirm this is intended.
birth_year_numeric = pd.to_numeric(df['birth_year'], errors='coerce')
df['birth_year'] = birth_year_numeric.fillna(0)

In [27]:
# Print the frame summary again to check the dtype of birth_year after the conversion above.
df.info()

In [28]:
# Kernel density estimate of the birth-year distribution.
birth_years = df['birth_year']
sns.kdeplot(birth_years)

In [29]:
# Joint plot of article_languages against birth_year, drawn with
# seaborn's regression kind.
sns.jointplot(
    data=df,
    x='birth_year',
    y='article_languages',
    kind='reg',
)

In [30]:
# Number of rows per continent.
sns.countplot(data=df, x='continent')

In [31]:
# Bar chart of the ten most frequent countries, most common first.
top_countries = df['country'].value_counts().head(10).index
plt.figure(figsize=(14, 9))
sns.countplot(data=df, x='country', order=top_countries)

In [83]:
# Row count per continent, largest first.
df['continent'].value_counts()

In [33]:
# Bar chart of the (up to) ten most frequent domains, most common first.
top_domains = df['domain'].value_counts().head(10).index
plt.figure(figsize=(14, 9))
sns.countplot(data=df, x='domain', order=top_domains)

In [34]:
# Plan: there will be multiple eras; in every era, find the most dominant continents, countries, domains, genders and industries
# avg. popularity index
# which countries are leading in which domains
# influence of domains, continents and countries on the overall total historical score and frequency
# can take inspiration from 2 other notebooks written in ggplot, and for the era analysis from the 20th-century analysis

In [94]:
# Distinct continent values present in the data.
df['continent'].unique()

# So let's start with a brief look at the entire dataset!

In [36]:
# Joint plot of historical popularity index against birth year, coloured by domain.
sns.jointplot(
    data=df,
    x='birth_year',
    y='historical_popularity_index',
    hue="domain",
)

By looking at the above jointplot, we can observe some relations between popularity index, birth year and domain. We can clearly see the domination of the *Institutions* domain. Some domains, like Sports and Exploration, are quite low in popularity index, and domains like Public Figure came into existence fairly late.
Now, let's see how geography has affected mankind.

In [113]:
def my_fmt(x, total=11341):
    """Format a pie-wedge percentage as two lines: the percentage and the count.

    Parameters
    ----------
    x : float
        Wedge size in percent (0-100), as handed to an ``autopct`` callable.
    total : number, optional
        Number of records that corresponds to 100%. Defaults to 11341, the
        value that used to be hard-coded, so existing one-argument calls
        behave exactly as before.

    Returns
    -------
    str
        The percentage with four decimals and a percent sign, a newline, and
        the rounded absolute count in parentheses.
    """
    return '{:.4f}%\n({:.0f})'.format(x, total*x/100)

plt.figure(figsize=(14,9))
data = df.continent.value_counts()
# Take the labels from the counts themselves so that every wedge is named
# after the value it actually represents, whatever the ordering, spelling or
# number of continents in the data.
labels = data.index.tolist()
# Ask for one colour per wedge; the default palette size is smaller than the
# number of continents, which made the first and last (adjacent) wedges share
# a colour.
colors = sns.color_palette('husl', n_colors=len(data))
# Derive the absolute counts from the plotted total instead of a constant.
plt.pie(data, labels=labels, colors=colors,
        autopct=lambda pct: my_fmt(pct, total=data.sum()), frame=True)
plt.show()

In [120]:
def show_values(axs, orient="v", space=.01):
    """Write each bar's numeric value next to it, on one axes or on many.

    Parameters
    ----------
    axs : axes-like object or numpy.ndarray of them
        Every axes must expose ``patches`` (objects with ``get_x``, ``get_y``,
        ``get_width`` and ``get_height``) and a ``text`` method.
    orient : str
        "v": label each patch with its height, centred horizontally and placed
        1% of the height above the patch top.
        "h": label each patch with its width, left-aligned ``space`` to the
        right of the patch end and vertically centred.
        Any other value leaves the axes untouched.
    space : float
        Gap added after the end of the bar; only used when ``orient`` is "h".
    """
    def _annotate_vertical(axis):
        for bar in axis.patches:
            height = bar.get_height()
            x_mid = bar.get_x() + bar.get_width() / 2
            y_top = bar.get_y() + height + (height * 0.01)
            axis.text(x_mid, y_top, '{:.1f}'.format(height), ha="center")

    def _annotate_horizontal(axis):
        for bar in axis.patches:
            width = bar.get_width()
            height = bar.get_height()
            x_end = bar.get_x() + width + float(space)
            y_mid = bar.get_y() + height - (height * 0.5)
            axis.text(x_end, y_mid, '{:.1f}'.format(width), ha="left")

    # A numpy array of axes (any shape) is visited element by element;
    # anything else is treated as a single axes.
    targets = axs.flat if isinstance(axs, np.ndarray) else [axs]
    for axis in targets:
        if orient == "v":
            _annotate_vertical(axis)
        elif orient == "h":
            _annotate_horizontal(axis)

# Stacked horizontal histogram: number of people per continent, split by domain.
plt.figure(figsize=(14, 9))
p = sns.histplot(
    data=df,
    y="continent",
    hue="domain",
    multiple="stack",
    palette="hls",
    linewidth=0,
)
# show_values(p, "h", space=0)



Here we can see that, continent-wise, Europe is clearly leading, followed by North America and Asia. Oceania is in last place, as it was discovered very late in history. Domain-wise, every continent except North America has *Institutions* as its most popular domain, while North America has Arts as its leading domain. (Arts includes cinema, crafts, and music as well.)

In [135]:
# Bar chart of the fifteen most frequent industries, most common first.
top_industries = df['industry'].value_counts().head(15).index
plt.figure(figsize=(14, 9))
ax = sns.countplot(data=df, x='industry', order=top_industries)
# Slant the tick labels so that long industry names do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()

Now, after seeing North America in 3rd place, you might be wondering why, since you have probably heard of very few notable American historical figures. But you do recognise a ton of American celebrities, which suggests that the majority of the data in the dataset comes from the last 2-4 centuries, especially for North America.
Let's check it out.

In [38]:
# Stacked histogram of birth years per continent, with the x axis limited to 1000-2000.
plt.figure(figsize=(14, 9))
sns.histplot(
    data=df,
    x="birth_year",
    hue="continent",
    multiple="stack",
    palette="magma",
    linewidth=0,
)
plt.xlim(1000, 2000)

As seen in the graph, our intuition was absolutely correct. Now let's check genderwise contribution.

In [39]:
# One point per person: birth year within each continent, coloured by sex.
plt.figure(figsize=(14, 9))
sns.stripplot(
    data=df,
    x='continent',
    y='birth_year',
    hue='sex',
    palette='seismic',
)

The plot suggests that, in the dataset, the number of males is significantly higher than the number of females. However (contrary to popular belief), the contribution of females can be seen throughout the entire timeline, except for North America, where it is relatively higher in recent times.