In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Render floats with one decimal place. The format spec needs the leading
# colon: '{.1f}' is parsed as an attribute lookup on the value and raises
# AttributeError the first time a float is displayed.
pd.options.display.float_format = '{:.1f}'.format

In [2]:
# Load the baby-names CSV, print its (rows, columns) shape, then display the frame.
df = pd.read_csv('US_baby_names.csv')
print(df.shape)
df

(2052781, 4)


Unnamed: 0,Year,Names,Gender,Count
0,1880,Mary,F,7065
1,1880,Anna,F,2604
2,1880,Emma,F,2003
3,1880,Elizabeth,F,1939
4,1880,Minnie,F,1746
...,...,...,...,...
2052776,2021,Zyeire,M,5
2052777,2021,Zyel,M,5
2052778,2021,Zyian,M,5
2052779,2021,Zylar,M,5


In [3]:
# Column dtypes and reported memory usage of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2052781 entries, 0 to 2052780
Data columns (total 4 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   Year    int64 
 1   Names   object
 2   Gender  object
 3   Count   int64 
dtypes: int64(2), object(2)
memory usage: 62.6+ MB


In [4]:
# Number of distinct values in each column.
df.nunique()

Year         142
Names     101338
Gender         2
Count      13854
dtype: int64

In [5]:
# Gender holds only two distinct values, so store it as a pandas 'category'.

df['Gender'] = df['Gender'].astype('category')

In [6]:
# The category conversion lowered the reported memory usage from 62.6 MB to 48.9 MB.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2052781 entries, 0 to 2052780
Data columns (total 4 columns):
 #   Column  Dtype   
---  ------  -----   
 0   Year    int64   
 1   Names   object  
 2   Gender  category
 3   Count   int64   
dtypes: category(1), int64(2), object(1)
memory usage: 48.9+ MB


In [7]:
# Keep only the 2021 rows (as an independent copy) to look at that year's
# most popular names.

df_2021 = df.loc[df['Year'] == 2021].copy()
df_2021

Unnamed: 0,Year,Names,Gender,Count
2021244,2021,Olivia,F,17728
2021245,2021,Emma,F,15433
2021246,2021,Charlotte,F,13285
2021247,2021,Amelia,F,12952
2021248,2021,Ava,F,12759
...,...,...,...,...
2052776,2021,Zyeire,M,5
2052777,2021,Zyel,M,5
2052778,2021,Zyian,M,5
2052779,2021,Zylar,M,5


In [8]:
# Ten most frequent female names in 2021.
df_2021.loc[df_2021['Gender'] == 'F'].nlargest(n=10, columns='Count')

Unnamed: 0,Year,Names,Gender,Count
2021244,2021,Olivia,F,17728
2021245,2021,Emma,F,15433
2021246,2021,Charlotte,F,13285
2021247,2021,Amelia,F,12952
2021248,2021,Ava,F,12759
2021249,2021,Sophia,F,12496
2021250,2021,Isabella,F,11201
2021251,2021,Mia,F,11096
2021252,2021,Evelyn,F,9434
2021253,2021,Harper,F,8388


In [9]:
# Ten most frequent male names in 2021.
df_2021.loc[df_2021['Gender'] == 'M'].nlargest(n=10, columns='Count')

Unnamed: 0,Year,Names,Gender,Count
2038788,2021,Liam,M,20272
2038789,2021,Noah,M,18739
2038790,2021,Oliver,M,14616
2038791,2021,Elijah,M,12708
2038792,2021,James,M,12367
2038793,2021,William,M,12088
2038794,2021,Benjamin,M,11791
2038795,2021,Lucas,M,11501
2038796,2021,Henry,M,11307
2038797,2021,Theodore,M,9535


In [10]:
def most_commom(year, gender, n, data=None):
    """Return the ``n`` rows with the highest ``Count`` for one year and gender.

    Parameters
    ----------
    year : int
        Value matched against the ``Year`` column.
    gender : str
        Value matched against the ``Gender`` column (e.g. 'F' or 'M').
    n : int
        Number of rows to keep, ranked by ``Count`` in descending order.
    data : pandas.DataFrame, optional
        Frame to search; it must have ``Year``, ``Gender`` and ``Count``
        columns. When omitted, the notebook-level ``df`` is used, looked up
        at call time, so existing three-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The matching rows, ordered from highest to lowest ``Count``.
    """
    # Passing the frame explicitly avoids depending on whichever object the
    # global name `df` happens to be bound to when the cell is run.
    frame = df if data is None else data
    mask = (frame['Year'] == year) & (frame['Gender'] == gender)
    return frame[mask].nlargest(n, 'Count')

In [11]:
# Ten most frequent female names in 2010.
most_commom(year=2010, gender='F', n=10)

Unnamed: 0,Year,Names,Gender,Count
1658307,2010,Isabella,F,22925
1658308,2010,Sophia,F,20648
1658309,2010,Emma,F,17354
1658310,2010,Olivia,F,17030
1658311,2010,Ava,F,15436
1658312,2010,Emily,F,14278
1658313,2010,Abigail,F,14250
1658314,2010,Madison,F,13189
1658315,2010,Chloe,F,11757
1658316,2010,Mia,F,10644


Most common names that stand the test of time

In [12]:
# Ten most frequent female names in 2021.
most_commom(year=2021, gender='F', n=10)

Unnamed: 0,Year,Names,Gender,Count
2021244,2021,Olivia,F,17728
2021245,2021,Emma,F,15433
2021246,2021,Charlotte,F,13285
2021247,2021,Amelia,F,12952
2021248,2021,Ava,F,12759
2021249,2021,Sophia,F,12496
2021250,2021,Isabella,F,11201
2021251,2021,Mia,F,11096
2021252,2021,Evelyn,F,9434
2021253,2021,Harper,F,8388


In [13]:
# Ten most frequent female names in 1880.
most_commom(year=1880, gender='F', n=10)

Unnamed: 0,Year,Names,Gender,Count
0,1880,Mary,F,7065
1,1880,Anna,F,2604
2,1880,Emma,F,2003
3,1880,Elizabeth,F,1939
4,1880,Minnie,F,1746
5,1880,Margaret,F,1578
6,1880,Ida,F,1472
7,1880,Alice,F,1414
8,1880,Bertha,F,1320
9,1880,Sarah,F,1288


In [14]:
# Top-50 female names at both ends of the period, for comparison.
f_2021 = most_commom(year=2021, gender='F', n=50)
f_1880 = most_commom(year=1880, gender='F', n=50)

In [15]:
# Female names present in both top-50 lists; the overlapping Count columns
# receive the default '_x' / '_y' suffixes.
f_2021[['Names', 'Count']].merge(
    f_1880[['Names', 'Count']],
    on='Names',
    how='inner',
    suffixes=('_x', '_y'),
)

Unnamed: 0,Names,Count_x,Count_y
0,Emma,15433,2003
1,Elizabeth,7190,1939
2,Ella,6987,1156
3,Grace,5486,982
4,Lucy,4433,590


In [16]:
# Same inner join, with the Count columns labelled by year.
f_2021[['Names', 'Count']].merge(
    f_1880[['Names', 'Count']],
    on='Names',
    how='inner',
    suffixes=('_2021', '_1880'),
)

Unnamed: 0,Names,Count_2021,Count_1880
0,Emma,15433,2003
1,Elizabeth,7190,1939
2,Ella,6987,1156
3,Grace,5486,982
4,Lucy,4433,590


In [17]:
# Top-50 male names at both ends of the period, for comparison.
m_2021 = most_commom(year=2021, gender='M', n=50)
m_1880 = most_commom(year=1880, gender='M', n=50)

In [18]:
# Male names present in both top-50 lists, with Count columns labelled by year.
m_2021[['Names', 'Count']].merge(
    m_1880[['Names', 'Count']],
    on='Names',
    how='inner',
    suffixes=('_2021', '_1880'),
)

Unnamed: 0,Names,Count_2021,Count_1880
0,James,12367,5927
1,William,12088,9532
2,Benjamin,11791,490
3,Henry,11307,2444
4,Daniel,9066,643
5,Michael,9041,354
6,Samuel,8501,1024
7,Jacob,8397,404
8,John,8130,9655
9,Joseph,8067,2632


Data aggregation

In [28]:
# Re-read the CSV, rebinding `df` to a fresh frame; this discards the earlier
# conversion of Gender to 'category'.
# NOTE(review): presumably intentional — grouping on a categorical key can emit
# unobserved Names/Gender combinations unless observed=True is passed; confirm.
df = pd.read_csv('US_baby_names.csv')

In [29]:
# Total Count per (Names, Gender) pair, with Gender moved out of the index
# into a regular column.
df.groupby(['Names', 'Gender'])['Count'].sum().reset_index(level='Gender')

Unnamed: 0_level_0,Gender,Count
Names,Unnamed: 1_level_1,Unnamed: 2_level_1
Aaban,M,120
Aabha,F,51
Aabid,M,16
Aabidah,F,5
Aabir,M,10
...,...,...
Zyvion,M,5
Zyvon,M,7
Zyyanna,F,6
Zyyon,M,6


In [31]:
# One summary row per (Names, Gender): overall total, number of rows (years)
# recorded, first and last year seen, and the highest single-year count.
df_agg = df.groupby(['Names', 'Gender']).agg(
    Total=pd.NamedAgg(column='Count', aggfunc='sum'),
    Num_of_years=pd.NamedAgg(column='Year', aggfunc='count'),
    First_year=pd.NamedAgg(column='Year', aggfunc='min'),
    Last_year=pd.NamedAgg(column='Year', aggfunc='max'),
    Max_count=pd.NamedAgg(column='Count', aggfunc='max'),
)

In [32]:
# Display the per-(Names, Gender) summary.
df_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,Total,Num_of_years,First_year,Last_year,Max_count
Names,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aaban,M,120,12,2007,2019,16
Aabha,F,51,8,2011,2021,9
Aabid,M,16,3,2003,2018,6
Aabidah,F,5,1,2018,2018,5
Aabir,M,10,2,2016,2018,5
...,...,...,...,...,...,...
Zyvion,M,5,1,2009,2009,5
Zyvon,M,7,1,2015,2015,7
Zyyanna,F,6,1,2010,2010,6
Zyyon,M,6,1,2014,2014,6


In [33]:
def best_year(group):
    """Return the ``Year`` of the row with the largest ``Count`` in ``group``.

    The result is a one-element Series that keeps the original row label of
    the selected row; when several rows tie, the first one is used.
    """
    top_row = group.nlargest(n=1, columns='Count')
    return top_row['Year']

In [34]:
# Apply best_year to every (Names, Gender) group. Each group yields a one-row
# Series, so the result carries the original row label as a third index level.
best_y = df.groupby(['Names', 'Gender']).apply(best_year)

In [35]:
# Display the peak year per (Names, Gender), still indexed by three levels.
best_y

Names    Gender         
Aaban    M       1818089    2014
Aabha    F       1804339    2014
Aabid    M       1954380    2018
Aabidah  F       1941175    2018
Aabir    M       1890855    2016
                            ... 
Zyvion   M       1658306    2009
Zyvon    M       1856415    2015
Zyyanna  F       1675285    2010
Zyyon    M       1824707    2014
Zzyzx    M       1692395    2010
Name: Year, Length: 112620, dtype: int64

In [42]:
# Discard the third index level (the original row label), leaving (Names, Gender).
best_y.reset_index(level=2, drop=True)

Names    Gender
Aaban    M         2014
Aabha    F         2014
Aabid    M         2018
Aabidah  F         2018
Aabir    M         2016
                   ... 
Zyvion   M         2009
Zyvon    M         2015
Zyyanna  F         2010
Zyyon    M         2014
Zzyzx    M         2010
Name: Year, Length: 112620, dtype: int64

In [43]:
# Attach the peak year to the summary; with the row-label level removed, the
# Series aligns with df_agg on its (Names, Gender) index.
df_agg['Best_Year'] = best_y.reset_index(level=2, drop=True)

In [44]:
# Display the summary with the new Best_Year column.
df_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,Total,Num_of_years,First_year,Last_year,Max_count,Best_Year
Names,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Aaban,M,120,12,2007,2019,16,2014
Aabha,F,51,8,2011,2021,9,2014
Aabid,M,16,3,2003,2018,6,2018
Aabidah,F,5,1,2018,2018,5,2018
Aabir,M,10,2,2016,2018,5,2016
...,...,...,...,...,...,...,...
Zyvion,M,5,1,2009,2009,5,2009
Zyvon,M,7,1,2015,2015,7,2015
Zyyanna,F,6,1,2010,2010,6,2010
Zyyon,M,6,1,2014,2014,6,2014


In [45]:
# Turn the (Names, Gender) index levels into ordinary columns. The guard keeps
# the cell safe to re-run: resetting an already-flat frame a second time would
# insert a spurious 'index' column.
if isinstance(df_agg.index, pd.MultiIndex):
    df_agg = df_agg.reset_index()

In [46]:
# Display the summary with Names and Gender as regular columns.
df_agg

Unnamed: 0,Names,Gender,Total,Num_of_years,First_year,Last_year,Max_count,Best_Year
0,Aaban,M,120,12,2007,2019,16,2014
1,Aabha,F,51,8,2011,2021,9,2014
2,Aabid,M,16,3,2003,2018,6,2018
3,Aabidah,F,5,1,2018,2018,5,2018
4,Aabir,M,10,2,2016,2018,5,2016
...,...,...,...,...,...,...,...,...
112615,Zyvion,M,5,1,2009,2009,5,2009
112616,Zyvon,M,7,1,2015,2015,7,2015
112617,Zyyanna,F,6,1,2010,2010,6,2010
112618,Zyyon,M,6,1,2014,2014,6,2014


In [47]:
# Summary row for the male name 'Erik'.
df_agg.loc[(df_agg['Names'] == 'Erik') & (df_agg['Gender'] == 'M')]

Unnamed: 0,Names,Gender,Total,Num_of_years,First_year,Last_year,Max_count,Best_Year
33198,Erik,M,154843,108,1911,2021,4918,1980
