In [25]:
import pandas as pd
import plotly.express as px

In [2]:
# Read the baby-names CSV (relative path) into a pd.DataFrame.
baby = pd.read_csv('data/babynames20102022.csv')
# Display the frame; its columns are Name, Sex, Count and Year.
baby

Unnamed: 0,Name,Sex,Count,Year
0,Olivia,F,16573,2022
1,Emma,F,14435,2022
2,Charlotte,F,12891,2022
3,Amelia,F,12333,2022
4,Sophia,F,12310,2022
...,...,...,...,...
426767,Zymaire,M,5,2010
426768,Zyonne,M,5,2010
426769,Zyquarius,M,5,2010
426770,Zyran,M,5,2010


In [3]:
# Slicing
# .loc selects rows and columns by their labels.
# Note that .loc is indexed with square brackets, not called with parentheses.
# Here: the single value at row label 1, column 'Name'.
baby.loc[1,'Name']

'Emma'

In [4]:
# Slice out multiple rows and columns.
# Label slices with .loc include both endpoints: rows 1 through 3 and
# columns 'Name' through 'Count'.
baby.loc[1:3,'Name':'Count']

Unnamed: 0,Name,Sex,Count
1,Emma,F,14435
2,Charlotte,F,12891
3,Amelia,F,12333


In [5]:
# A bare colon on each axis selects every row and every column.
baby.loc[:,:]

Unnamed: 0,Name,Sex,Count,Year
0,Olivia,F,16573,2022
1,Emma,F,14435,2022
2,Charlotte,F,12891,2022
3,Amelia,F,12333,2022
4,Sophia,F,12310,2022
...,...,...,...,...
426767,Zymaire,M,5,2010
426768,Zyonne,M,5,2010
426769,Zyquarius,M,5,2010
426770,Zyran,M,5,2010


In [6]:
# DataFrame and Series
# A pd.DataFrame is two-dimensional: it has rows and columns.
# A pd.Series is one-dimensional: a single labelled sequence of values.
print('### This is a Series. ###')
# A single column label (a plain string) makes .loc return a Series.
count = baby.loc[0:5,'Count']
print(count)
# Print the class name explicitly: a bare expression in the middle of a cell
# is evaluated and then discarded, so it would never be displayed.
print(type(count).__name__)

# Passing a list of column labels makes .loc return a DataFrame instead.
print('### This is a DataFrame. ###')
baby.loc[0:5,['Name','Sex']]

### This is a Series. ###
0    16573
1    14435
2    12891
3    12333
4    12310
5    11662
Name: Count, dtype: int64
### This is a DataFrame. ###


Unnamed: 0,Name,Sex
0,Olivia,F
1,Emma,F
2,Charlotte,F
3,Amelia,F
4,Sophia,F
5,Isabella,F


In [7]:
# Shorthand for selecting columns without .loc:
# a single label returns a Series, a list of labels returns a DataFrame.
print(baby['Name'])
baby[['Name']]

0            Olivia
1              Emma
2         Charlotte
3            Amelia
4            Sophia
            ...    
426767      Zymaire
426768       Zyonne
426769    Zyquarius
426770        Zyran
426771        Zzyzx
Name: Name, Length: 426772, dtype: object


Unnamed: 0,Name
0,Olivia
1,Emma
2,Charlotte
3,Amelia
4,Sophia
...,...
426767,Zymaire
426768,Zyonne
426769,Zyquarius
426770,Zyran


In [8]:
# .iloc selects by integer position instead of by label.
# Positional slices exclude the end point: rows 0-2 and column 0 only.
baby.iloc[0:3,0:1]


Unnamed: 0,Name
0,Olivia
1,Emma
2,Charlotte


In [9]:
# Filtering rows
# Comparing a column with a value yields a boolean Series:
# one True/False entry per row.
baby['Name'] == 'Emma'

0         False
1          True
2         False
3         False
4         False
          ...  
426767    False
426768    False
426769    False
426770    False
426771    False
Name: Name, Length: 426772, dtype: bool

In [10]:
# Pass the boolean Series to .loc as the row selector to keep only the rows
# where it is True; .head(5) shows the first five matches.
baby.loc[baby['Name'] == 'Emma',:].head(5)

Unnamed: 0,Name,Sex,Count,Year
1,Emma,F,14435,2022
23573,Emma,M,15,2022
31916,Emma,F,15510,2021
55646,Emma,M,14,2021
63601,Emma,F,15680,2020


In [11]:
# Shorthand: indexing the frame directly with a boolean Series filters rows
# the same way as the .loc form.
baby[baby['Name'] == 'Emma'].head(5)

Unnamed: 0,Name,Sex,Count,Year
1,Emma,F,14435,2022
23573,Emma,M,15,2022
31916,Emma,F,15510,2021
55646,Emma,M,14,2021
63601,Emma,F,15680,2020


In [12]:
# Wrapping the chain in parentheses lets each method call sit on its own line.
# Here: the two 'Emma' rows with the highest Count.
(baby[baby['Name'] == 'Emma']
 .sort_values('Count',ascending=False)
 .head(2))

Unnamed: 0,Name,Sex,Count,Year
324970,Emma,F,20964,2012
258346,Emma,F,20958,2014


# 6.1.5 Example: When did Luna become a popular name?

In [22]:
# Rows where Name is 'Luna' and Sex is 'F', keeping only the Count and Year
# columns, selected in a single .loc call.
luna = baby.loc[
    (baby['Name'] == 'Luna') & (baby['Sex'] == 'F'),
    ['Count', 'Year'],
]
luna.head()

Unnamed: 0,Count,Year
9,8922,2022
31925,8216,2021
63613,7834,2020
95132,7787,2019
127248,6932,2018


In [35]:
# Line chart of Count against Year for the `luna` frame.
px.line(luna,x='Year', y='Count')

In [34]:
# Rows where Name is 'Siri' and Sex is 'F', using one combined query expression.
siri = baby.query("Name == 'Siri' and Sex == 'F'")
siri.head()

Unnamed: 0,Name,Sex,Count,Year
42920,Siri,F,9,2021
73745,Siri,F,10,2020
103644,Siri,F,13,2019
133425,Siri,F,20,2018
165695,Siri,F,20,2017


In [36]:
# Line chart of Count against Year for `siri`, with a vertical reference line
# at x=2011.
# NOTE(review): 2011 presumably marks the launch of Apple's Siri assistant — confirm.
px.line(siri,x='Year',y='Count').add_vline(x=2011)

# 6.2 Aggregating

In [39]:
# Show the full frame again as the starting point for the aggregation examples.
baby

Unnamed: 0,Name,Sex,Count,Year
0,Olivia,F,16573,2022
1,Emma,F,14435,2022
2,Charlotte,F,12891,2022
3,Amelia,F,12333,2022
4,Sophia,F,12310,2022
...,...,...,...,...
426767,Zymaire,M,5,2010
426768,Zyonne,M,5,2010
426769,Zyquarius,M,5,2010
426770,Zyran,M,5,2010


In [13]:
# .sum() adds up every value in the Count column, giving one overall total.
baby['Count'].sum()


46344455

In [15]:
# groupby() splits the rows by Year; .sum() then totals Count within each group.
# The result is a Series indexed by Year.
baby.groupby('Year')['Count'].sum()

Year
2010    3694428
2011    3656309
2012    3655713
2013    3643652
2014    3704186
Name: Count, dtype: int64

In [43]:
count_by_year = baby.groupby('Year')['Count'].sum().reset_index()
# reset_index() turns the grouped Series into a DataFrame, moving Year out of
# the index into a regular column.
count_by_year.head()

Unnamed: 0,Year,Count
0,2010,3694428
1,2011,3656309
2,2012,3655713
3,2013,3643652
4,2014,3704186


In [44]:
# Pass px a DataFrame so that x and y can be given as column names.
px.line(count_by_year,x='Year',y='Count')

In [47]:
# value_counts() counts how many rows each distinct Name appears in,
# listed from most to least frequent.
baby['Name'].value_counts()

Name
Olivia       26
Grey         26
Wylie        26
Royale       26
Jaylyn       26
             ..
Sefton        1
Seph          1
Shaad         1
Shikhar       1
Zyquarius     1
Name: count, Length: 58684, dtype: int64

In [72]:
# Grouping on multiple columns: pass a list of column names to groupby().
# The result is a Series with a two-level (Year, Sex) index.
counts_by_year_and_sex = baby.groupby(['Year','Sex'])['Count'].sum()
counts_by_year_and_sex.head()


Year  Sex
2010  F      1776636
      M      1917792
2011  F      1758110
      M      1898199
2012  F      1759897
Name: Count, dtype: int64

In [73]:
# reset_index() replaces the (Year, Sex) index with a default integer index
# and returns a DataFrame in which Year and Sex are regular columns.
counts_by_year_and_sex.reset_index().head()

Unnamed: 0,Year,Sex,Count
0,2010,F,1776636
1,2010,M,1917792
2,2011,F,1758110
3,2011,M,1898199
4,2012,F,1759897


In [74]:
# Other aggregation functions work on grouped data too,
# e.g. the largest Count within each Year.
baby.groupby('Year')['Count'].max()

Year
2010    22929
2011    21856
2012    22325
2013    21244
2014    20958
2015    20478
2016    19536
2017    19857
2018    19957
2019    20601
2020    19828
2021    20365
2022    20456
Name: Count, dtype: int64

In [75]:
# The smallest Count within each Year.
baby.groupby('Year')['Count'].min()

Year
2010    5
2011    5
2012    5
2013    5
2014    5
2015    5
2016    5
2017    5
2018    5
2019    5
2020    5
2021    5
2022    5
Name: Count, dtype: int64