In [1]:
import pandas as pd
import plotly.express as px

# 6 Working with DataFrames Using pandas

In [3]:
# Read the baby-names CSV into a pd.DataFrame and preview its first rows.
baby = pd.read_csv('data/babynames20102022.csv')
baby.head()

Unnamed: 0,Name,Sex,Count,Year
0,Olivia,F,16573,2022
1,Emma,F,14435,2022
2,Charlotte,F,12891,2022
3,Amelia,F,12333,2022
4,Sophia,F,12310,2022


## 6.1 Subsetting

In [None]:
# Slicing
# .loc lets us select rows and columns using their labels.
# Notice that .loc is indexed with square brackets, not called with parentheses.
baby.loc[1,'Name']

In [None]:
# Slice out multiple rows and columns by label.
# Unlike ordinary Python slices, .loc label slices include both endpoints.
baby.loc[1:3,'Name':'Count']

In [None]:
# A bare ':' on both axes selects all rows and all columns.
baby.loc[:,:]

In [None]:
# DataFrame and Series
# A pd.DataFrame is two-dimensional: it has rows and columns.
# A pd.Series is one-dimensional: it represents a single list of data.

# Selecting one column label with .loc gives a Series.
print('### This is a Series. ###')
count = baby.loc[0:5,'Count']
print(count)
# Print the class name explicitly: a bare expression in the middle of a cell
# is evaluated and thrown away, because only the cell's last expression is
# displayed.
print(type(count).__name__)

# Passing a list of column labels to .loc gives a DataFrame instead.
print('### This is a DataFrame. ###')
baby.loc[0:5,['Name','Sex']]

In [None]:
# Shorthand: index the DataFrame directly instead of going through .loc.
# A single column label returns that column as a Series ...
print(baby['Name'])
# ... while a list of labels returns a DataFrame.
baby[['Name']]

In [None]:
# .iloc selects by the integer positions of rows and columns, not by label.
# Positional slices exclude the end position, as in ordinary Python.
baby.iloc[0:3,0:1]


In [None]:
# Filtering rows
# Comparing a column with a value yields a boolean Series, one entry per row.
baby['Name'] == 'Emma'

In [None]:
# Pass the boolean Series to .loc to keep only the rows where it is True.
baby.loc[baby['Name'] == 'Emma',:].head(5)

In [None]:
# Shorthand: a boolean Series inside plain square brackets filters rows too.
baby[baby['Name'] == 'Emma'].head(5)

In [None]:
# Wrapping the chain in parentheses lets each method call sit on its own line.
# Here: keep the 'Emma' rows, sort them by Count (largest first), show the top two.
(baby[baby['Name'] == 'Emma']
 .sort_values('Count',ascending=False)
 .head(2))

### 6.1.5 Example: When did Luna become a popular name?

In [None]:
# Keep only the rows for girls named 'Luna', and only the two columns
# needed for the plot.
is_female_luna = (baby['Name'] == 'Luna') & (baby['Sex'] == 'F')
luna = baby.loc[is_female_luna, ['Count', 'Year']]
luna.head()

In [None]:
# Line chart of Count against Year for the filtered 'Luna' rows.
px.line(luna,x='Year', y='Count')

In [None]:
# Select the rows for girls named 'Siri' with a single combined query.
siri = baby.query("Name == 'Siri' and Sex == 'F'")
siri.head()

In [None]:
# Line chart of Count against Year for 'Siri', with a vertical reference line at x=2011.
px.line(siri,x='Year',y='Count').add_vline(x=2011)

## 6.2 Aggregating

In [None]:
# Show the table again as the starting point for the aggregations below.
baby

In [None]:
# sum(): add up every value in the Count column.
baby['Count'].sum()


In [None]:
# groupby(): split the rows by Year, then sum Count within each group.
baby.groupby('Year')['Count'].sum()

In [None]:
# reset_index() converts the grouped Series into a DataFrame.
count_by_year = baby.groupby('Year')['Count'].sum().reset_index()
count_by_year.head()

In [None]:
# When plotting with px, the data passed in should be a DataFrame.
px.line(count_by_year,x='Year',y='Count')

In [None]:
# value_counts(): count how many rows carry each distinct Name.
baby['Name'].value_counts()

In [None]:
# Grouping on multiple columns: pass a list of column names to groupby().
counts_by_year_and_sex = baby.groupby(['Year','Sex'])['Count'].sum()
counts_by_year_and_sex.head()


In [None]:
# reset_index() both renumbers the index and converts the Series to a DataFrame.
counts_by_year_and_sex.reset_index().head()

In [None]:
# Other aggregation functions can be applied to the grouped data, e.g. max().
baby.groupby('Year')['Count'].max()

In [None]:
# Aggregating with a user-defined function
def data_range(count):
    """Return the spread of ``count``: its largest value minus its smallest."""
    smallest, largest = count.min(), count.max()
    return largest - smallest

# agg() applies data_range to the Count values of each Year group.
baby.groupby('Year')['Count'].agg(data_range)

In [None]:
def count_unique(count):
    """Return how many distinct values ``count`` contains."""
    distinct_values = count.unique()
    return len(distinct_values)

# Count the distinct names within each Year group.
unique_names = baby.groupby('Year')['Name'].agg(count_unique)
unique_names.head()

In [None]:
# Draw a line chart of the number of names per year.
px.line(unique_names.reset_index(),x='Year',y='Name')

In [None]:
# Reshaping the table
# display() is needed to show this preview: a bare expression in the middle
# of a cell is evaluated and discarded; only the last expression is rendered.
display(counts_by_year_and_sex.head())

# pivot_table: one row per Year, one column per Sex, each cell holding the
# summed Count for that pair.
# aggfunc is given as the string 'sum' rather than the builtin sum, which
# recent pandas releases deprecate as an aggregation callable (FutureWarning).
mf_pivot = pd.pivot_table(baby,
                          index='Year',
                          columns='Sex',
                          values='Count',
                          aggfunc='sum')
mf_pivot.head()

In [None]:
# plotly
# By default the plot uses the index as the x-axis and the values as the y-axis.
fig = px.line(mf_pivot)
fig


In [None]:
# Restyle the trace picked by selector=1 as dash-dot and set the y-axis title.
# Each update_* call returns the figure, so the calls are chained and the
# chain's value is what the cell displays.
(fig
 .update_traces(selector=1, line_dash='dashdot')
 .update_yaxes(title='sex Value'))

## 6.3 Joining

In [7]:
# Two small datasets
nyt_small = pd.read_csv('data/nyt_small.csv')
nyt_small

Unnamed: 0,nyt_name,category
0,Karen,boomer
1,Julius,mythology
2,Freya,mythology


In [8]:
# Second small table, with the same Name/Sex/Count/Year columns as baby.
baby_small = pd.read_csv('data/baby_small.csv')
baby_small

Unnamed: 0,Name,Sex,Count,Year
0,Noah,M,18252,2020
1,Julius,M,960,2020
2,Karen,M,6,2020
3,Karen,F,325,2020
4,Noah,F,305,2020


In [10]:
# .merge()
# Rows are matched where baby_small's Name equals nyt_small's nyt_name.
# Only the rows with a match in both tables stay in the final result.
baby_small.merge(nyt_small,
                 left_on="Name",
                 right_on="nyt_name")

Unnamed: 0,Name,Sex,Count,Year,nyt_name,category
0,Julius,M,960,2020,Julius,mythology
1,Karen,M,6,2020,Karen,boomer
2,Karen,F,325,2020,Karen,boomer


In [12]:
# Left join: every row of baby_small is kept; rows without a match in
# nyt_small get missing values in the nyt_name and category columns.
baby_small.merge(nyt_small,
                 left_on="Name",
                 right_on="nyt_name",
                 how='left')

Unnamed: 0,Name,Sex,Count,Year,nyt_name,category
0,Noah,M,18252,2020,,
1,Julius,M,960,2020,Julius,mythology
2,Karen,M,6,2020,Karen,boomer
3,Karen,F,325,2020,Karen,boomer
4,Noah,F,305,2020,,


In [13]:
# Right join: every row of nyt_small is kept; a name absent from baby_small
# (here 'Freya') gets missing values in the baby_small columns.
# Count and Year are shown as floats in this result.
baby_small.merge(nyt_small,
                 left_on="Name",
                 right_on="nyt_name",
                 how='right')

Unnamed: 0,Name,Sex,Count,Year,nyt_name,category
0,Karen,M,6.0,2020.0,Karen,boomer
1,Karen,F,325.0,2020.0,Karen,boomer
2,Julius,M,960.0,2020.0,Julius,mythology
3,,,,,Freya,mythology


### 6.3.3 Example: Popularity of NYT name categories

In [15]:
# Load the NYT table (columns nyt_name and category) and preview its first rows.
nyt = pd.read_csv('data/nyt.csv')
nyt.head()

Unnamed: 0,nyt_name,category
0,Aurelia,gods
1,Calliope,gods
2,Freya,gods
3,Maryam,gods
4,Rhea,gods
