In [55]:
import pandas as pd
import numpy as np

Create a data frame from the file celebrity_deaths_2016.csv. For this exercise, we’ll use only two columns:
- dateofdeath
- age

In [56]:
path = '../../pandas-workout-data/data/celebrity_deaths_2016.csv'
columns = ['dateofdeath', 'age']

In [57]:
df = pd.read_csv(filepath_or_buffer=path, usecols=columns)
df.head(5)

Unnamed: 0,dateofdeath,age
0,2016-01-01,71
1,2016-01-01,74
2,2016-01-01,79
3,2016-01-01,45
4,2016-01-01,83


In [58]:
df.tail(5)

Unnamed: 0,dateofdeath,age
6538,2016-12-27,74
6539,2016-12-27,85
6540,2016-12-27,83
6541,2016-12-27,23
6542,2016-12-27,84


Create a new month column containing the month from the dateofdeath column.

In [59]:
df['month'] = df['dateofdeath'].str.slice(5,7)
df.tail(5)

Unnamed: 0,dateofdeath,age,month
6538,2016-12-27,74,12
6539,2016-12-27,85,12
6540,2016-12-27,83,12
6541,2016-12-27,23,12
6542,2016-12-27,84,12


In [60]:
df.dtypes

dateofdeath    object
age            object
month          object
dtype: object

Make the month column the index of the data frame.

In [61]:
df = df.set_index('month')
df.head(3)

Unnamed: 0_level_0,dateofdeath,age
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2016-01-01,71
1,2016-01-01,74
1,2016-01-01,79


Sort the data frame by the index.

In [62]:
df = df.sort_index()

In [63]:
df.head(5)

Unnamed: 0_level_0,dateofdeath,age
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2016-01-01,71
1,2016-01-21,47
1,2016-01-21,87
1,2016-01-21,90
1,2016-01-21,73


In [64]:
df.index

Index(['01', '01', '01', '01', '01', '01', '01', '01', '01', '01',
       ...
       '12', '12', '12', '12', '12', '12', '12', '12', '12', '12'],
      dtype='object', name='month', length=6543)

In [65]:
df.index.is_monotonic_increasing


True

Clean all nonintegers from the age column.

In [66]:
len(df.index) - df['age'].count()

np.int64(27)

In [67]:
df = df.dropna(
    subset=['age']
)

In [68]:
len(df.index) - df['age'].count()

np.int64(0)

In [69]:
df[~df['age'].str.isdigit()].head(5) # str.isdigit method, which returns True if a string contains only digits (and isn’t empty). (It returns False if there is a – sign or decimal point, so it’s not a failsafe for finding numbers, but it will work with age

Unnamed: 0_level_0,dateofdeath,age
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2016-01-07,8889
2,2016-02-20,6869
2,2016-02-29,3031
2,2016-02-15,5253
2,2016-02-08,4445


In [70]:
df.loc[df['age'].str.contains('8889', na=False), 'age'].apply(list)

month
01    [ , 8, 8, 8, 9]
Name: age, dtype: object

In [71]:
df.loc[df['age'].str.contains('8889', na=False), 'age'].values

array([' 8889'], dtype=object)

In [72]:
df.loc[df['age'].str.contains('8889', na=False), 'age'].to_numpy()

array([' 8889'], dtype=object)

By default, pd.to_numeric will raise an exception if it encounters a string that cannot be turned into an int or float. But if we pass the keyword argument errors='coerce', it will turn any values it can’t convert into NaN. We can thus ignore all use of str.isdigit and simply say

In [73]:
df['age'] = pd.to_numeric(df['age'], errors='coerce')

In [74]:
df.dtypes

dateofdeath     object
age            float64
dtype: object

In [75]:
df.loc[df['age'].isnull()]

Unnamed: 0_level_0,dateofdeath,age
month,Unnamed: 1_level_1,Unnamed: 2_level_1
4,2016-04-12,
5,2016-05-21,
7,2016-07-22,
7,2016-07-13,
8,2016-08-20,
8,2016-08-30,
8,2016-08-09,
10,2016-10-21,
10,2016-10-20,
10,2016-10-29,


In [76]:
df['age'].describe()

count    6505.000000
mean      100.960338
std       413.994127
min         7.000000
25%        69.000000
50%        81.000000
75%        89.000000
max      9394.000000
Name: age, dtype: float64

I don’t know about you, but a mean age of 100 seems suspicious. And a maximum age of 9,394 seems a bit high, even if you exercise regularly. This is the result of a string containing the value '9394', which pd.to_numeric happily converted into a number

In [77]:
df = df[df['age'] < 120]

Find the average age of celebrities who died during that period. February–July 2016.

In [78]:
df.loc['02':'07', 'age']

month
02    69.0
02    82.0
02    82.0
02    84.0
02    88.0
      ... 
07    45.0
07    57.0
07    78.0
07    82.0
07    99.0
Name: age, Length: 3304, dtype: float64

In [79]:
df.loc['02':'07', 'age'].mean()

np.float64(77.17887409200968)

### Beyond the exercise

Add a new column, day, from the day of the month in which the celebrity died. Then create a multi-index (from month and day). What was the average age of death from Feb. 15 through July 15?

In [80]:
df

Unnamed: 0_level_0,dateofdeath,age
month,Unnamed: 1_level_1,Unnamed: 2_level_1
01,2016-01-01,71.0
01,2016-01-21,47.0
01,2016-01-21,87.0
01,2016-01-21,90.0
01,2016-01-21,73.0
...,...,...
12,2016-12-10,63.0
12,2016-12-10,20.0
12,2016-12-10,57.0
12,2016-12-10,78.0


In [81]:
df['day'] = df['dateofdeath'].str.slice(8,None)
df

Unnamed: 0_level_0,dateofdeath,age,day
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01,2016-01-01,71.0,01
01,2016-01-21,47.0,21
01,2016-01-21,87.0,21
01,2016-01-21,90.0,21
01,2016-01-21,73.0,21
...,...,...,...
12,2016-12-10,63.0,10
12,2016-12-10,20.0,10
12,2016-12-10,57.0,10
12,2016-12-10,78.0,10


In [82]:
df['day'].dtype

dtype('O')

In [83]:
df.loc[df['day'].isnull()]

Unnamed: 0_level_0,dateofdeath,age,day
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [84]:
df = df.reset_index()

In [85]:
df.head(5)

Unnamed: 0,month,dateofdeath,age,day
0,1,2016-01-01,71.0,1
1,1,2016-01-21,47.0,21
2,1,2016-01-21,87.0,21
3,1,2016-01-21,90.0,21
4,1,2016-01-21,73.0,21


In [86]:
df = df.set_index(['month', 'day'])
df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,dateofdeath,age
month,day,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,2016-01-01,71.0
1,21,2016-01-21,47.0
1,21,2016-01-21,87.0
1,21,2016-01-21,90.0
1,21,2016-01-21,73.0


In [87]:
df = df.sort_index(level=['month', 'day'])

In [88]:
df.index.is_monotonic_increasing

True

In [89]:
df.loc[('02','15'):('07','15'), 'age'].mean()

np.float64(77.05183037332367)

The CSV file contains another column, causeofdeath. Load that into a data frame, and find the five most common causes of death. Now replace any NaN values in that column with the string 'unknown', and again find the five most common causes of death.

In [90]:
columns = ['dateofdeath', 'age', 'causeofdeath']

In [91]:
df = pd.read_csv(filepath_or_buffer=path, usecols=columns)

In [92]:
df.head(5)

Unnamed: 0,dateofdeath,age,causeofdeath
0,2016-01-01,71,brain cancer
1,2016-01-01,74,cancer
2,2016-01-01,79,cancer
3,2016-01-01,45,complications of a stroke
4,2016-01-01,83,heart failure


In [93]:
df['causeofdeath'].value_counts().head(5)

causeofdeath
cancer               248
heart attack         125
traffic collision     56
lung cancer           51
pneumonia             50
Name: count, dtype: int64

In [94]:
df['causeofdeath'].isnull().sum()

np.int64(5008)

In [101]:
df['causeofdeath'] = df['causeofdeath'].replace(np.nan, 'unknown')
# df['causeofdeath'] = df['causeofdeath'].fillna('unknown')

In [102]:
df['causeofdeath'].isnull().sum()

np.int64(0)

In [104]:
df['causeofdeath'].value_counts().head(5)

causeofdeath
unknown               5008
 cancer                248
 heart attack          125
 traffic collision      56
 lung cancer            51
Name: count, dtype: int64

If someone asked whether cancer is in the top 10 causes, what would you say? Can you be more specific than that?

In [105]:
# we see that there is general "cancer," but also "lung cancer" and "pancreatic cancer."

# It's impossible to know whether just "cancer" means "other cancer," or that it wasn't
# classified well, or somethign else.

# Basically, this is an instructive data set because it is not very reliable, at least
# when it comes to causes of death. We would want something more rigorous in making serious decisions.
df['causeofdeath'].value_counts().head(10)

causeofdeath
unknown               5008
 cancer                248
 heart attack          125
 traffic collision      56
 lung cancer            51
 pneumonia              50
 heart failure          49
 shot                   42
 stroke                 36
 pancreatic cancer      35
Name: count, dtype: int64