### Imports

In [2]:
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from pandas.plotting import register_matplotlib_converters

### Notebook Presentation

In [4]:
# Show floats in displayed frames with a thousands separator and two decimals.
pd.set_option("display.float_format", "{:,.2f}".format)

# TODO(review): tick locators for the time axis were meant to be created here,
# but no code follows; define them with matplotlib.dates next to the
# time-series plots that need them.

# Register pandas' datetime converters with matplotlib so datetime columns can
# be placed directly on plot axes. The function is imported in the imports
# cell at the top of the notebook, keeping every dependency in one place.
register_matplotlib_converters()

### Read the Data

In [6]:
# Annual births and deaths per clinic.
df_yearly = pd.read_csv("annual_deaths_by_clinic.csv")

# Monthly births and deaths. Parsing `date` while loading gives a datetime
# column immediately, so no separate conversion step is needed afterwards.
df_monthly = pd.read_csv("monthly_deaths.csv", parse_dates=["date"])

## Data Exploration

In [10]:
# Peek at three randomly drawn rows; no random_state is given, so the rows
# shown differ from run to run.
df_yearly.sample(n=3)

Unnamed: 0,year,births,deaths,clinic
6,1841,2442,86,clinic 2
9,1844,2956,68,clinic 2
1,1842,3287,518,clinic 1


In [11]:
# Peek at three randomly drawn rows; no random_state is given, so the rows
# shown differ from run to run.
df_monthly.sample(n=3)

Unnamed: 0,date,births,deaths
30,1843-08-01,193,3
44,1844-10-01,248,8
28,1843-06-01,196,8


In [12]:
# Shape, column labels and per-column dtypes of the yearly table, one per line.
print(df_yearly.shape, df_yearly.columns, df_yearly.dtypes, sep="\n")

(12, 4)
Index(['year', 'births', 'deaths', 'clinic'], dtype='object')
year       int64
births     int64
deaths     int64
clinic    object
dtype: object


In [13]:
# Shape, column labels and per-column dtypes of the monthly table, one per line.
print(df_monthly.shape, df_monthly.columns, df_monthly.dtypes, sep="\n")

(98, 3)
Index(['date', 'births', 'deaths'], dtype='object')
date      datetime64[ns]
births             int64
deaths             int64
dtype: object


In [14]:
# Distinct years covered by the yearly table.
df_yearly.loc[:, "year"].unique()

array([1841, 1842, 1843, 1844, 1845, 1846], dtype=int64)

In [33]:
# Distinct calendar years covered by the monthly table, taken from `date`.
df_monthly.loc[:, "date"].dt.year.unique()

array([1841, 1842, 1843, 1844, 1845, 1846, 1847, 1848, 1849])

### NaN values and duplicates

In [37]:
# True when at least one row exactly repeats an earlier row across all columns.
df_yearly.duplicated().any()

False

In [39]:
# True when at least one row exactly repeats an earlier row across all columns.
df_monthly.duplicated().any()

False

In [59]:
# info() lists the non-null count per column next to the total entry count,
# which serves as this section's missing-value check for the yearly table.
df_yearly.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   year    12 non-null     int64 
 1   births  12 non-null     int64 
 2   deaths  12 non-null     int64 
 3   clinic  12 non-null     object
dtypes: int64(3), object(1)
memory usage: 516.0+ bytes


In [61]:
# info() lists the non-null count per column next to the total entry count,
# which serves as this section's missing-value check for the monthly table.
df_monthly.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    98 non-null     datetime64[ns]
 1   births  98 non-null     int64         
 2   deaths  98 non-null     int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 2.4 KB


### Descriptive statistics for Deaths and Births

In [56]:
# Summary statistics for every column of the monthly table; the datetime
# `date` column is summarised as well, alongside births and deaths.
df_monthly.describe()

Unnamed: 0,date,births,deaths
count,98,98.0,98.0
mean,1845-02-11 04:24:29.387755008,267.0,22.47
min,1841-01-01 00:00:00,190.0,0.0
25%,1843-02-08 00:00:00,242.5,8.0
50%,1845-02-15 00:00:00,264.0,16.5
75%,1847-02-22 00:00:00,292.75,36.75
max,1849-03-01 00:00:00,406.0,75.0
std,,41.77,18.14
