## Birthrate Analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pylab import rcParams

In [None]:
# Load the births dataset from the CSV file in the working directory.
births = pd.read_csv(filepath_or_buffer='births.csv')

In [None]:
# Preview the first five rows to check the columns loaded as expected.
births.head(n=5)

In [None]:
# Preview the last five rows of the dataset.
births.tail(n=5)

#### replacing na values with 0

In [None]:
# Count the missing values in each column.
births.isnull().sum()

In [None]:
# Replace missing day values with 0. Assign the result back to the column
# rather than calling fillna(inplace=True) on the selected column: the
# in-place form operates on an intermediate object, is deprecated in recent
# pandas, and does not update `births` under Copy-on-Write.
births['day'] = births['day'].fillna(0)

#### changing data type

In [None]:
# Store 'day' as an integer column, then print the frame's column dtypes
# and non-null counts to confirm the conversion.
births = births.astype({'day': int})
births.info()

#### calculating decade for each year

In [None]:
# Floor each year to the first year of its decade (e.g. 1969 -> 1960).
births['decade'] = (births['year'] // 10) * 10

#### gender by decade

creating pivot table

In [None]:
# Total births per decade, with one column per gender.
birth_gender = births.pivot_table(
    values='births',
    index='decade',
    columns='gender',
    aggfunc='sum',
)
birth_gender

plotting graph

In [None]:
# Use a small square canvas. This rcParams setting is global, so it also
# applies to figures created later that do not pass their own figsize.
rcParams['figure.figsize'] = (4, 4)
# Draw one line per gender column, with the decade on the x-axis.
birth_gender.plot()
# birth_gender holds sums aggregated per decade, so label the axis accordingly.
plt.ylabel("Total births per decade")
plt.show()

#### removing outliers

In [None]:
# Robust centre and spread estimates for the birth counts, based on quartiles.
# NOTE: `mean` actually holds the median (50th percentile); the name is kept
# because the filtering step refers to it.
quartiles = np.percentile(births['births'], [25, 50, 75])
lower_quartile, mean, upper_quartile = quartiles
# For a normal distribution sigma ~= 0.74 * IQR (the IQR spans ~1.35 sigma).
sigma = 0.74 * (upper_quartile - lower_quartile)

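Use `query()` to keep only the rows whose `births` value lies within 5 sigma of the centre estimate computed above; everything outside that range is treated as an outlier.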

In [None]:
# Keep only rows whose births value lies strictly within 5 sigma of the
# centre estimate. The @ prefix lets query() reference variables defined
# outside the expression. The explicit copy() makes the result an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
births = births.query('(births > @mean - 5 * @sigma) & (births < @mean + 5 * @sigma)').copy()
# Build a datetime index from the year/month/day columns, encoded as YYYYMMDD.
# NOTE(review): to_datetime raises on impossible dates (such as a day of 0);
# presumably those rows are removed by the filter above -- confirm.
births.index = pd.to_datetime(10000 * births.year + 100 * births.month + births.day,
                              format='%Y%m%d')

#### Average births by day of week

adding day of week column

In [None]:
# Day of week of each row's index date: 0 = Monday ... 6 = Sunday.
births['day of week'] = births.index.weekday

creating pivot table

In [None]:
# Mean births for each day of the week, with one column per decade.
births_day = births.pivot_table(
    values='births',
    index='day of week',
    columns='decade',
    aggfunc='mean',
)
# Swap the numeric weekday codes (0-6) for readable labels.
births_day.index = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
births_day

visualisation

In [None]:
# Draw one line per decade column, with the weekday labels on the x-axis.
weekday_axes = births_day.plot()
weekday_axes.set_ylabel("Average Births by Day")
plt.show()

We can see that births were less common on weekends than on weekdays.

#### average births by day of year

In [None]:
# Mean births for every (month, day) pair across all years in the data.
births_month = births.pivot_table(
    values='births',
    index=[births.index.month, births.index.day],
)
print(births_month.head())

# Re-key the (month, day) pairs as dates in a single dummy year so the table
# can be plotted on a time axis; 2000 is a leap year, so Feb 29 is valid.
dummy_year_dates = [pd.Timestamp(2000, m, d) for m, d in births_month.index]
births_month.index = dummy_year_dates
print(births_month.head())

In [None]:
# Wide single-axes figure for the day-of-year averages.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 4))
births_month.plot(ax=ax)
plt.show()