# CDC Natality Data 2021, reformat
Data Source: https://www.cdc.gov/nchs/data_access/vitalstatsonline.htm

Files used: 2021 'Birth Data User Guide' and 'U.S. Data'

### Part Two: Reformat the .txt dataset to .csv using the column dictionary generated in Part One

In [1]:
#extract dataset and split into lines
filename = 'nat2021us.txt'

#read line by line inside a context manager: the file handle is always closed,
#and the raw text is never held in memory next to the list of lines.
#only the trailing newline is stripped, so padding spaces inside a record stay;
#empty lines are dropped
with open(filename, 'r') as f:
    lines = [line for line in (raw.rstrip('\n') for raw in f) if line]

In [2]:
#function for getting cell given line and position
def get_cell(line, strt, end):
    """Return the text at 1-based, inclusive positions strt..end of line.

    A single slice serves both single-character fields (strt == end) and
    multi-character fields. Slicing never raises when the line is shorter
    than expected, so a missing single-character field comes back as ''
    just like a missing multi-character field, instead of raising
    IndexError.

    Parameters:
        line: the record string to read from.
        strt: 1-based position of the first character of the field.
        end: 1-based position of the last character of the field.

    Returns:
        The field as a string; shorter than requested (possibly empty) when
        the line ends before position end.
    """
    return line[strt-1:end]

In [3]:
#write function to format txt as dataframe with column table
import random
import numpy as np
import pandas as pd

#column table used by lines_to_dataframe, which reads its 'Field', 'Start' and 'End' columns
col_table = pd.read_csv('Column_Info.csv')

def lines_to_dataframe(lines):
    """Convert fixed-width text lines into a DataFrame of string cells.

    Every row of the module-level col_table whose 'Field' does not contain
    'FILLER' becomes one column, filled with the characters found at the
    1-based, inclusive positions 'Start'..'End' of each line (see get_cell).

    Parameters:
        lines: sequence of record strings.

    Returns:
        pandas.DataFrame with one row per line and one column per kept
        field, in col_table order.
    """
    headers = []
    columns = []
    for field, strt, end in zip(col_table['Field'], col_table['Start'], col_table['End']):
        if 'FILLER' in field:
            continue
        headers.append(field)
        columns.append([get_cell(line, strt, end) for line in lines])
    #dtype=object stores references to the str cells that already exist;
    #without it numpy builds a fixed-width unicode array sized to the widest
    #cell, which pandas then has to convert back to objects
    columns = np.array(columns, dtype=object).T
    return pd.DataFrame(columns, columns=headers)

#try the conversion on a random sample of lines before running the full dataset
sample_size = 1000
sample = random.sample(lines, sample_size)
sdf = lines_to_dataframe(sample)
#save sample as '<name before the first dot>_sample_<size>.csv.zip'
sdf.to_csv(
    f"{filename[:filename.index('.')]}_sample_{sample_size}.csv.zip",
    index=False,
    compression='zip',
)
sdf.describe() #sample description

Unnamed: 0,DOB_YY,DOB_MM,DOB_TT,DOB_WK,BFACIL,F_FACILITY,BFACIL3,MAGE_IMPFLG,MAGE_REPFLG,MAGER,...,F_CA_CLEFTLP,F_CA_CLEFT,F_CA_DOWNS,F_CA_CHROM,F_CA_HYPOS,NO_CONGEN,ITRAN,ILIVE,BFED,F_BFED
count,1000,1000,1000,1000,1000,1000,1000,1000.0,1000.0,1000,...,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000
unique,1,12,706,7,6,2,2,1.0,1.0,34,...,2,2,2,2,2,2,3,2,3,2
top,2021,8,1046,3,1,1,1,,,30,...,1,1,1,1,1,1,N,Y,Y,1
freq,1000,104,4,169,984,997,984,1000.0,1000.0,83,...,997,997,997,997,997,999,992,998,759,883


In [4]:
#summary of the sample dataframe (index range, column count, dtypes, memory usage)
sdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 227 entries, DOB_YY to F_BFED
dtypes: object(227)
memory usage: 1.7+ MB


In [5]:
sdf.head()  #first rows of the sample

Unnamed: 0,DOB_YY,DOB_MM,DOB_TT,DOB_WK,BFACIL,F_FACILITY,BFACIL3,MAGE_IMPFLG,MAGE_REPFLG,MAGER,...,F_CA_CLEFTLP,F_CA_CLEFT,F_CA_DOWNS,F_CA_CHROM,F_CA_HYPOS,NO_CONGEN,ITRAN,ILIVE,BFED,F_BFED
0,2021,5,221,7,1,1,1,,,35,...,1,1,1,1,1,1,N,Y,Y,1
1,2021,12,1852,5,1,1,1,,,23,...,1,1,1,1,1,1,N,Y,N,1
2,2021,1,1008,1,1,1,1,,,44,...,1,1,1,1,1,1,N,Y,Y,1
3,2021,6,1046,5,1,1,1,,,28,...,1,1,1,1,1,1,N,Y,U,0
4,2021,12,1844,1,1,1,1,,,24,...,1,1,1,1,1,1,N,Y,Y,1


In [6]:
#convert the entire dataset to a dataframe (the df.info() output recorded in this notebook reports 6.2+ GB)
df = lines_to_dataframe(lines)

In [7]:
#write the full dataframe to '<name before the first dot>.csv', without the index column
df.to_csv(f"{filename[:filename.index('.')]}.csv", index=False)

In [8]:
#per-column count / unique / top / freq for the full dataset
df.describe()

Unnamed: 0,DOB_YY,DOB_MM,DOB_TT,DOB_WK,BFACIL,F_FACILITY,BFACIL3,MAGE_IMPFLG,MAGE_REPFLG,MAGER,...,F_CA_CLEFTLP,F_CA_CLEFT,F_CA_DOWNS,F_CA_CHROM,F_CA_HYPOS,NO_CONGEN,ITRAN,ILIVE,BFED,F_BFED
count,3669928,3669928,3669928,3669928,3669928,3669928,3669928,3669928.0,3669928.0,3669928,...,3669928,3669928,3669928,3669928,3669928,3669928,3669928,3669928,3669928,3669928
unique,1,12,1441,7,8,2,3,2.0,2.0,39,...,2,2,2,2,2,3,3,3,3,2
top,2021,8,804,4,1,1,1,,,31,...,1,1,1,1,1,1,N,Y,Y,1
freq,3669928,330740,5090,593664,3588681,3664334,3588681,3669437.0,3669707.0,242336,...,3664334,3664334,3664334,3664334,3664334,3649744,3619928,3656273,2687996,3138746


In [9]:
#summary of the full dataframe (index range, column count, dtypes, memory usage)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3669928 entries, 0 to 3669927
Columns: 227 entries, DOB_YY to F_BFED
dtypes: object(227)
memory usage: 6.2+ GB


In [10]:
#first rows of the full dataset
df.head()

Unnamed: 0,DOB_YY,DOB_MM,DOB_TT,DOB_WK,BFACIL,F_FACILITY,BFACIL3,MAGE_IMPFLG,MAGE_REPFLG,MAGER,...,F_CA_CLEFTLP,F_CA_CLEFT,F_CA_DOWNS,F_CA_CHROM,F_CA_HYPOS,NO_CONGEN,ITRAN,ILIVE,BFED,F_BFED
0,2021,1,636,7,1,1,1,,,22,...,1,1,1,1,1,1,N,Y,Y,1
1,2021,1,259,7,1,1,1,,,31,...,1,1,1,1,1,1,N,Y,Y,1
2,2021,1,223,1,1,1,1,,,29,...,1,1,1,1,1,1,N,Y,Y,1
3,2021,1,241,1,1,1,1,,,39,...,1,1,1,1,1,1,N,Y,Y,1
4,2021,1,503,1,1,1,1,,,20,...,1,1,1,1,1,1,N,Y,N,1


In [11]:
#write the full dataframe once more, this time as a zip-compressed csv
df.to_csv(
    f"{filename[:filename.index('.')]}.csv.zip",
    index=False,
    compression="zip",
)