# Convert to long form

In [1]:
# sets to top level dir
import os
os.chdir('..')

In [2]:
# imports
import pandas as pd
from tqdm import tqdm

In [3]:
# read in file
df = pd.read_csv('data/composite.csv')
print(df.shape)
df.sample(2)

(14070, 10)


Unnamed: 0,Year,County,Indicator,Measure,Jan-Mar,Apr-June,July-Sep,Oct-Dec,Year-to-Date,Case Definition
3825,2016,Martin,All drug non-fatal overdose hospitalizations,Count,40.0,39.0,55.0,32.0,,Non-fatal hospitalizations either of the follo...
6863,2017,Union,Drug Overdose Annual Age-Adjusted Death Rate,"Per 100,000 persons",,,,,7.5,Drugs identified as the cause of death by Flor...


## Begin Analysis

We want the data in long format.

Columns: Year, County, Quarter, Indicator, Value

We can drop the 'Year-to-Date' column (and case definition?)

In [4]:
# remove col
df.drop('Year-to-Date', axis=1, inplace=True)

In [5]:
# rename months into quarters
df.columns = ['Year', 'County', 'Indicator', 'Measure', 'Q1', 'Q2', 'Q3', 'Q4', 'Definition']
df.sample(2)

Unnamed: 0,Year,County,Indicator,Measure,Q1,Q2,Q3,Q4,Definition
11754,2020,Alachua,Drug Confirmed Motor Vehicle Traffic Crashes,Count,,,,,A crash involving a Driver and/or Non-Motorist...
3142,2016,Gulf,Alcohol Confirmed Motor Vehicle Traffic Crash ...,Count,,,,,The death of a person as a direct result of a ...


Now we need to transpose the Quarters into one column with values attached.

We will accomplish this using `pd.wide_to_long` and by specifing `stubnames='Q'` to target the Quarters.

In [6]:
# we have to create an id field to transpose on
df['id'] = df.index
new_df = pd.wide_to_long(df, stubnames='Q', i='id', j='quarter')  # MAIN WORK
new_df.reset_index(inplace=True)  # flatten df
new_df.drop('id', axis=1, inplace=True)  # drop id column, no longer needed
new_df.columns = ['Quarter', 'Indicator', 'Case Definition', 'County', 'Measure', 'Year', 'Value']  # rename cols
new_df['Quarter'] = new_df.Quarter.apply(lambda x: f"Q{x}")  # add 'Q' into quarter column values
new_df.sample(3)

Unnamed: 0,Quarter,Indicator,Case Definition,County,Measure,Year,Value
9766,Q1,Columbia,Drug Overdose deaths,Count,2019,Drugs identified as the cause of death by Flor...,2
12997,Q1,Leon,Heroin-involved non-fatal overdose hospitaliza...,Count,2020,Non-fatal hospitalizations with either of the ...,<5
26684,Q2,Hendry,Neonatal Abstinence Syndrome Birth Defect,Count,2020,Infants less than 28 days old who were exposed...,


In [7]:
# export
new_df.to_csv('data/long_form.csv', index=False)