# Asthma-Like Illness Emergency Department Presentations (monthly) | Processing

## Set Up

Ensure that the required libraries are available by running the below code in the terminal before execution:
- pip install pandas


Execute the following in the jupyter notebook before execution to ensure that the required libraries are imported:

In [24]:
import pandas as pd

## Load Dataset

In [25]:
df = pd.read_csv('raw.csv')

## Data Manipulation

Rename columns to match Air Quality data set.

In [26]:
# Rename columns to match Air Quality data set.
df = df.rename(columns={'LHD': 'lhd', 'Period': 'year-month', 'Sex': 'sex'})                                    # Rename columns
df.columns = df.columns.str.lower()                                                                                 # Lowercase column names

# Remove ' LHD' for Local Health District values.
df['lhd'] = df['lhd'].str.replace(' LHD', '')                                                                       # Remove ' LHD' from lhd column

# Remove rows representing state-wide aggregated data.
df = df.dropna(subset=['lhd'])                                                                                      # Remove rows with missing values in 'lhd' column
df = df[~df['lhd'].str.contains('All')]                                                                             # Remove rows with 'All' in 'lhd' column

# Remove columns holding Confidence Interval data.
df = df.loc[:, ~df.columns.str.contains('% ci')]                                                                    # Remove columns with '% ci' in the name



df.head()

Unnamed: 0,sex,lhd,year-month,"rate per 100,000 population"
0,Males,Sydney,2014-07,22.6
1,Males,Sydney,2014-08,28.9
2,Males,Sydney,2014-09,15.7
3,Males,Sydney,2014-10,19.2
4,Males,Sydney,2014-11,19.7


### Reconfigure Table

In [27]:
# Pivot the dataframe to have 'sex' as columns
df = df.pivot_table(index=['year-month', 'lhd'], columns='sex', values='rate per 100,000 population').reset_index()

# Rename the columns to match the desired format
df.columns.name = None
df = df.rename(columns={
    'Persons': 'Persons rate per 100,000 population',
    'Males': 'Male rate per 100,000 population', 
    'Females': 'Female rate per 100,000 population'
})                        

# View the DataFrame
df.head() 

Unnamed: 0,year-month,lhd,"Female rate per 100,000 population","Male rate per 100,000 population","Persons rate per 100,000 population"
0,2014-07,Central Coast,29.8,21.8,26.1
1,2014-07,Hunter New England,45.3,32.3,39.0
2,2014-07,Illawarra Shoalhaven,42.1,26.7,34.5
3,2014-07,Mid North Coast,58.4,35.5,47.3
4,2014-07,Murrumbidgee,46.2,51.4,48.9


## Set Date Range

In [28]:
# Add rows for each LHD for the missing months until 2023-12.
missing_rows = []                                                                                                   # Create a list to store the missing rows.
headers = df.columns.tolist()                                                                                       # Get the headers of the DataFrame.
months = pd.date_range(start='2014-07', end='2024-07', freq='M').strftime('%Y-%m').tolist()                         # Create a list of months from 2014-07 to 2024-06.

lhds = df['lhd'].unique()                                                                                           # Get unique LHD values.

for lhd in lhds:
    for month in months:
        if not ((df['lhd'] == lhd) & (df['year-month'] == month)).any():                                            # Check if the row is missing.
            missing_rows.append({                                                                                   # Append the missing row to the list.
                'lhd': lhd, 
                'year-month': month, 
                'Female rate per 100,000 population': None,
                'Male rate per 100,000 population': None,
                'Persons rate per 100,000 population': None,
            })          

# Create a DataFrame from the missing rows and concatenate it to the original DataFrame
if missing_rows:
    df_missing = pd.DataFrame(missing_rows)                                                                         # Create a DataFrame from the missing rows.
    df = pd.concat([df, df_missing], ignore_index=True)                                                             # Concatenate the DataFrames.

# Sort the DataFrame by 'lhd' and 'year-month'.
df = df.sort_values(by=['lhd', 'year-month']).reset_index(drop=True)                                                # Sort the DataFrame by 'lhd' and 'year-month'.

# View the DataFrame.
df.tail()                                                                                                           # View the last 5 rows of the DataFrame.

  months = pd.date_range(start='2014-07', end='2024-07', freq='M').strftime('%Y-%m').tolist()                         # Create a list of months from 2014-07 to 2023-12.
  df = pd.concat([df, df_missing], ignore_index=True)                                                             # Concatenate the DataFrames.


Unnamed: 0,year-month,lhd,"Female rate per 100,000 population","Male rate per 100,000 population","Persons rate per 100,000 population"
1675,2024-02,Western Sydney,,,
1676,2024-03,Western Sydney,,,
1677,2024-04,Western Sydney,,,
1678,2024-05,Western Sydney,,,
1679,2024-06,Western Sydney,,,


Fill missing values using linear interpolation.

In [30]:
df['Female rate per 100,000 population'] = df['Female rate per 100,000 population'].interpolate()                                       # Fill missing values using linear interpolation.
df['Male rate per 100,000 population'] = df['Male rate per 100,000 population'].interpolate()                                        # Fill missing values using linear interpolation.
df['Persons rate per 100,000 population'] = df['Persons rate per 100,000 population'].interpolate()                                     # Fill missing values using linear interpolation.

### Output Alternative Processed Dataset

In [31]:
# Save the processed data to a new CSV file.
df.to_csv('processed.csv', index=False)                                                                            # Save the processed data to a new CSV file.

# View the DataFrame.
df.head()                                                                                                          # View the first 5 rows of the DataFrame.

Unnamed: 0,year-month,lhd,"Female rate per 100,000 population","Male rate per 100,000 population","Persons rate per 100,000 population"
0,2014-07,Central Coast,29.8,21.8,26.1
1,2014-08,Central Coast,34.3,32.0,33.4
2,2014-09,Central Coast,28.4,19.9,24.1
3,2014-10,Central Coast,28.0,19.5,24.0
4,2014-11,Central Coast,28.0,21.7,25.0
