# Integrate Monthly Datasets

## Set Up

Ensure that the required libraries are installed by running the following commands in the terminal before execution:
- pip install pandas
- pip install scikit-learn


Run the following cell in the Jupyter notebook first so that the required libraries are imported:

In [3]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

## Load Datasets

In [4]:
# Load the processed monthly air-quality and asthma emergency-department
# presentation (EDP) datasets into dataframes.
df_air_quality = pd.read_csv('../../air-quality/processed-monthly.csv')
df_asthma_edp = pd.read_csv('../../health-stats/asthma-edp-monthly/processed.csv')

# Collect the column headers of every dataframe, keyed by a readable dataset name.
headers = {
    'Air Quality': df_air_quality.columns.tolist(),
    'Asthma Emergency Department Presentations': df_asthma_edp.columns.tolist(),
}

# Build the overview table: one row per dataset, its headers running across.
# orient='index' already produces that layout (shorter header lists are padded
# with missing values), so the former transpose().transpose() round trip, which
# cancelled itself out, is not needed.
df_headers = pd.DataFrame.from_dict(headers, orient='index')
df_headers  # Display the dataframe.

Unnamed: 0,0,1,2,3,4,5,6,7
Air Quality,year-month,lhd,CO ppm,NO pphm,NO2 pphm,OZONE pphm,PM10 µg/m³,SO2 pphm
Asthma Emergency Department Presentations,year-month,lhd,"Female rate per 100,000 population","Male rate per 100,000 population","Persons rate per 100,000 population",,,


## Data Manipulation

Rename columns for clarity.

In [5]:
# Asthma Emergency Department Presentations: shorten the rate columns to
# 'asthma edp', with an '[f]' / '[m]' suffix marking the female / male rates.
df_asthma_edp = df_asthma_edp.rename(columns={
    'Persons rate per 100,000 population': 'asthma edp',
    'Female rate per 100,000 population': 'asthma edp [f]',
    'Male rate per 100,000 population': 'asthma edp [m]'
})

# Collect the column headers of every dataframe again to confirm the rename.
headers = {
    'Air Quality': df_air_quality.columns.tolist(),
    'Asthma Emergency Department Presentations': df_asthma_edp.columns.tolist(),
}

# Build the overview table: one row per dataset, its headers running across.
# orient='index' already produces that layout (shorter header lists are padded
# with missing values), so the former transpose().transpose() round trip, which
# cancelled itself out, is not needed.
df_headers = pd.DataFrame.from_dict(headers, orient='index')
df_headers  # Display the dataframe.

Unnamed: 0,0,1,2,3,4,5,6,7
Air Quality,year-month,lhd,CO ppm,NO pphm,NO2 pphm,OZONE pphm,PM10 µg/m³,SO2 pphm
Asthma Emergency Department Presentations,year-month,lhd,asthma edp [f],asthma edp [m],asthma edp,,,


## Merge Datasets

Merge the dataframes on 'year-month' and 'lhd' using an inner join.

In [6]:
# Inner-join the two datasets on the shared 'year-month' and 'lhd' columns, so
# only key pairs present in both are kept, then order the rows by 'lhd' and,
# within each 'lhd', by 'year-month'.
merge_keys = ['year-month', 'lhd']
df_merged = (
    df_air_quality
    .merge(df_asthma_edp, on=merge_keys, how='inner')
    .sort_values(by=['lhd', 'year-month'])
)

# Preview the first rows of the merged dataframe.
df_merged.head()

Unnamed: 0,year-month,lhd,CO ppm,NO pphm,NO2 pphm,OZONE pphm,PM10 µg/m³,SO2 pphm,asthma edp [f],asthma edp [m],asthma edp
0,2014-07,Central Coast,0.1,0.5,0.6,1.7,10.6,0.0,29.8,21.8,26.1
1,2014-08,Central Coast,0.1,0.3,0.6,1.9,11.9,0.0,34.3,32.0,33.4
2,2014-09,Central Coast,0.1,0.2,0.6,2.0,12.5,0.0,28.4,19.9,24.1
3,2014-10,Central Coast,0.1,0.2,0.5,2.3,20.0,0.1,28.0,19.5,24.0
4,2014-11,Central Coast,0.1,0.0,0.3,2.4,20.8,0.1,28.0,21.7,25.0


## Divide into Gendered and Genderless Datasets.

In [7]:
# The first 8 columns ('year-month', 'lhd' and the six pollutant columns) are
# shared by both outputs; every column after them is a health outcome.
shared_columns = list(df_merged.columns[:8])
outcome_columns = list(df_merged.columns[8:])

# Outcome columns tagged '[m]' or '[f]' are sex-specific; the rest are not.
gendered_columns = [
    column for column in outcome_columns
    if '[m]' in column or '[f]' in column
]
genderless_columns = [
    column for column in outcome_columns
    if column not in gendered_columns
]

# Take explicit copies so each result is an independent dataframe. Adding
# columns to a bare df_merged.iloc[...] slice triggers SettingWithCopyWarning
# and relies on chained assignment.
df_gendered = df_merged[shared_columns + gendered_columns].copy()
df_genderless = df_merged[shared_columns + genderless_columns].copy()

In [8]:
# Preview the first rows of the gendered dataset (shared columns plus the
# '[f]' and '[m]' outcome columns).
df_gendered.head()

Unnamed: 0,year-month,lhd,CO ppm,NO pphm,NO2 pphm,OZONE pphm,PM10 µg/m³,SO2 pphm,asthma edp [f],asthma edp [m]
0,2014-07,Central Coast,0.1,0.5,0.6,1.7,10.6,0.0,29.8,21.8
1,2014-08,Central Coast,0.1,0.3,0.6,1.9,11.9,0.0,34.3,32.0
2,2014-09,Central Coast,0.1,0.2,0.6,2.0,12.5,0.0,28.4,19.9
3,2014-10,Central Coast,0.1,0.2,0.5,2.3,20.0,0.1,28.0,19.5
4,2014-11,Central Coast,0.1,0.0,0.3,2.4,20.8,0.1,28.0,21.7


In [9]:
# Preview the first rows of the genderless dataset (shared columns plus the
# outcome columns that carry no '[f]' / '[m]' tag).
df_genderless.head()

Unnamed: 0,year-month,lhd,CO ppm,NO pphm,NO2 pphm,OZONE pphm,PM10 µg/m³,SO2 pphm,asthma edp
0,2014-07,Central Coast,0.1,0.5,0.6,1.7,10.6,0.0,26.1
1,2014-08,Central Coast,0.1,0.3,0.6,1.9,11.9,0.0,33.4
2,2014-09,Central Coast,0.1,0.2,0.6,2.0,12.5,0.0,24.1
3,2014-10,Central Coast,0.1,0.2,0.5,2.3,20.0,0.1,24.0
4,2014-11,Central Coast,0.1,0.0,0.3,2.4,20.8,0.1,25.0


## Output Dataset

### Missing Values as NaN

In [10]:
# Write both datasets unchanged, leaving missing values as they are in the
# dataframes; the row index is not written.
for frame, destination in (
    (df_genderless, 'missing-pollutant-values-as-NaN/data.csv'),
    (df_gendered, 'missing-pollutant-values-as-NaN/data-gendered.csv'),
):
    frame.to_csv(destination, index=False)

### Missing Values as 'NA'

In [11]:
# Build copies of both datasets in which every missing value is replaced by
# the literal string 'NA'.
na_marker = 'NA'
df_genderless_na = df_genderless.fillna(na_marker)
df_gendered_na = df_gendered.fillna(na_marker)

# Write the 'NA'-filled copies to csv files without the row index.
for frame, destination in (
    (df_genderless_na, 'missing-pollutant-values-as-NA/data.csv'),
    (df_gendered_na, 'missing-pollutant-values-as-NA/data-gendered.csv'),
):
    frame.to_csv(destination, index=False)

### Dropped Column if Missing Value

In [12]:
# Keep only the columns that contain no missing value at all.
df_genderless_drop_column = df_genderless.dropna(axis='columns', how='any')
df_gendered_drop_column = df_gendered.dropna(axis='columns', how='any')

# Write the column-reduced datasets to csv files without the row index.
for frame, destination in (
    (df_genderless_drop_column, 'dropped-column-if-missing-value/data.csv'),
    (df_gendered_drop_column, 'dropped-column-if-missing-value/data-gendered.csv'),
):
    frame.to_csv(destination, index=False)

### Dropped Row if Missing Value

In [13]:
# Keep only the rows that contain no missing value at all.
df_genderless_drop_row = df_genderless.dropna(axis='index', how='any')
df_gendered_drop_row = df_gendered.dropna(axis='index', how='any')

# Write the row-reduced datasets to csv files without the row index.
for frame, destination in (
    (df_genderless_drop_row, 'dropped-row-if-missing-value/data.csv'),
    (df_gendered_drop_row, 'dropped-row-if-missing-value/data-gendered.csv'),
):
    frame.to_csv(destination, index=False)

### Missing Pollutant Values Filled Somehow

In [14]:
# Function to fill missing values for a specific pollutant using a regression model
def fill_missing_values(data, target_column, features):
    """Fill the missing values of one column with linear-regression predictions.

    A ``LinearRegression`` model is trained on the rows where ``target_column``
    is present and used to predict it on the rows where it is missing. Gaps in
    the feature columns themselves are replaced by the per-column mean of the
    training rows (``SimpleImputer(strategy='mean')``) before fitting and
    before predicting.

    Args:
        data: DataFrame containing ``target_column`` and the feature columns.
            It is not modified.
        target_column: Name of the column whose missing values are filled.
        features: Iterable of column names to use as predictors. If it
            contains ``target_column`` that entry is ignored (see below).

    Returns:
        ``data`` itself when there is nothing to fill or nothing to learn from
        (no missing values, or no observed values); otherwise a copy of
        ``data`` with the missing ``target_column`` entries filled in.
    """
    # Never use the target as its own predictor. On the training rows it equals
    # the label exactly, so the model would learn the identity on that column;
    # on the rows to predict it is missing and mean-imputed, so every
    # prediction would collapse to the mean of the observed target values.
    feature_columns = [column for column in features if column != target_column]

    missing_mask = data[target_column].isna()

    # Nothing to fill, or no observed rows to train on: leave the data as is.
    if not missing_mask.any() or missing_mask.all():
        return data

    # Work on a copy so the caller's dataframe is left untouched.
    filled = data.copy()
    observed_rows = filled.loc[~missing_mask]

    # With no predictor left, a regression reduces to its intercept, which is
    # the mean of the observed target values.
    if not feature_columns:
        filled.loc[missing_mask, target_column] = observed_rows[target_column].mean()
        return filled

    # Impute gaps in the features using means learnt from the training rows.
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(observed_rows[feature_columns])
    y_train = observed_rows[target_column]

    # Train the regression model.
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict the missing target values, reusing the training-row feature means.
    X_missing = imputer.transform(filled.loc[missing_mask, feature_columns])
    filled.loc[missing_mask, target_column] = model.predict(X_missing)

    return filled


# Fill the gaps of each listed pollutant column in turn, in both datasets.
# Every column from the third onwards (everything after 'year-month' and 'lhd')
# is offered as a feature. The columns are processed in this order, so a later
# pollutant's model sees the values already filled in for the earlier ones.
for pollutant in ('CO ppm', 'PM10 µg/m³', 'SO2 pphm'):
    df_genderless = fill_missing_values(df_genderless, pollutant, df_genderless.columns[2:])
    df_gendered = fill_missing_values(df_gendered, pollutant, df_gendered.columns[2:])

# Write the filled datasets to csv files without the row index.
for frame, destination in (
    (df_genderless, 'missing-pollutant-values-filled-somehow/data.csv'),
    (df_gendered, 'missing-pollutant-values-filled-somehow/data-gendered.csv'),
):
    frame.to_csv(destination, index=False)