## Setup

In [1]:
# Loading packages and their components
import pandas as pd
import numpy as np
import pickle
# Notebook-wide pandas configuration
pd.set_option('display.max_rows', 999)  # generous row limit while debugging; can be removed later
pd.set_option('mode.chained_assignment', None)  # silence the chained-assignment warnings

In [11]:
def import_and_preproc(data_dir='data'):
    """Load the three dengue CSV files and split them into per-city frames.

    Reads ``dengue_features_train.csv``, ``dengue_features_test.csv`` and
    ``dengue_labels_train.csv`` from ``data_dir``, splits each of them by the
    ``city`` column into a San Juan (``'sj'``) and an Iquitos (``'iq'``) part,
    copies ``week_start_date`` from the training features onto the training
    labels, converts that column to datetime and uses it as the index.

    Parameters
    ----------
    data_dir : str or path-like, default 'data'
        Directory that contains the three CSV files.

    Returns
    -------
    list of pandas.DataFrame
        Six frames in this order: sj features (train), sj features (test),
        sj labels (train), iq features (train), iq features (test),
        iq labels (train). Each is indexed by ``week_start_date``.
    """
    filenames = (
        'dengue_features_train.csv',
        'dengue_features_test.csv',
        'dengue_labels_train.csv',
    )
    raw_data = [pd.read_csv('{}/{}'.format(data_dir, name)) for name in filenames]

    per_city = {}
    for city in ('sj', 'iq'):
        # Explicit copies: the frames get a column assigned below, which must
        # not happen on a filtered slice of the raw data (chained assignment).
        features_train, features_test, labels_train = (
            frame[frame.city == city].copy() for frame in raw_data
        )

        # The labels file has no date column; take it from the training
        # features. The join is on the row index.
        # NOTE(review): assumes both training CSVs list their rows in the
        # same order - verify against the data.
        labels_train = labels_train.join(features_train['week_start_date'])

        indexed = []
        for frame in (features_train, features_test, labels_train):
            frame['week_start_date'] = pd.to_datetime(
                frame['week_start_date'], format='%Y-%m-%d'
            )
            indexed.append(frame.set_index('week_start_date', drop=True))
        per_city[city] = indexed

    return per_city['sj'] + per_city['iq']

# Order of the returned list: San Juan features (train), features (test), labels (train),
# followed by the same three frames for Iquitos.
data_subsets = import_and_preproc()

## Missing value imputation
Since the environmental values for each week are assumed to follow seasonal patterns, they cannot simply be replaced with the mean over the entire study period. Instead, a missing value is replaced with the mean of the values from the week before and the week after or, if those are missing as well, from the nearest earlier and later weeks that do have a value.

In [12]:
# Environmental measurement columns that are passed to the missing-value imputation
environmental_vars = [
    # vegetation index columns
    'ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw',
    # precipitation_* column
    'precipitation_amt_mm',
    # reanalysis_* columns
    'reanalysis_air_temp_k', 'reanalysis_avg_temp_k',
    'reanalysis_dew_point_temp_k',
    'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k',
    'reanalysis_precip_amt_kg_per_m2',
    'reanalysis_relative_humidity_percent',
    'reanalysis_sat_precip_amt_mm',
    'reanalysis_specific_humidity_g_per_kg',
    'reanalysis_tdtr_k',
    # station_* columns
    'station_avg_temp_c', 'station_diur_temp_rng_c',
    'station_max_temp_c', 'station_min_temp_c',
    'station_precip_mm',
]

In [13]:
def replace_missing(df, colnames):
    """Fill missing values with the mean of the nearest non-missing neighbours.

    For every column in ``colnames`` that exists in ``df``, each missing entry
    is replaced by the mean of the closest non-missing value before it and the
    closest non-missing value after it. Gaps are processed from top to bottom,
    so inside a run of consecutive missing rows an already imputed value acts
    as the "before" neighbour of the next row. If a neighbour exists on one
    side only (gap at the start or end of the column), that single value is
    used. A column without any non-missing value is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified; a new frame is returned.
    colnames : iterable of str
        Columns to impute. Names that are not columns of ``df`` are skipped,
        because not every subset of the dataset contains every column.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and columns as ``df`` and the gaps filled.
    """
    # Work on a positional 0..n-1 index; the original index is restored below.
    date = df.index
    df = df.reset_index(drop=True)

    for colname in colnames:
        if colname not in df.columns:
            continue

        # Standalone copy of the column: values are written to it directly and
        # it is assigned back once, which avoids chained assignment on the frame.
        column = df[colname].copy()
        missing_positions = np.flatnonzero(column.isnull().to_numpy())

        for idx in missing_positions:
            # Nearest non-missing value before / after the gap (0 or 1 element each)
            before = column.iloc[:idx].dropna().tail(1)
            after = column.iloc[idx + 1:].dropna().head(1)
            neighbours = [side.iloc[0] for side in (before, after) if not side.empty]
            if neighbours:
                column.iloc[idx] = np.mean(neighbours)

        df[colname] = column

    # Re-attach the original index
    df = df.set_index(date, drop=True)
    return df

In [14]:
# NOTE(review): looks like a leftover exploratory cell - it only builds an enumerate
# object over data_subsets and displays its repr; nothing is assigned or modified.
enumerate(data_subsets)

<enumerate at 0x7ff36eb23200>

In [15]:
# Impute every subset and store the result back at the same list position
for position, subset in enumerate(data_subsets):
    data_subsets[position] = replace_missing(subset, environmental_vars)

Check if there are still variables with missing values in our dataset.

In [16]:
# Print the per-column count of remaining missing values for each subset
divider = '---'*10
for subset in data_subsets:
    remaining_missing = subset.isnull().sum()
    print(remaining_missing)
    print(divider)

city                                     0
year                                     0
weekofyear                               0
ndvi_ne                                  0
ndvi_nw                                  0
ndvi_se                                  0
ndvi_sw                                  0
precipitation_amt_mm                     0
reanalysis_air_temp_k                    0
reanalysis_avg_temp_k                    0
reanalysis_dew_point_temp_k              0
reanalysis_max_air_temp_k                0
reanalysis_min_air_temp_k                0
reanalysis_precip_amt_kg_per_m2          0
reanalysis_relative_humidity_percent     0
reanalysis_sat_precip_amt_mm             0
reanalysis_specific_humidity_g_per_kg    0
reanalysis_tdtr_k                        0
station_avg_temp_c                       0
station_diur_temp_rng_c                  0
station_max_temp_c                       0
station_min_temp_c                       0
station_precip_mm                        0
dtype: int6

In [17]:
# Persist the cleaned subsets. The context manager guarantees that the file
# handle is flushed and closed, even if pickling raises.
with open('cleaned_data.pickle', 'wb') as pickle_file:
    pickle.dump(data_subsets, pickle_file)

In [18]:
# Unpack the six subsets into individually named frames
(
    sj_features_train,
    sj_features_test,
    sj_labels_train,
    iq_features_train,
    iq_features_test,
    iq_labels_train,
) = data_subsets