# Preprocessing

With all files prepared, some preprocessing is needed to make sure the files are in order for the actual modelling and predictions. The preprocessing steps in this notebook ensure that:
- The data is clean
- Units are correct and follow the International System of Units (SI)
- There are no missing files or values (checked in previous step)
- There are no NaN values that would mess with numerical calculations


## 1. Setup

In [1]:
# Imports
from PIL import Image
import numpy as np
import pandas as pd

In [29]:
# User Variables & Parameters
# Every name below is a notebook-level variable shared by the later cells.

# Control Parameters
# File-name prefixes and data directories; the reading cells combine them
# into '<DataPath><FileName><year>-<month>.tif'.
prec_FileName           = 'prec_'
tmin_FileName           = 'tmin_'
tmax_FileName           = 'tmax_'
prec_DataPath           = '../../WaterBucket/data/isb/prec/'
tmin_DataPath           = '../../WaterBucket/data/isb/tmin/'
tmax_DataPath           = '../../WaterBucket/data/isb/tmax/'
# Year and month bounds; the reading cells loop over them inclusively
# (range(start, end + 1)).
startYear               = 1961
endYear                 = 2018
numYears                = (endYear - startYear) + 1  # +1 because both end years count
fromMonth               = 1
toMonth                 = 12

# Reading Parameters
# *_FilePathList collects the generated file paths and *_FileList the
# images opened from those paths (both filled in section 2).
prec_FilePathList       = []
tmin_FilePathList       = []
tmax_FilePathList       = []
prec_FileList           = []
tmin_FileList           = []
tmax_FileList           = []

# Preprocessing Parameters
# Of these, only isNan is filled in the cells visible here (one boolean
# NaN mask per image, section 3). The others look like placeholders for
# later steps - TODO confirm against the rest of the notebook.
precDF                  = []
tminDF                  = []
tmaxDF                  = []
precNP                  = []
tminNP                  = []
tmaxNP                  = []
tempArr                 = []
isNan                   = []

## 2. Reading Files

### 2.1. Precipitation

In [30]:
# Filename generation for batch processing
# The list is rebuilt from scratch so that re-running this cell does not
# append a second copy of every path.
prec_FilePathList = []
for y in range(startYear, endYear + 1):
    for m in range(fromMonth, toMonth + 1):

        # Months 1-9 are zero-padded to 01-09 to match the file names
        monthName       = '{:02d}'.format(m)

        # Building the image path: <prefix><year>-<month>.tif
        # (the path is only generated here; it is not checked on disk)
        fileName        = '{}{}-{}.tif'.format(prec_FileName, y, monthName)
        imagePath       = prec_DataPath + fileName
        prec_FilePathList.append(imagePath)

In [31]:
# Reading all files into a list ~(1.2MB)
# Image.open() is lazy: it keeps the file handle open until the pixel data
# is read. Loading inside a `with` block reads the data and releases the
# handle straight away, instead of holding one open handle per file.
# NOTE(review): relies on the image staying usable after the `with` block,
# as shown in Pillow's file-handling docs - confirm with the installed
# Pillow version.
# The list is rebuilt so that re-running this cell does not duplicate entries.
prec_FileList = []
for filePath in prec_FilePathList:
    with Image.open(filePath) as currentImage:
        currentImage.load()
    prec_FileList.append(currentImage)

### 2.2. Minimum Temperature

In [32]:
# Filename generation for batch processing
# The list is rebuilt from scratch so that re-running this cell does not
# append a second copy of every path.
tmin_FilePathList = []
for y in range(startYear, endYear + 1):
    for m in range(fromMonth, toMonth + 1):

        # Months 1-9 are zero-padded to 01-09 to match the file names
        monthName       = '{:02d}'.format(m)

        # Building the image path: <prefix><year>-<month>.tif
        # (the path is only generated here; it is not checked on disk)
        fileName        = '{}{}-{}.tif'.format(tmin_FileName, y, monthName)
        imagePath       = tmin_DataPath + fileName
        tmin_FilePathList.append(imagePath)

In [33]:
# Reading all files into a list ~(1.2MB)
# Image.open() is lazy: it keeps the file handle open until the pixel data
# is read. Loading inside a `with` block reads the data and releases the
# handle straight away, instead of holding one open handle per file.
# NOTE(review): relies on the image staying usable after the `with` block,
# as shown in Pillow's file-handling docs - confirm with the installed
# Pillow version.
# The list is rebuilt so that re-running this cell does not duplicate entries.
tmin_FileList = []
for filePath in tmin_FilePathList:
    with Image.open(filePath) as currentImage:
        currentImage.load()
    tmin_FileList.append(currentImage)

### 2.3. Maximum Temperature

In [34]:
# Filename generation for batch processing
# The list is rebuilt from scratch so that re-running this cell does not
# append a second copy of every path.
tmax_FilePathList = []
for y in range(startYear, endYear + 1):
    for m in range(fromMonth, toMonth + 1):

        # Months 1-9 are zero-padded to 01-09 to match the file names
        monthName       = '{:02d}'.format(m)

        # Building the image path: <prefix><year>-<month>.tif
        # (the path is only generated here; it is not checked on disk)
        fileName        = '{}{}-{}.tif'.format(tmax_FileName, y, monthName)
        imagePath       = tmax_DataPath + fileName
        tmax_FilePathList.append(imagePath)

In [35]:
# Reading all files into a list ~(1.2MB)
# Image.open() is lazy: it keeps the file handle open until the pixel data
# is read. Loading inside a `with` block reads the data and releases the
# handle straight away, instead of holding one open handle per file.
# NOTE(review): relies on the image staying usable after the `with` block,
# as shown in Pillow's file-handling docs - confirm with the installed
# Pillow version.
# The list is rebuilt so that re-running this cell does not duplicate entries.
tmax_FileList = []
for filePath in tmax_FilePathList:
    with Image.open(filePath) as currentImage:
        currentImage.load()
    tmax_FileList.append(currentImage)

## 3. NaN Value Elimination

In [37]:
# NaN check over every precipitation, tmin and tmax image.
# isNan ends up holding one boolean mask per image (True where the pixel is
# NaN), in the order prec -> tmin -> tmax. It is rebuilt here so that
# re-running this cell does not stack a second set of masks on top.
isNan = []

# Converting precipitation files to np arrays
for prec in prec_FileList:
    npPrec = np.array(prec)
    isNan.append(np.isnan(npPrec))

# Converting minimum temperature files to np arrays
for tmin in tmin_FileList:
    npTmin = np.array(tmin)
    isNan.append(np.isnan(npTmin))

# Converting maximum temperature files to np arrays
for tmax in tmax_FileList:
    npTmax = np.array(tmax)
    isNan.append(np.isnan(npTmax))

# Searching for any NaN Value
# The NaNs are counted with NumPy instead of a Python loop over every pixel.
# This is fast when there are no NaNs, works for arrays of any dimension,
# and avoids a loop variable named `list`, which would shadow the builtin
# for the rest of the notebook. One line is printed per NaN found.
for nanMask in isNan:
    for _ in range(int(np.count_nonzero(nanMask))):
        print("Found NaN Value!")