# Climate Indices Analysis for Subseasonal Forecast

### Analysis of Climate Indices:
1. El Niño-Southern Oscillation (ENSO) -- Southern Oscillation Index (SOI)
2. North Atlantic Oscillation (NAO)
3. Pacific/North American pattern (PNA)
4. Tropical/Northern Hemisphere pattern (TNH)
5. Eastern Pacific Oscillation (EPO). The EPO is now named East Pacific/North Pacific Oscillation (EP/NP).



In [1]:
# Importing  necessary libraries 

import pandas as pd
from pylab import *
import numpy as np
from matplotlib import dates
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import pylab
import sys
import matplotlib.dates as mdates
from glob import glob

### 1.Data Preprocessing/Data Wrangling
The climate index data were downloaded from online sources and saved locally as text/CSV files.

#### 1.1. Reading NAO data 

In [2]:
# Read the wide-format NAO table: one row per year, one column per calendar month.
# Column labels and dtypes are generated from a single list so that every month
# ('01'..'12') gets an explicit float64 dtype.
month_cols = ['%02d' % m for m in range(1, 13)]
col_dtypes = {m: np.float64 for m in month_cols}
col_dtypes['Year'] = np.int64

nao = pd.read_table(
    'C:/Users/santh/MIDAS/data/nao_wide.txt',
    sep=r'\s+',  # whitespace-delimited file (equivalent to the deprecated delim_whitespace=True)
    names=['Year'] + month_cols,
    dtype=col_dtypes,
)
nao.head()

#import urllib2
#url = 'http://www.esrl.noaa.gov/psd/data/correlation/amon.us.data'
#req = urllib2.Request ( url )
#raw_data = urllib2.urlopen(req).readlines()
#raw_data[-10:-4]

Unnamed: 0,Year,01,02,03,04,05,06,07,08,09,10,11,12
0,1950,0.92,0.4,-0.36,0.73,-0.59,-0.06,-1.26,-0.05,0.25,0.85,-1.26,-1.02
1,1951,0.08,0.7,-1.02,-0.22,-0.59,-1.64,1.37,-0.22,-1.36,1.87,-0.39,1.32
2,1952,0.93,-0.83,-1.49,1.01,-1.12,-0.4,-0.09,-0.28,-0.54,-0.73,-1.13,-0.43
3,1953,0.33,-0.49,-0.04,-1.67,-0.66,1.09,0.4,-0.71,-0.35,1.32,1.04,-0.47
4,1954,0.37,0.74,-0.83,1.34,-0.09,-0.25,-0.6,-1.9,-0.44,0.6,0.4,0.69


In [3]:
# NAO arrives in wide form (one column per month); reshape it to one row per (Year, Month).
months = nao.columns[1:13].tolist()

# melt() stacks the month columns; the stacked label/value columns are named directly.
nao = nao.melt(id_vars=['Year'], value_vars=months, var_name='Month', value_name='NAO')
nao.head()

Unnamed: 0,Year,Month,NAO
0,1950,1,0.92
1,1951,1,0.08
2,1952,1,0.93
3,1953,1,0.33
4,1954,1,0.37


In [4]:
# Combine 'Year' and the zero-padded 'Month' label into a pandas datetime column
# (e.g. "1950" + "01" -> 1950-01-01).
# pd.to_datetime converts the whole column at once, and no variable named `dates`
# is bound here, so the `dates` module imported from matplotlib stays usable.
nao["dateTime"] = pd.to_datetime(
    nao["Year"].astype(str) + nao["Month"].astype(str), format='%Y%m'
)
nao.head()

Unnamed: 0,Year,Month,NAO,dateTime
0,1950,1,0.92,1950-01-01
1,1951,1,0.08,1951-01-01
2,1952,1,0.93,1952-01-01
3,1953,1,0.33,1953-01-01
4,1954,1,0.37,1954-01-01


In [5]:
# After melting, rows are grouped month-by-month; put them in chronological order.
nao = nao.sort_values('dateTime')
nao.head()

Unnamed: 0,Year,Month,NAO,dateTime
0,1950,1,0.92,1950-01-01
72,1950,2,0.4,1950-02-01
144,1950,3,-0.36,1950-03-01
216,1950,4,0.73,1950-04-01
288,1950,5,-0.59,1950-05-01


In [6]:
# Show the last rows of the NAO frame to see where the series ends.
nao.tail()

Unnamed: 0,Year,Month,NAO,dateTime
575,2021,8,,2021-08-01
647,2021,9,,2021-09-01
719,2021,10,,2021-10-01
791,2021,11,,2021-11-01
863,2021,12,,2021-12-01


#### 1.2. Reading PNA data 

In [7]:
# Read the wide-format PNA table: one row per year, one column per calendar month.
# Column labels and dtypes are generated from a single list so that every month
# ('01'..'12') gets an explicit float64 dtype.
month_cols = ['%02d' % m for m in range(1, 13)]
col_dtypes = {m: np.float64 for m in month_cols}
col_dtypes['Year'] = np.int64

pna = pd.read_table(
    'C:/Users/santh/MIDAS/data/pna_wide.txt',
    sep=r'\s+',  # whitespace-delimited file (equivalent to the deprecated delim_whitespace=True)
    names=['Year'] + month_cols,
    dtype=col_dtypes,
)
pna.head()

Unnamed: 0,Year,01,02,03,04,05,06,07,08,09,10,11,12
0,1950,-2.34,-1.04,0.24,0.01,-0.55,-1.97,-0.43,0.82,-1.15,0.49,-1.81,0.02
1,1951,-0.41,-1.36,-0.81,0.73,1.26,-0.61,-0.02,0.05,-0.05,0.24,0.23,-1.29
2,1952,-1.98,0.28,-0.4,1.16,-0.99,0.39,-1.23,0.8,0.53,0.8,0.82,0.93
3,1953,0.65,1.0,-0.15,1.05,1.2,-1.6,0.74,-1.12,-0.12,1.91,1.65,1.27
4,1954,-1.14,-0.14,-1.17,-2.7,0.64,-0.02,0.3,-0.08,0.0,0.75,1.45,-0.52


In [8]:
# PNA arrives in wide form (one column per month); reshape it to one row per (Year, Month).
months = list(pna.columns)[1:13]
pna = pd.melt(pna, id_vars=['Year'], value_vars=months)
pna.rename(columns={'variable': 'Month', 'value': 'PNA'}, inplace=True)

# Combine 'Year' and the zero-padded 'Month' label into a pandas datetime column
# (e.g. "1950" + "01" -> 1950-01-01).
# pd.to_datetime converts the whole column at once, and no variable named `dates`
# is bound here, so the `dates` module imported from matplotlib stays usable.
pna["dateTime"] = pd.to_datetime(
    pna["Year"].astype(str) + pna["Month"].astype(str), format='%Y%m'
)

# After melting, rows are grouped month-by-month; put them in chronological order.
pna = pna.sort_values(by='dateTime')
pna.head()

Unnamed: 0,Year,Month,PNA,dateTime
0,1950,1,-2.34,1950-01-01
72,1950,2,-1.04,1950-02-01
144,1950,3,0.24,1950-03-01
216,1950,4,0.01,1950-04-01
288,1950,5,-0.55,1950-05-01


In [9]:
# Show the last rows of the PNA frame to see where the series ends.
pna.tail()

Unnamed: 0,Year,Month,PNA,dateTime
575,2021,8,,2021-08-01
647,2021,9,,2021-09-01
719,2021,10,,2021-10-01
791,2021,11,,2021-11-01
863,2021,12,,2021-12-01


In [10]:
# 'Year' and 'Month' are now redundant with 'dateTime': drop them from both
# frames and use 'dateTime' as the index.
nao = nao.drop(columns=['Year', 'Month']).set_index('dateTime')
pna = pna.drop(columns=['Year', 'Month']).set_index('dateTime')

nao.head()

Unnamed: 0_level_0,NAO
dateTime,Unnamed: 1_level_1
1950-01-01,0.92
1950-02-01,0.4
1950-03-01,-0.36
1950-04-01,0.73
1950-05-01,-0.59


In [12]:
# Confirm that PNA is now indexed by dateTime with a single value column.
pna.head()

Unnamed: 0_level_0,PNA
dateTime,Unnamed: 1_level_1
1950-01-01,-2.34
1950-02-01,-1.04
1950-03-01,0.24
1950-04-01,0.01
1950-05-01,-0.55


In [13]:
# Start the combined climate-index frame by joining NAO and PNA on their
# shared 'dateTime' index (default inner join).
indeces = nao.merge(pna, on='dateTime')
indeces.head()

Unnamed: 0_level_0,NAO,PNA
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1
1950-01-01,0.92,-2.34
1950-02-01,0.4,-1.04
1950-03-01,-0.36,0.24
1950-04-01,0.73,0.01
1950-05-01,-0.59,-0.55


In [15]:
# Show the last 10 rows of the merged frame to see where NAO/PNA values stop.
indeces.tail(10)

Unnamed: 0_level_0,NAO,PNA
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-03-01,0.73,-0.97
2021-04-01,-1.43,-1.05
2021-05-01,-1.24,-1.35
2021-06-01,0.77,0.67
2021-07-01,,
2021-08-01,,
2021-09-01,,
2021-10-01,,
2021-11-01,,
2021-12-01,,


#Observation - NAO and PNA have data only through 2021-06-01; values from 2021-07-01 onward are missing (NaN)

#### 1.3. Read TNH data 

In [16]:
# Read the wide-format TNH table: one row per year, one column per calendar month.
# The first 6 lines of the file are skipped before parsing.
# Column labels and dtypes are generated from a single list so that every month
# ('01'..'12') gets an explicit float64 dtype.
month_cols = ['%02d' % m for m in range(1, 13)]
col_dtypes = {m: np.float64 for m in month_cols}
col_dtypes['Year'] = np.int64

tnh = pd.read_table(
    'C:/Users/santh/MIDAS/data/tnh_wide.txt',
    sep=r'\s+',  # whitespace-delimited file (equivalent to the deprecated delim_whitespace=True)
    skiprows=6,
    names=['Year'] + month_cols,
    dtype=col_dtypes,
)
tnh.head()

Unnamed: 0,Year,01,02,03,04,05,06,07,08,09,10,11,12
0,1950,0.55,-1.07,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-0.94
1,1951,-0.09,-1.3,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,0.58
2,1952,-0.21,0.07,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-1.89
3,1953,-0.74,1.04,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,1.4
4,1954,-0.3,-1.4,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-999.9,-1.18


In [17]:
# The TNH file marks missing months with -999.9; convert those entries to NaN.
tnh = tnh.replace(-999.9, np.nan)
tnh.tail()

Unnamed: 0,Year,01,02,03,04,05,06,07,08,09,10,11,12
67,2017,-0.34,-0.11,,,,,,,,,,1.04
68,2018,-0.33,2.2,,,,,,,,,,-0.62
69,2019,0.34,1.64,,,,,,,,,,-0.22
70,2020,-0.87,1.69,,,,,,,,,,0.17
71,2021,-0.07,1.27,,,,,,,,,,


#Observation - TNH has data for only 3 months (January, February and December) of each year. For 2021, it has data only through February

In [18]:
# TNH arrives in wide form (one column per month); reshape it to one row per (Year, Month).
months = list(tnh.columns)[1:13]
tnh = pd.melt(tnh, id_vars=['Year'], value_vars=months)
tnh.rename(columns={'variable': 'Month', 'value': 'TNH'}, inplace=True)

# Combine 'Year' and the zero-padded 'Month' label into a pandas datetime column
# (e.g. "1950" + "01" -> 1950-01-01).
# pd.to_datetime converts the whole column at once, and no variable named `dates`
# is bound here, so the `dates` module imported from matplotlib stays usable.
tnh["dateTime"] = pd.to_datetime(
    tnh["Year"].astype(str) + tnh["Month"].astype(str), format='%Y%m'
)

# After melting, rows are grouped month-by-month; put them in chronological order.
tnh = tnh.sort_values(by='dateTime')

# 'Year' and 'Month' are now redundant with 'dateTime': drop them and index by dateTime.
tnh = tnh.drop(['Year', 'Month'], axis=1)
tnh = tnh.set_index('dateTime')
tnh.head()

Unnamed: 0_level_0,TNH
dateTime,Unnamed: 1_level_1
1950-01-01,0.55
1950-02-01,-1.07
1950-03-01,
1950-04-01,
1950-05-01,


In [22]:
# Show the last 25 rows of the TNH series to see which recent months have values.
tnh.tail(25)

Unnamed: 0_level_0,TNH
dateTime,Unnamed: 1_level_1
2019-12-01,-0.22
2020-01-01,-0.87
2020-02-01,1.69
2020-03-01,
2020-04-01,
2020-05-01,
2020-06-01,
2020-07-01,
2020-08-01,
2020-09-01,


In [23]:
# Add the TNH column to the combined climate-index frame, joining on the
# shared 'dateTime' index (default inner join).
indeces = indeces.merge(tnh, on='dateTime')
indeces.head()

Unnamed: 0_level_0,NAO,PNA,TNH
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1950-01-01,0.92,-2.34,0.55
1950-02-01,0.4,-1.04,-1.07
1950-03-01,-0.36,0.24,
1950-04-01,0.73,0.01,
1950-05-01,-0.59,-0.55,


#### 1.4. Read EPNP data

In [27]:
# Read the wide-format EP/NP table: one row per year, one column per calendar month.
# The last 3 lines of the file are skipped (skipfooter=3).
# Column labels and dtypes are generated from a single list so that every month
# ('01'..'12') gets an explicit float64 dtype.
month_cols = ['%02d' % m for m in range(1, 13)]
col_dtypes = {m: np.float64 for m in month_cols}
col_dtypes['Year'] = np.int64

epnp = pd.read_table(
    'C:/Users/santh/MIDAS/data/epnp_wide.data',
    sep=r'\s+',  # whitespace-delimited file (equivalent to the deprecated delim_whitespace=True)
    header=None,
    skipfooter=3,
    engine='python',  # skipfooter is not supported by the default C parser
    names=['Year'] + month_cols,
    dtype=col_dtypes,
)
epnp.head()

  epnp = pd.read_table('C:/Users/santh/MIDAS/data/epnp_wide.data', delim_whitespace=True, header=None, skipfooter=3, names=('Year','01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'),


Unnamed: 0,Year,01,02,03,04,05,06,07,08,09,10,11,12
0,1948,2021.0,,,,,,,,,,,
1,1948,-99.9,-99.9,-99.9,-99.9,-99.9,-99.9,-99.9,-99.9,-99.9,-99.9,-99.9,-99.9
2,1949,-99.9,-99.9,-99.9,-99.9,-99.9,-99.9,-99.9,-99.9,-99.9,-99.9,-99.9,-99.9
3,1950,0.91,-1.13,-0.02,-1.87,-0.98,-0.47,1.38,1.87,0.98,-0.88,-0.39,-99.9
4,1951,-0.81,-1.32,0.2,0.02,-0.04,-0.62,0.98,0.41,1.42,-0.11,0.62,-99.9


#Observation - EPNP has no data for years before 1950

In [28]:
# Keep only rows from 1950 onwards (row labels of the kept rows are unchanged).
epnp = epnp[epnp['Year'] >= 1950]

# The EP/NP file marks missing months with -99.9; convert those entries to NaN.
# (-99.9 and -99.90 are the same float, so a single replace covers both spellings.)
epnp = epnp.replace(-99.9, np.nan)
epnp.head()

Unnamed: 0,Year,01,02,03,04,05,06,07,08,09,10,11,12
3,1950,0.91,-1.13,-0.02,-1.87,-0.98,-0.47,1.38,1.87,0.98,-0.88,-0.39,
4,1951,-0.81,-1.32,0.2,0.02,-0.04,-0.62,0.98,0.41,1.42,-0.11,0.62,
5,1952,1.59,-1.09,-2.13,-2.44,0.06,-0.25,1.3,1.35,0.35,1.1,-0.62,
6,1953,0.24,-1.51,-2.29,-0.07,-0.63,3.88,0.99,-0.54,0.12,-0.52,-2.15,
7,1954,0.6,-1.75,1.51,0.08,0.2,-1.64,-1.79,0.83,0.31,-0.53,-0.48,


In [29]:
# Show the last rows of the wide EP/NP table to see which recent months have values.
epnp.tail()

Unnamed: 0,Year,01,02,03,04,05,06,07,08,09,10,11,12
70,2017,0.39,0.21,-1.04,1.04,-0.72,0.5,0.0,-1.65,-0.46,-0.58,0.41,
71,2018,0.66,0.23,0.3,-0.18,-0.98,0.05,-0.16,-0.78,-0.83,1.17,1.42,
72,2019,1.07,1.2,2.11,0.67,0.32,1.74,0.1,-1.18,-1.5,-0.47,1.93,
73,2020,-0.6,-1.79,0.44,1.54,0.04,-0.69,-1.97,-2.39,0.06,0.57,-0.66,
74,2021,-0.71,-0.8,-1.29,0.8,0.0,-0.26,-1.28,,,,,


#EPNP has no data for December in any year. For 2021, EPNP has data through July.

In [31]:
# EP/NP arrives in wide form (one column per month); reshape it to one row per (Year, Month).
months = list(epnp.columns)[1:13]
epnp = pd.melt(epnp, id_vars=['Year'], value_vars=months)
epnp.rename(columns={'variable': 'Month', 'value': 'EPNP'}, inplace=True)

# Combine 'Year' and the zero-padded 'Month' label into a pandas datetime column
# (e.g. "1950" + "01" -> 1950-01-01).
# pd.to_datetime converts the whole column at once, and no variable named `dates`
# is bound here, so the `dates` module imported from matplotlib stays usable.
epnp["dateTime"] = pd.to_datetime(
    epnp["Year"].astype(str) + epnp["Month"].astype(str), format='%Y%m'
)

# After melting, rows are grouped month-by-month; put them in chronological order.
epnp = epnp.sort_values(by='dateTime')

# 'Year' and 'Month' are now redundant with 'dateTime': drop them and index by dateTime.
epnp = epnp.drop(['Year', 'Month'], axis=1)
epnp = epnp.set_index('dateTime')
epnp.head()

Unnamed: 0_level_0,EPNP
dateTime,Unnamed: 1_level_1
1950-01-01,0.91
1950-02-01,-1.13
1950-03-01,-0.02
1950-04-01,-1.87
1950-05-01,-0.98


In [33]:
# Show the last 15 rows of the long EP/NP series to see where the values stop.
epnp.tail(15)

Unnamed: 0_level_0,EPNP
dateTime,Unnamed: 1_level_1
2020-10-01,0.57
2020-11-01,-0.66
2020-12-01,
2021-01-01,-0.71
2021-02-01,-0.8
2021-03-01,-1.29
2021-04-01,0.8
2021-05-01,0.0
2021-06-01,-0.26
2021-07-01,-1.28


In [34]:
# Add the EPNP column to the combined climate-index frame, joining on the
# shared 'dateTime' index (default inner join).
indeces = indeces.merge(epnp, on='dateTime')
indeces.head()

Unnamed: 0_level_0,NAO,PNA,TNH,EPNP
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1950-01-01,0.92,-2.34,0.55,0.91
1950-02-01,0.4,-1.04,-1.07,-1.13
1950-03-01,-0.36,0.24,,-0.02
1950-04-01,0.73,0.01,,-1.87
1950-05-01,-0.59,-0.55,,-0.98


In [36]:
# Show the last 10 rows of the merged frame to compare where each index stops.
indeces.tail(10)

Unnamed: 0_level_0,NAO,PNA,TNH,EPNP
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-03-01,0.73,-0.97,,-1.29
2021-04-01,-1.43,-1.05,,0.8
2021-05-01,-1.24,-1.35,,0.0
2021-06-01,0.77,0.67,,-0.26
2021-07-01,,,,-1.28
2021-08-01,,,,
2021-09-01,,,,
2021-10-01,,,,
2021-11-01,,,,
2021-12-01,,,,


#### 1.5. Read SOI data 

In [42]:
#soi = pd.read_table('C:/Users/santh/MIDAS/data/soi_long.csv', delim_whitespace=",", header = None)


# The SOI file is comma-separated and already in long form: skip its first two
# lines and label the two columns explicitly.
soi = pd.read_csv(
    'C:/Users/santh/MIDAS/data/soi_long.csv',
    skiprows=2,
    names=['Year', 'SOI'],
)
soi.head()

Unnamed: 0,Year,SOI
0,1951-01,1.5
1,1951-02,0.9
2,1951-03,-0.1
3,1951-04,-0.3
4,1951-05,-0.7


In [43]:
# Show the last rows of the SOI frame to see where the series ends.
soi.tail()

Unnamed: 0,Year,SOI
842,2021-03,0.4
843,2021-04,0.3
844,2021-05,0.5
845,2021-06,0.4
846,2021-07,1.4


#Observation - For 2021, SOI has data till July

In [44]:
# Check column dtypes and non-null counts before converting types.
soi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 847 entries, 0 to 846
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Year    847 non-null    object 
 1   SOI     847 non-null    float64
dtypes: float64(1), object(1)
memory usage: 13.4+ KB


In [45]:
# Make the column types explicit: 'SOI' as a number and 'Year' as timestamps.
soi = soi.assign(
    SOI=pd.to_numeric(soi["SOI"]),
    Year=pd.to_datetime(soi["Year"]),
)
soi["Year"]


0     1951-01-01
1     1951-02-01
2     1951-03-01
3     1951-04-01
4     1951-05-01
         ...    
842   2021-03-01
843   2021-04-01
844   2021-05-01
845   2021-06-01
846   2021-07-01
Name: Year, Length: 847, dtype: datetime64[ns]

In [46]:
# Give the timestamp column the same name as in the other index frames
# ('dateTime') and make it the index.
soi = soi.rename(columns={'Year': 'dateTime'}).set_index('dateTime')
soi.head(5)

Unnamed: 0_level_0,SOI
dateTime,Unnamed: 1_level_1
1951-01-01,1.5
1951-02-01,0.9
1951-03-01,-0.1
1951-04-01,-0.3
1951-05-01,-0.7


In [48]:
# Add the SOI column to the combined climate-index frame. The merge uses the
# default inner join on 'dateTime', so only dates present in both frames are kept.
indeces = indeces.merge(soi, on='dateTime')
indeces.head()

Unnamed: 0_level_0,NAO,PNA,TNH,EPNP,SOI
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1951-01-01,0.08,-0.41,-0.09,-0.81,1.5
1951-02-01,0.7,-1.36,-1.3,-1.32,0.9
1951-03-01,-1.02,-0.81,,0.2,-0.1
1951-04-01,-0.22,0.73,,0.02,-0.3
1951-05-01,-0.59,1.26,,-0.04,-0.7


In [49]:
# Show the last 20 rows of the final merged climate-index frame.
indeces.tail(20)

Unnamed: 0_level_0,NAO,PNA,TNH,EPNP,SOI
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-12-01,1.2,0.18,-0.22,,-0.6
2020-01-01,1.34,-0.24,-0.87,-0.6,0.2
2020-02-01,1.26,0.17,1.69,-1.79,-0.1
2020-03-01,1.01,-2.17,,0.44,-0.1
2020-04-01,-1.02,-1.18,,1.54,0.2
2020-05-01,-0.41,0.21,,0.04,0.4
2020-06-01,-0.15,0.7,,-0.69,-0.4
2020-07-01,-1.23,1.73,,-1.97,0.4
2020-08-01,0.12,1.82,,-2.39,1.1
2020-09-01,0.98,0.75,,0.06,0.9


In [50]:
# Write the combined climate-index frame to CSV, keeping the dateTime index as a column.
output_path = 'C:/Users/santh/MIDAS/data/Climate_indices.csv'
indeces.to_csv(output_path, index=True)