In [7]:
import netCDF4 as nc
import numpy as np
import pandas as pd

# 训练数据

In [9]:
# Location of the SODA training file, relative to the notebook's working directory.
FILEPATH = 'data/SODA_train.nc'
# Open the dataset read-only; the handle stays open because later cells read from it.
FILE = nc.Dataset(FILEPATH, mode='r')

In [10]:
# Echo the dataset handle to see its summary: data model, dimensions, variables and groups.
FILE

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    dimensions(sizes): year(100), month(36), lat(24), lon(72)
    variables(dimensions): float32 sst(year, month, lat, lon), float32 t300(year, month, lat, lon), float64 ua(year, month, lat, lon), float64 va(year, month, lat, lon), int32 year(year), int32 month(month), float64 lat(lat), float64 lon(lon)
    groups: 

### 变量含义（按文件中的顺序）：温度异常 sst，热含量异常 t300，纬向风异常 ua，经向风异常 va，年 year，月 month，纬度 lat，经度 lon

In [11]:
# List the names of every variable stored in the dataset.
FILE.variables.keys()

dict_keys(['sst', 't300', 'ua', 'va', 'year', 'month', 'lat', 'lon'])

### 数据维度 ：year, month, lat, lon

In [12]:
# Show the metadata of every variable (dtype, dimensions, _FillValue, shape).
# NOTE(review): this repr is long; consider inspecting one variable at a time instead.
FILE.variables

{'sst': <class 'netCDF4._netCDF4.Variable'>
 float32 sst(year, month, lat, lon)
     _FillValue: nan
 unlimited dimensions: 
 current shape = (100, 36, 24, 72)
 filling on, 't300': <class 'netCDF4._netCDF4.Variable'>
 float32 t300(year, month, lat, lon)
     _FillValue: nan
 unlimited dimensions: 
 current shape = (100, 36, 24, 72)
 filling on, 'ua': <class 'netCDF4._netCDF4.Variable'>
 float64 ua(year, month, lat, lon)
     _FillValue: nan
 unlimited dimensions: 
 current shape = (100, 36, 24, 72)
 filling on, 'va': <class 'netCDF4._netCDF4.Variable'>
 float64 va(year, month, lat, lon)
     _FillValue: nan
 unlimited dimensions: 
 current shape = (100, 36, 24, 72)
 filling on, 'year': <class 'netCDF4._netCDF4.Variable'>
 int32 year(year)
 unlimited dimensions: 
 current shape = (100,)
 filling on, default _FillValue of -2147483647 used, 'month': <class 'netCDF4._netCDF4.Variable'>
 int32 month(month)
 unlimited dimensions: 
 current shape = (36,)
 filling on, default _FillValue of -21

### 查看温度异常数据

In [13]:
# Grab the sst variable handle through the variables mapping, the same access
# pattern the lat / lon / year cells use.
sst = FILE.variables['sst']

In [14]:
# Echo the variable handle to see its metadata (dtype, dimensions, fill value, shape).
sst

<class 'netCDF4._netCDF4.Variable'>
float32 sst(year, month, lat, lon)
    _FillValue: nan
unlimited dimensions: 
current shape = (100, 36, 24, 72)
filling on

In [15]:
# Slice the whole variable to pull all of its values into an in-memory array.
sst_data = sst[:]

In [16]:
# Confirm the array layout: (year, month, lat, lon).
sst_data.shape

(100, 36, 24, 72)

### 取出这个维度的数据

另外，如果数据是以打包形式存储的（变量带有 scale_factor / add_offset 属性），则：真实值 = 打包值 * scale_factor + add_offset（netCDF4 默认会在读取时自动完成这一换算）

In [18]:
# Read a single value: index 0 on the year axis, 0 on month, 1 on lat, 1 on lon.
sst_data[0, 0, 1, 1]

1.6091517

### 纬度数据

南纬是负，北纬是正，东经是正，西经是负（注意：本文件的经度用 0–355 表示，没有负值）

In [19]:
# Load the latitude coordinate values into memory.
lat_data = FILE.variables['lat'][:]

In [20]:
# Number of latitude grid points.
lat_data.shape

(24,)

In [22]:
# Show the latitude values themselves.
lat_data

masked_array(data=[-55., -50., -45., -40., -35., -30., -25., -20., -15.,
                   -10.,  -5.,   0.,   5.,  10.,  15.,  20.,  25.,  30.,
                    35.,  40.,  45.,  50.,  55.,  60.],
             mask=False,
       fill_value=1e+20)

### 经度数据

In [24]:
# Load the longitude coordinate values into memory.
lon = FILE.variables['lon'][:]
# Only the last expression of a cell is echoed, so a bare `lon.shape` in the
# middle of the cell was silently discarded; print it to actually show the shape.
print(lon.shape)
# Echo the longitude values as the cell's result.
lon

masked_array(data=[  0.,   5.,  10.,  15.,  20.,  25.,  30.,  35.,  40.,
                    45.,  50.,  55.,  60.,  65.,  70.,  75.,  80.,  85.,
                    90.,  95., 100., 105., 110., 115., 120., 125., 130.,
                   135., 140., 145., 150., 155., 160., 165., 170., 175.,
                   180., 185., 190., 195., 200., 205., 210., 215., 220.,
                   225., 230., 235., 240., 245., 250., 255., 260., 265.,
                   270., 275., 280., 285., 290., 295., 300., 305., 310.,
                   315., 320., 325., 330., 335., 340., 345., 350., 355.],
             mask=False,
       fill_value=1e+20)

### 年份

注意：15 个为一组模式

In [26]:
# Load the values of the year coordinate into memory.
year_data = FILE.variables['year'][:]

In [27]:
# Only the last expression of a cell is echoed, so a bare `year_data.shape`
# on the first line was silently discarded; print it to actually show the shape.
print(year_data.shape)
# Echo the year index values as the cell's result.
year_data

masked_array(data=[  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,
                    12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,
                    23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,
                    34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,
                    45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
                    56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
                    67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
                    78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,
                    89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99,
                   100],
             mask=False,
       fill_value=999999)

# 标签数据

In [31]:
# Location of the SODA label file, relative to the notebook's working directory.
LABELPATH = 'data/SODA_label.nc'
# Open the label dataset; the handle stays open because later cells read from it.
LABEL_FILE = nc.Dataset(LABELPATH)
# The handle was originally bound to the misspelled name `LAEBL_FILE`; keep that
# name as an alias so cells that still reference it continue to work.
LAEBL_FILE = LABEL_FILE

In [32]:
# Echo the label dataset handle to see its dimensions and variables.
LAEBL_FILE

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    dimensions(sizes): year(100), month(36)
    variables(dimensions): float64 nino(year, month), int32 year(year), int32 month(month)
    groups: 

In [34]:
# List the names of the variables stored in the label dataset.
LAEBL_FILE.variables.keys()

dict_keys(['nino', 'year', 'month'])

In [35]:
# Load every value of the nino variable into memory.
nino_data = LAEBL_FILE.variables['nino'][:]

In [37]:
# Confirm the label layout: (year, month).
nino_data.shape

(100, 36)

### 第一个样本（year 维度索引 0）对应的 36 个月 nino 异常指数

In [44]:
# Show the 36 monthly nino values for index 0 on the year axis.
nino_data[0]

masked_array(data=[-0.40720701, -0.20244436, -0.10386104, -0.02910841,
                   -0.13252996, -0.25527564, -0.3326247 , -0.26293179,
                   -0.08058987,  0.10612228,  0.21165498,  0.16504486,
                    0.06697161,  0.00999126, -0.00164753, -0.06598581,
                   -0.14728385, -0.1915542 , -0.17029583, -0.19376083,
                   -0.27808595, -0.39393583, -0.49975768, -0.58400702,
                   -0.73201811, -0.72508991, -0.58416802, -0.31754819,
                   -0.12046373, -0.03802495,  0.013173  ,  0.06823147,
                    0.0597352 ,  0.02755602,  0.011619  , -0.04931002],
             mask=False,
       fill_value=1e+20)

# 待做

- [ ] 填充数据的处理，或者说，看看有多少填充数据，如何处理这些数据