# Reading the data

In [14]:
import datatable as dt
import numpy as np

In [18]:
dat = dt.fread("met.gz")
dat.head(5)

Unnamed: 0_level_0,USAFID,WBAN,year,month,day,hour,min,lat,lon,elev,…,dew.point,dew.point.qc,atm.press,atm.press.qc,rh
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,Unnamed: 11_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪
0,690150,93121,2019,8,1,0,56,34.3,−116.166,696,…,10.6,5,1009.9,5,19.8813
1,690150,93121,2019,8,1,1,56,34.3,−116.166,696,…,10.6,5,1010.3,5,21.761
2,690150,93121,2019,8,1,2,56,34.3,−116.166,696,…,7.2,5,1010.6,5,18.4821
3,690150,93121,2019,8,1,3,56,34.3,−116.166,696,…,5.0,5,1011.6,5,16.8886
4,690150,93121,2019,8,1,4,56,34.3,−116.166,696,…,5.0,5,1012.7,5,17.3841


# Selecting columns

In [12]:
dat[:, ["USAFID", "lat", "lon"]].head(5)

Unnamed: 0_level_0,USAFID,lat,lon
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,690150,34.3,−116.166
1,690150,34.3,−116.166
2,690150,34.3,−116.166
3,690150,34.3,−116.166
4,690150,34.3,−116.166


In [13]:
dat = dat[:,[
    "USAFID", "WBAN", "year", "month", "day", "hour", "min", "lat", "lon", "elev",
    "wind.sp", "temp", "atm.press"
]]
dat.head(4)

Unnamed: 0_level_0,USAFID,WBAN,year,month,day,hour,min,lat,lon,elev,wind.sp,temp,atm.press
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,690150,93121,2019,8,1,0,56,34.3,−116.166,696,5.7,37.2,1009.9
1,690150,93121,2019,8,1,1,56,34.3,−116.166,696,8.2,35.6,1010.3
2,690150,93121,2019,8,1,2,56,34.3,−116.166,696,6.7,34.4,1010.6
3,690150,93121,2019,8,1,3,56,34.3,−116.166,696,5.1,33.3,1011.6


# Logical operators

In [1]:
# Loading the libraries
import numpy as np
import pandas as pa

# Defining the data
x = [True, True, False, False]
y = [False, True, True, False]
ans = {
    'x'   : x,
    'y'   : y,
    'and' : np.logical_and(x, y),
    'or'  : np.logical_or(x, y),
    'xor' : np.logical_xor(x, y)
    }
pa.DataFrame(ans)

Unnamed: 0,x,y,and,or,xor
0,True,False,False,True,True
1,True,True,True,True,False
2,False,True,False,True,True
3,False,False,False,False,False


This is very different from how R defines a function. This function, which is an alternative implementation of the `xor` operator

In [2]:
def myxor(x,y):
    x1 = np.logical_and(x, np.logical_not(y))
    x2 = np.logical_and(np.logical_not(x), y)
    return np.logical_or(x1, x2)

Can be defined in R as follows:

```r
myxor <- function(x,y) {
    x1 <- x & !y
    x2 <- !x & y
    return(x1 | x2)
}
```

Executing this function in python returns the following:

In [3]:
ans['myxor'] = myxor(x,y)
pa.DataFrame(ans)

Unnamed: 0,x,y,and,or,xor,myxor
0,True,False,False,True,True,True
1,True,True,True,True,False,False
2,False,True,False,True,True,True
3,False,False,False,False,False,False


# Filtering the data

In [5]:
dat[(dt.f.day == 1) & (dt.f.lat > 40) & ((dt.f.elev < 500) | (dt.f.elev > 1000)), :].nrows

27623

# Variable creation

In [20]:
dat[:, dt.update(evel2 = np.power(dt.f.elev, 2), windsp_scaled = dt.f["wind.sp"] / dt.sd(dt.f["wind.sp"]))]
dat.head(10)

Unnamed: 0_level_0,USAFID,WBAN,year,month,day,hour,min,lat,lon,elev,…,atm.press,atm.press.qc,rh,evel2,windsp_scaled
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,Unnamed: 11_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪
0,690150,93121,2019,8,1,0,56,34.3,−116.166,696,…,1009.9,5,19.8813,484416,2.65438
1,690150,93121,2019,8,1,1,56,34.3,−116.166,696,…,1010.3,5,21.761,484416,3.81858
2,690150,93121,2019,8,1,2,56,34.3,−116.166,696,…,1010.6,5,18.4821,484416,3.12006
3,690150,93121,2019,8,1,3,56,34.3,−116.166,696,…,1011.6,5,16.8886,484416,2.37497
4,690150,93121,2019,8,1,4,56,34.3,−116.166,696,…,1012.7,5,17.3841,484416,0.977929
5,690150,93121,2019,8,1,5,56,34.3,−116.166,696,…,1012.7,5,20.0154,484416,0.0
6,690150,93121,2019,8,1,6,56,34.3,−116.166,696,…,1012.8,5,22.8991,484416,0.698521
7,690150,93121,2019,8,1,7,56,34.3,−116.166,696,…,1012.8,5,24.5858,484416,0.977929
8,690150,93121,2019,8,1,8,56,34.3,−116.166,696,…,1012.5,5,29.3298,484416,1.21077
9,690150,93121,2019,8,1,9,56,34.3,−116.166,696,…,1012.7,5,30.2181,484416,0.698521


# Merging data

In [21]:
stations = dt.fread("ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-history.csv")

In [22]:
stations.head(5)

Unnamed: 0_level_0,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),BEGIN,END
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪
0,7018,99999,WXPOD 7018,,,,0.0,0.0,7018.0,20110309,20130730
1,7026,99999,WXPOD 7026,AF,,,0.0,0.0,7026.0,20120713,20170822
2,7070,99999,WXPOD 7070,AF,,,0.0,0.0,7070.0,20140923,20150926
3,8260,99999,WXPOD8270,,,,0.0,0.0,0.0,20050101,20100920
4,8268,99999,WXPOD8278,AF,,,32.95,65.567,1156.7,20100519,20120323


In [28]:
stations[:, "USAF"] = np.array(stations[:,"USAF"]).astype(np.int32)

ValueError: invalid literal for int() with base 10: 'A00002'

ValueError: invalid literal for int() with base 10: 'A00002'

# Logical Operators with Data Table

# Column selection

In [7]:
dat[:,["USAFID", "lat", "lon"]].head(5)

Unnamed: 0_level_0,USAFID,lat,lon
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,690150,34.3,−116.166
1,690150,34.3,−116.166
2,690150,34.3,−116.166
3,690150,34.3,−116.166
4,690150,34.3,−116.166


# Variable creation

In [11]:
dat[:, dt.update(Date = dt.f.year*10000 + dt.f.month*100 + dt.f.day)]
dat[1:5, ["Date", "year", "month", "day"]] # First five rows of Date, year, month, day columns

Unnamed: 0_level_0,Date,year,month,day
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪
0,20190801,2019,8,1
1,20190801,2019,8,1
2,20190801,2019,8,1
3,20190801,2019,8,1


In [None]:
dat_daily = dat[:
  ,
  {
    "atm_press_avg" : dt.mean(dt.f["atm.press"]),
    "temp_avg"      : dt.mean(dt.f.temp)
  },
  dt.by("USAFID", "Date")
]

In [None]:
dat_daily