In [2]:
# Attach dplyr, then read the serialized data frame that the later cells
# refer to as `chicago`.
library(dplyr)
chicago <- readRDS(file = "chicago.rds")


Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




In [3]:
# Quick overview: dimensions first, then column types with sample values
chicago %>% dim()
chicago %>% str()

'data.frame':	6940 obs. of  8 variables:
 $ city      : chr  "chic" "chic" "chic" "chic" ...
 $ tmpd      : num  31.5 33 33 29 32 40 34.5 29 26.5 32.5 ...
 $ dptp      : num  31.5 29.9 27.4 28.6 28.9 ...
 $ date      : Date, format: "1987-01-01" "1987-01-02" ...
 $ pm25tmean2: num  NA NA NA NA NA NA NA NA NA NA ...
 $ pm10tmean2: num  34 NA 34.2 47 NA ...
 $ o3tmean2  : num  4.25 3.3 3.33 4.38 4.75 ...
 $ no2tmean2 : num  20 23.2 23.8 30.4 30.3 ...


In [4]:
# select(): keep a contiguous run of columns, from `city` through `dptp`
subset <- chicago %>% select(city:dptp)
subset %>% head()

Unnamed: 0_level_0,city,tmpd,dptp
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
1,chic,31.5,31.5
2,chic,33.0,29.875
3,chic,33.0,27.375
4,chic,29.0,28.625
5,chic,32.0,28.875
6,chic,40.0,35.125


In [5]:
# select() helpers: choose columns by name suffix, then by name prefix.
# `subset` is reassigned between the two examples.
subset <- chicago %>% select(ends_with("2"))
subset %>% str()
subset <- chicago %>% select(starts_with("d"))
subset %>% str()


'data.frame':	6940 obs. of  4 variables:
 $ pm25tmean2: num  NA NA NA NA NA NA NA NA NA NA ...
 $ pm10tmean2: num  34 NA 34.2 47 NA ...
 $ o3tmean2  : num  4.25 3.3 3.33 4.38 4.75 ...
 $ no2tmean2 : num  20 23.2 23.8 30.4 30.3 ...
'data.frame':	6940 obs. of  2 variables:
 $ dptp: num  31.5 29.9 27.4 28.6 28.9 ...
 $ date: Date, format: "1987-01-01" "1987-01-02" ...


In [7]:
# filter(): keep only the rows whose pm25tmean2 is above 30
# (rows where the comparison evaluates to NA are dropped as well)
chicago.f <- chicago %>% filter(pm25tmean2 > 30)
chicago.f %>% str()

'data.frame':	194 obs. of  8 variables:
 $ city      : chr  "chic" "chic" "chic" "chic" ...
 $ tmpd      : num  23 28 55 59 57 57 75 61 73 78 ...
 $ dptp      : num  21.9 25.8 51.3 53.7 52 56 65.8 59 60.3 67.1 ...
 $ date      : Date, format: "1998-01-17" "1998-01-23" ...
 $ pm25tmean2: num  38.1 34 39.4 35.4 33.3 ...
 $ pm10tmean2: num  32.5 38.7 34 28.5 35 ...
 $ o3tmean2  : num  3.18 1.75 10.79 14.3 20.66 ...
 $ no2tmean2 : num  25.3 29.4 25.3 31.4 26.8 ...


In [9]:
# Summary statistics for one column of the filtered data
summary(chicago.f[["pm25tmean2"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  30.05   32.12   35.04   36.63   39.53   61.50 

In [11]:
# filter() with two conditions: comma-separated predicates are AND-ed together
chicago.f <- chicago %>% filter(pm25tmean2 > 30, tmpd > 80)

# select() also accepts an explicit list of the columns to keep
chicago.f %>% select(date, tmpd, pm25tmean2)

date,tmpd,pm25tmean2
<date>,<dbl>,<dbl>
1998-08-23,81,39.6
1998-09-06,81,31.5
2001-07-20,82,32.3
2001-08-01,84,43.7
2001-08-08,85,38.8375
2001-08-09,84,38.2
2002-06-20,82,33.0
2002-06-23,82,42.5
2002-07-08,81,33.1
2002-07-18,82,38.85


In [16]:
# arrange() is dplyr's SORT verb: order the rows by ascending date,
# then peek at both ends of the result
chicago <- chicago %>% arrange(date)
chicago %>% head(n = 3)
chicago %>% tail(n = 3)

Unnamed: 0_level_0,city,tmpd,dptp,date,pm25tmean2,pm10tmean2,o3tmean2,no2tmean2
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<date>,<dbl>,<dbl>,<dbl>,<dbl>
1,chic,31.5,31.5,1987-01-01,,34.0,4.25,19.9881
2,chic,33.0,29.875,1987-01-02,,,3.304348,23.19099
3,chic,33.0,27.375,1987-01-03,,34.16667,3.333333,23.81548


Unnamed: 0_level_0,city,tmpd,dptp,date,pm25tmean2,pm10tmean2,o3tmean2,no2tmean2
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<date>,<dbl>,<dbl>,<dbl>,<dbl>
6938,chic,35,29.4,2005-12-29,7.45,23.5,6.794837,19.97222
6939,chic,36,31.0,2005-12-30,15.05714,19.2,3.03442,22.80556
6940,chic,35,30.1,2005-12-31,15.0,23.5,2.53125,13.25


In [17]:
# Wrap the sort column in desc() to order rows from latest date to earliest
chicago <- chicago %>% arrange(desc(date))
chicago %>% head(n = 3)
chicago %>% tail(n = 3)

Unnamed: 0_level_0,city,tmpd,dptp,date,pm25tmean2,pm10tmean2,o3tmean2,no2tmean2
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<date>,<dbl>,<dbl>,<dbl>,<dbl>
1,chic,35,30.1,2005-12-31,15.0,23.5,2.53125,13.25
2,chic,36,31.0,2005-12-30,15.05714,19.2,3.03442,22.80556
3,chic,35,29.4,2005-12-29,7.45,23.5,6.794837,19.97222


Unnamed: 0_level_0,city,tmpd,dptp,date,pm25tmean2,pm10tmean2,o3tmean2,no2tmean2
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<date>,<dbl>,<dbl>,<dbl>,<dbl>
6938,chic,33.0,27.375,1987-01-03,,34.16667,3.333333,23.81548
6939,chic,33.0,29.875,1987-01-02,,,3.304348,23.19099
6940,chic,31.5,31.5,1987-01-01,,34.0,4.25,19.9881


In [18]:
# rename(): takes new_name = old_name pairs; all other columns keep their names
chicago <- chicago %>% rename(dewpoint = dptp, pm25 = pm25tmean2)
chicago %>% head()

Unnamed: 0_level_0,city,tmpd,dewpoint,date,pm25,pm10tmean2,o3tmean2,no2tmean2
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<date>,<dbl>,<dbl>,<dbl>,<dbl>
1,chic,35,30.1,2005-12-31,15.0,23.5,2.53125,13.25
2,chic,36,31.0,2005-12-30,15.05714,19.2,3.03442,22.80556
3,chic,35,29.4,2005-12-29,7.45,23.5,6.794837,19.97222
4,chic,37,34.5,2005-12-28,17.75,27.5,3.260417,19.28563
5,chic,40,33.6,2005-12-27,23.56,27.0,4.46875,23.5
6,chic,35,29.6,2005-12-26,8.4,8.5,14.041667,16.81944


In [None]:
# dplyr mutate() functionality examples
# for ADDING new columns, usually based on a transformation of the existing columns

chicago <- 