# Examples - Introduction to R

In [None]:
# this is a comment

## Packages

In [None]:
# install a package called 'caret'
install.packages('caret')

In [None]:
# load the package 'caret'
library('caret')

## Getting help

In [None]:
# general help
help.start()

In [None]:
# list the installed packages
library()

In [None]:
# see the documentation of package 'caret'
library(help='caret')

In [None]:
# search the documentation for 'correlation'
help.search('correlation')
??correlation

In [None]:
# see the documentation for function 'mean'
# - usage (compulsory arguments have no default value)
# - description of arguments
# - description of output
# - further details
help('mean')
?mean

In [None]:
# see the examples of function 'mean'
example(mean)

## Basics

In [None]:
# null value
NULL

In [None]:
# assign a value to a variable
x = 7
# access the variable
x

In [None]:
# list all existing variables
ls()

In [None]:
# remove all variables
rm(list=ls())

#### Special values - NA, Inf, NaN

In [None]:
# not-available; missing value
NA

In [None]:
# infinity
Inf
-Inf
1/0

In [None]:
# not-a-number; indeterminate value
NaN
0/0
0*Inf

#### Functions

In [None]:
# a function taking one argument: returns the most frequent
# value(s) of 'obs' (all of them if several values are tied)
samplemode = function(obs)
# the function body is a { } block
{
  # how many times each distinct value occurs
  freq = table(obs)
  # labels (always strings) of the values with the highest count
  top = names(freq[freq==max(freq)])
  # the if-condition goes inside parentheses;
  # 'return()' leaves the function immediately with the given value
  if (is.numeric(obs))
  {
    # numeric input: convert the labels back to numbers
    return(as.numeric(top))
  }
  # any other input: the value of the last statement
  # is used as the return value
  as.factor(top)
}

#### Printing

In [None]:
# print function
print(1:5)

In [None]:
# merge elements into strings in a pair-wise manner
# (returns a character vector with one string per pair)
paste(1:5, 1:5, sep='.')

# print all the elements, separated by '.', as one line of output
# (cat() only prints and returns NULL; to obtain the merged string
# as a value, use paste() with the 'collapse' argument)
cat(1:5, 1:5, sep='.')

## Vectors

In [None]:
# numeric vector of length one
# (a plain number is stored as a double; write 7L for an integer)
7

# string (character) vector of length one
'somestring'

# Boolean (logical) vector of length two
c(TRUE, FALSE)

In [None]:
# concatenate two numeric vectors of length one
x = c(7,11)
x

In [None]:
# the elements are concatenated in the given order
y = c(x,1,2,3,x)
y

#### Basic information

In [None]:
# the class of the object
class(y)

In [None]:
# the structure of the object
str(y)

In [None]:
# the number of elements
length(y)

#### Sequence generation

In [None]:
# a sequence of five integers
11:15

In [None]:
# a sequence from 1 to 10
seq(1, 10)

In [None]:
# as previous but in steps of 2
seq(1, 10, by=2)

In [None]:
# a sequence from 1 to 10 with exactly five equally spaced elements
# (the step size is computed from the requested length)
seq(1, 10, length.out=5)

In [None]:
# a sequence from 0 to 1 in steps of 0.1
seq(0, 1, by=0.1)

In [None]:
# repeat a vector three times
rep(1:2, times=3)

In [None]:
# repeat each element of a vector three times
rep(1:2, each=3)

In [None]:
# as previous but also the result repeated three times
rep(1:2, each=3, times=3)

## Vector operations

In [None]:
# elements are added in a pair-wise manner
# (1 2) + (10 11) = (1+10 2+11)
1:2 + 10:11

In [None]:
# the shorter vector is repeated
# (1 2) becomes (1 2 1 2) to match in length with (1 2 3 4)
1:2 + 1:4

In [None]:
# this produces a warning since 3 is not a multiple of 2
1:2 + 1:3

In [None]:
# each element of 1:5 is squared
# (the exponent is repeated and hence all elements in 1:5 are paired with 2)
(1:5)^2

In [None]:
# the elements in the two vectors are paired which
# results in (1^1 2^2 3^3 4^4 5^5)
(1:5)^(1:5)

In [None]:
# the square root function maps values to their square roots
sqrt(1:5)

In [None]:
# the sum and product functions reduce the vector into a single value
sum(1:5)
prod(1:5)

In [None]:
# the summary function produces multiple numbers
summary(1:5)

In [None]:
# the t-test function produces a complex data structure
t.test(1:5)

In [None]:
# some observations
x = c(1,5,4,7,2)
x

# the deviations of the observations from their mean
# - sum(x) is a vector of length 1
# - length(x) is a vector of length 1
# - the mean is subtracted from each element of x
x - sum(x) / length(x)

## Boolean vectors

In [None]:
# an integer vector
x = 11:15
x

In [None]:
# a vector indicating which elements are greater than 13
x > 13

In [None]:
# (%% is the modulo operator)
x %% 2
# a vector indicating which elements are even
x %% 2 == 0

In [None]:
# a vector indicating which elements are greater than 12 and less than 15
x > 12 & x < 15

In [None]:
# a vector indicating which elements are less than 12 or greater than 14
x < 12 | x > 14

In [None]:
# a vector indicating which elements are not divisible by three
# ('!' binds less tightly than '%%' and '==', so this is
# evaluated as !((x %% 3) == 0))
! x %% 3 == 0

## Factors

In [None]:
# a vector of integers
a = 1:5
a
str(a)

In [None]:
# a factor with five possible values
af = factor(a)
af
str(af)

In [None]:
# a vector of strings
b = c('lo','lo','hi','hi','hi')
b

In [None]:
# a factor with two possible values
bf = factor(b)
bf
str(bf)

In [None]:
# the allowed values can be queried
levels(bf)

In [None]:
# the allowed values can be renamed
# (remember to preserve the order of levels)
levels(bf) = c('high_value', 'low_value')
bf

## Matrices

In [None]:
# two integer vectors
v1 = 1:3
v2 = 11:13
v1
v2

In [None]:
# bind the two vectors into a matrix by rows
rbind(v1,v2)

In [None]:
# bind the two vectors into a matrix by columns
cbind(v1,v2)

#### Basic information

In [None]:
x = cbind(v1,v2)

In [None]:
# the class of the object
class(x)

In [None]:
# the structure of the object
str(x)

In [None]:
# the number of elements
length(x)

In [None]:
# the sizes of the dimensions
dim(x)

In [None]:
# the number of rows
nrow(x)

In [None]:
# the number of columns
ncol(x)

#### Operations

In [None]:
# the sum of all elements
# (dimensions are ignored)
sum(x)

In [None]:
# the sums over the first dimension (rows)
apply(x, 1, sum)

In [None]:
# the sums over the second dimension (columns)
# (note: the result is assigned back to 'x', so from here on
# 'x' is the vector of column sums, not the matrix)
x = apply(x, 2, sum)
x

In [None]:
# note how the column names are carried over to the vector indices
# (named indices will be discussed later)
str(x)

## Lists

In [None]:
# a list containing a vector, a matrix, and a factor
x = list(1:5, matrix(c(3,6,7,1), nrow=2), factor(c('a','b','c')))
x

#### Basic information

In [None]:
# the class of the object
class(x)

In [None]:
# the structure of the object
str(x)

In [None]:
# the number of elements
length(x)

#### Operations

In [None]:
# apply the summary function to each element
sapply(x, summary)

## Data frames

In [None]:
# three vectors of same length
v1 = 1:3
v2 = 11:13
v3 = factor(c('lo','lo','hi'))
v1
v2
v3

In [None]:
# a data frame formed by the three variables
x = data.frame(v1,v2,v3)
x

#### Basic information

In [None]:
# the class of the object
class(x)

In [None]:
# the structure of the object
str(x)

In [None]:
# the number of elements (as list)
length(x)

In [None]:
# the sizes of the dimensions (as matrix)
dim(x)

In [None]:
# the number of rows (as matrix)
nrow(x)

In [None]:
# the number of columns (as matrix)
ncol(x)

## Subsets

#### Vectors

In [None]:
# an integer vector
x = 11:15
x

In [None]:
# the third element of the vector
x[3]

In [None]:
x

# a numeric index vector
idx = c(1,3,5)
idx

# the first, third, and fifth elements of the vector
x[idx]

In [None]:
# all elements except the second
x[-2]

# all elements except the first, third, and fifth
x[-idx]

In [None]:
x
x>13
# a vector of the elements which are greater than 13
x[x>13]

In [None]:
x

# a vector of the even elements
x[x%%2==0]

# a vector of the elements that are greater than 12 and less than 15
x[x>12 & x<15]

# a vector of the elements that are less than 12 or greater than 14
x[x<12 | x>14]

# a vector of the elements that are not divisible by three
x[!x%%3==0]

#### Matrices

In [None]:
v1 = 1:3
v2 = 11:13
x = cbind(v1,v2)
x

In [None]:
# the first element of the first row
x[1,1]

In [None]:
# the whole second row
x[2,]

In [None]:
# the whole first column
x[,1]

In [None]:
# all rows except the second
x[-2,]

In [None]:
# (row) indices
idx = c(2,3)

# the second and third rows
x[idx,]

In [None]:
x

# Boolean vector indicating which elements of the first column
# are greater than two (i.e. which rows have a value greater
# than two in the first column)
x[,1]>2

# the rows in which the first element is greater than two
# (only one row matches here, so the result is simplified
# from a matrix to a vector)
x[x[,1]>2,]

#### Lists

In [None]:
x = list(1:5, matrix(c(3,6,7,1), nrow=2), factor(c('a','b','c')))
x

In [None]:
# a list containing the second element
x[2]

In [None]:
# an index vector
idx = c(1,3)

# a list containing the first and third elements
x[idx]

In [None]:
# the first element of the list
# (which may or may not be a list itself)
x[[1]]

#### Data frames

In [None]:
v1 = 1:3
v2 = 11:13
v3 = factor(c('lo','lo','hi'))
x = data.frame(v1,v2,v3)
x

In [None]:
# a row is returned as a data frame because
# there are multiple variables
x[1,]

In [None]:
# a column is returned as a vector
x[,1]

# as above but with the list syntax
x[[1]]

#### Operations

In [None]:
y = c(20,26,22,31,16)
y

In [None]:
# the indices of the elements in ascending order
idx = order(y)
idx

# the elements are selected in the given order
# (ascending order in this case)
y[idx]

In [None]:
# the ages of five persons
ages = c(20,26,22,31,16)
ages

# the weights of the same five persons
weights = c(80,76,54,92,60)
weights

In [None]:
# the weights of those who are younger than 25 years
weights[ages<25]

In [None]:
x = 11:15
x

# replace the even numbers with zero
x[x%%2==0] = 0
x

In [None]:
y = -3:3
y

# replace the negative values with NA
y[y<0] = NA
y

## Named indices

#### Vectors

In [None]:
# an integer vector
x = 1:3
x

In [None]:
# a vector containing the index names
# (NULL because no names assigned)
names(x)

In [None]:
# assign index names
names(x) = c('a','b','c')

# index names are shown above the values
x

# the name vector is a string vector
names(x)

In [None]:
# the element named 'c'
x['c']

In [None]:
# a vector of named indices
idx = c('a','c')

# the elements 'a' and 'c'
x[idx]

#### Matrices

In [None]:
v1 = 1:3
v2 = 11:13
x = cbind(v1,v2)
x

In [None]:
# the index name vectors
# (as a list, one element for each dimension)
# column names are automatically derived from the original variable names
dimnames(x)

In [None]:
# assign index names for both dimensions
dimnames(x) = list(c('r1','r2','r3'), c('c1','c2'))

# index names are shown in the margins
x

In [None]:
# the column named 'c1'
# (note how the index names are carried through slicing)
x[,'c1']

#### Lists

In [None]:
x = list(1:5, matrix(c(3,6,7,1), nrow=2), factor(c('a','b','c')))
x

In [None]:
# assign names to the list indices
names(x) = c('first', 'second', 'third')

# the index names are prefixed by $
x

In [None]:
# a vector of named indices
idx = c('first', 'third')

# the slice specified with index names
x[idx]

In [None]:
# the first element of the list
x[['first']]

# as above but with the $ notation
x$first

#### Data frames

In [None]:
v1 = 1:3
v2 = 11:13
v3 = factor(c('lo','lo','hi'))
x = data.frame(v1,v2,v3)
x

In [None]:
# data frames are automatically given index names
# (column names are automatically derived from the original variable names)
# (row names are simply the row numbers)
dimnames(x)

In [None]:
# the column named 'v1'
x$v1

## subset() function

In [None]:
v1 = 1:3
v2 = 11:13
v3 = factor(c('lo','lo','hi'))
x = data.frame(v1,v2,v3)
x

# these variables are not needed anymore
rm(v1, v2, v3)

In [None]:
# select the rows in which 'v1' is greater than one
# ('v1' refers to the column name in the data frame, not to the
# variable that used to contain the data)
subset(x, v1>1)

In [None]:
# select the rows in which 'v1' is greater
# than one and 'v2' less than 13
subset(x, v1>1 & v2<13)

In [None]:
# select the columns 'v1' and 'v2' from the rows
# in which 'v3' is 'lo'
subset(x, v3=='lo', c('v1','v2'))

## Data I/O

In [None]:
v1 = 1:3
v2 = 11:13
v3 = factor(c('lo','lo','hi'))
x = data.frame(v1,v2,v3)
x

In [None]:
# write the data frame into a file
# (default format: space-separated fields, double-quotes, row & column headers)
write.table(x, 'data.csv')

# [data.csv]
# "v1" "v2" "v3"
# "1" 1 11 "lo"
# "2" 2 12 "lo"
# "3" 3 13 "hi"

In [None]:
# read the data frame from a file
# (the header line is detected automatically because the first line
# has one field fewer than the others; the first field of the
# remaining lines is then used as the row names)
x = read.table('data.csv')
x

In [None]:
# [data2.csv]
# 1, 11, 'low'
# 2, 12, 'low'
# 3, 13, 'high'

# the table is badly loaded because the specified (default)
# file format does not match the actual file format
# (incorrect field separator --> commas included in values --> wrong data types)
read.table('data2.csv')

In [None]:
# the table is correctly loaded by specifying the file format
# (comma-separated fields, single-quotes, no headers)
read.table('data2.csv', header=FALSE, sep=',', quote="'")