# Planning your experiment – statistical power, sample size, replicates…

# Collecting the data – wrangling…

### Tidyverse basics

In [None]:
# Install the tidyverse, but only when it is not already available, so that
# re-running this cell does not download and reinstall the packages each time
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}

In [None]:
# Attach the tidyverse so that the pipe and the wrangling verbs used below are available
library(tidyverse)

# Read the CSV file into the `passengers` data frame
passengers <- read.csv(file = "data/train.csv")

In [None]:
# Check out the first several observations of your dataframe
# (head() shows only the first six rows instead of the whole table)
head(passengers)

### Pipes


To make R code more human-readable, the Tidyverse tools use the pipe, %>%, which comes from the ‘magrittr’ package and is loaded automatically with the Tidyverse. The pipe passes the output of one command as the input to the next, so you can write a sequence of steps instead of nesting function calls.


In [None]:
# Summarise every column of your dataframe USING A PIPE:
# the dataframe on the left of %>% becomes the input of summary()
passengers %>% summary()

HINT: you can chain several pipes one after another!

In [None]:
# Summarise your dataframe after dropping the observations that have missing values
# (drop_na() with no arguments removes every row that contains at least one NA)
passengers %>%
  drop_na() %>%
  summary()

### Wrangle your Data - using filter(), arrange() and mutate()


In [None]:
# Keep only a particular set of observations: here, those whose "Sex" is 'female'
passengers %>% filter(Sex == "female")

In [None]:
# Sort the observations from the lowest to the highest 'Fare' to see if you can
# notice any trends (arrange() sorts in increasing order by default)
passengers %>% arrange(Fare)

# Wrap the column in desc() to sort from the highest to the lowest 'Fare'
passengers %>% arrange(desc(Fare))

In [None]:
# Feature engineering: derive a new variable from two existing ones
#   'Parch' - the number of parents and children
#   'SibSp' - the number of siblings and spouses
# Their sum becomes a new variable 'FamSize', which mutate() adds as an extra
# column at the end of the dataset. The result is only displayed here:
# `passengers` itself is not modified because nothing is assigned.
passengers %>% mutate(FamSize = Parch + SibSp)

### Summarizing and Grouping your Data

In [None]:
# Collapse the whole dataframe into a single row holding the mean fare paid
passengers %>% summarise(meanFare = mean(Fare))

In [None]:
# Collapse the whole dataframe into a single row holding the median fare paid
passengers %>% summarise(medianFare = median(Fare))

In [None]:
# Mean fare paid among men: filter() keeps the male passengers,
# then summarise() averages their fares
passengers %>%
  filter(Sex == "male") %>%
  summarise(
    meanFare = mean(Fare)
  )

In [None]:
# Mean fare paid among women and how many women survived:
# filter() keeps the female passengers, then summarise() computes both figures
passengers %>%
  filter(Sex == "female") %>%
  summarise(
    meanFare = mean(Fare),
    numSurv = sum(Survived)
  )

In [None]:
# Mean fare and number of survivors as a function of sex:
# group_by() splits the passengers by 'Sex', then summarise() returns one row per group
passengers %>%
  group_by(Sex) %>%
  summarise(
    meanFare = mean(Fare),
    numSurv = sum(Survived)
  )

# Processing the data – transformation, normalization…

In [None]:
# log-transformation
# Example values spanning several orders of magnitude
# (985 is written without a leading zero, which R ignored anyway)
data <- c(1200, 34567, 3456, 12, 3456, 985, 1211)
summary(data)

# Natural logarithm of every value, stored as a one-column data frame
log_scale <- log(as.data.frame(data))

In [None]:
# normalisation with standard scaling in R
# Same example values as for the log-transformation
# (985 is written without a leading zero, which R ignored anyway)
data <- c(1200, 34567, 3456, 12, 3456, 985, 1211)
summary(data)

# scale() centres the values on their mean and divides them by their standard
# deviation; the result is stored as a one-column data frame
scale_data <- as.data.frame(scale(data))

# Exploring the data – distribution, outliers…

In [None]:
# Small example dataset: ten exam scores
scores <- c(
  70, 85, 80, 90, 60,
  55, 100, 120, 75, 85
)

In [None]:
# Upper outlier threshold: the third quartile (Q3) plus 1.5 times the
# interquartile range (IQR)
iqr <- IQR(scores)
q3 <- quantile(scores, probs = 0.75)
upper_bound <- q3 + 1.5 * iqr

In [None]:
# Scores lying above the upper bound are flagged as outliers
outliers_upper <- scores[upper_bound < scores]
outliers_upper

In [None]:
# Lower outlier threshold: the first quartile (Q1) minus 1.5 times the IQR
# (reuses the `iqr` value computed for the upper bound)
q1 <- quantile(scores, probs = 0.25)
lower_bound <- q1 - 1.5 * iqr

In [None]:
# Scores lying below the lower bound are flagged as outliers
outliers_lower <- scores[lower_bound > scores]
outliers_lower

In [None]:
# Detect outliers with the IQR method, which is based on the difference
# between the first and third quartiles.

# A score is an outlier when it is NOT inside the range
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].

outliers_iqr <- scores[!(scores >= q1 - 1.5 * iqr & scores <= q3 + 1.5 * iqr)]
outliers_iqr

# Analysing the data - statistical testing, correlation...

###  Compute correlation in R

In [None]:
# cor(x, y, method = c("pearson", "kendall", "spearman"))
# cor.test(x, y, method=c("pearson", "kendall", "spearman"))

**cor()** computes the correlation coefficient

**cor.test()** tests for association/correlation between paired samples. It returns both the correlation coefficient and the significance level (or p-value) of the correlation.

In [None]:
# Copy the built-in mtcars dataset and display its first six rows
my_data <- mtcars
head(my_data, n = 6)

In [None]:
# compute correlations between the wt and mpg columns with each method
# The rank-based methods (Spearman, Kendall) cannot compute an exact p-value
# when the data contain ties and would emit a warning before falling back to
# the asymptotic approximation; exact = FALSE requests that approximation
# explicitly, giving the same results without the warning.
cor_pearson <- cor.test(my_data$wt, my_data$mpg, method = "pearson")
cor_spearman <- cor.test(my_data$wt, my_data$mpg, method = "spearman", exact = FALSE)
cor_kendall <- cor.test(my_data$wt, my_data$mpg, method = "kendall", exact = FALSE)

### statistical testing - biological example

In [None]:
library("rWSBIM1322")

In [None]:
# Load the tdata1 dataset and display its first rows
data("tdata1")
head(tdata1)

# The tdata1 dataset from the rWSBIM1322 package provides gene expression data for 100 genes and 6 samples: three in group A and three in group B.

In [None]:
# log-transform the data
# (log_tdata1 was previously used without ever being created; base-2 logs are
# used here, and the t statistic does not depend on the base of the logarithm)
log_tdata1 <- log2(tdata1)

# extract the log-transformed expression intensities of feature (row) 73
x <- log_tdata1[73, ]

# apply a t-test to feature (row) 73, comparing the expression intensities in groups A and B
# (assumes the first three columns are group A and the last three group B)
t.test(x[1:3], x[4:6])