# BST281 Final Project Pipeline

Group 2  
Dongyuan Song, Siquan Wang, Xutao Wang, Linying Zhang

## Set Up
Import packages; set working direcotries.

In [9]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['pdf.fonttype'] = 42
rcParams['font.sans-serif'] = 'Arial'
import warnings
warnings.filterwarnings("ignore")
import urllib3
urllib3.disable_warnings()
import rpy2
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from IPython.display import FileLinks

In [None]:
current_path = os.getcwd()
print(current_path)

Set working directory. Default is this package folder.

In [None]:
os.chdir(current_path)

Enable using R in Jupyter notebook.

In [24]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


## RNA-seq analysis

In [None]:
expr_df = pd.read_csv("expressionFile_counts_MM.csv")

In [None]:
expr_df = expr_df.set_index(expr_df.columns[0])
expr_df.head()

### Quality Control
Filter out none or low expressed genes.

In [None]:
print(expr_df.shape)

Here we only keep genes which counts are larger than 1 in each samples.

In [None]:
mask_low_vals = (expr_df > 0).sum(axis=1) == 6
expr_df = expr_df.loc[mask_low_vals, :]
print(expr_df.shape)

Save the result in working directory.

In [None]:
expr_df.to_csv('filtered.tsv',sep='\t')

### Normalization and Differential Expression Analysis

This step was finished in R. Use Bioconductor Package *edgeR*, *limma* and *DEseq2*.

In [25]:
%%R

setwd("C:/Users/songdongyuan/group02_final_project_packet")

#source("https://bioconductor.org/biocLite.R")

#R3.3.5 has issues when installing data.table
#install.packages("https://socialsciences.mcmaster.ca/jfox/.Pickup/data.table_1.10.4-3.zip",
                 #repos=NULL, type="win.binary")

#biocLite("edgeR")
#biocLite("limma")
#biocLite("Glimma")
#biocLite("org.Mm.eg.db")
#biocLite("RColorBrewer")
#biocLite("DESeq2")
#biocLite("DEFormats")



library(edgeR)
library(limma)
library(Glimma)
library(gplots)
library(org.Mm.eg.db)
library(RColorBrewer)
library(DESeq2)
library(DEFormats)

filtered <- read.csv("filtered.tsv", sep = "\t", row.names = 1, header= TRUE, stringsAsFactors = F)
filtered <- filtered[, c(4,2,6, 3,1,5)]

group <- c("Co", "Co", "Co", "Mono", "Mono", "Mono")
names(filtered) <- c("MM_HS5", "RPMI_HS5", "KMS11_HS5", "MM", "RPMI", "KMS11")

filtered_counts <- DGEList(filtered, group = group)
filtered_counts$samples$lib.size
barplot(filtered_counts$samples$lib.size,names=colnames(filtered_counts),las=2)
# Add a title to the plot
title("Barplot of library sizes")

# Get log2 counts per million
logcounts <- cpm(filtered_counts,log=TRUE)
# Check distributions of samples using boxplots
boxplot(logcounts, xlab="", ylab="Log2 counts per million",las=2)
# Let's add a blue horizontal line that corresponds to the median logCPM
abline(h=median(logcounts),col="blue")
title("Boxplots of logCPMs (unnormalised)")
# MA plot
par(mfrow = c(2,3))
maPlot(filtered_counts$counts[,1], filtered_counts$counts[,2], lowess = T)
title("MA plot (unnormalised) of #1 & #2")
maPlot(filtered_counts$counts[,1], filtered_counts$counts[,3], lowess = T)
title("MA plot (unnormalised) of #1 & #3")
maPlot(filtered_counts$counts[,2], filtered_counts$counts[,3], lowess = T)
title("MA plot (unnormalised) of #2 & #3")
maPlot(filtered_counts$counts[,4], filtered_counts$counts[,5], lowess = T)
title("MA plot (unnormalised) of #4 & #5")
maPlot(filtered_counts$counts[,4], filtered_counts$counts[,6], lowess = T)
title("MA plot (unnormalised) of #4 & #6")
maPlot(filtered_counts$counts[,5], filtered_counts$counts[,6], lowess = T)
title("MA plot (unnormalised) of #5 & #6")

# Apply normalisation to DGEList object
filtered_counts_n <- calcNormFactors(filtered_counts, method = "TMM")

par(mfrow = c(2,3))
maPlot(filtered_counts_n$counts[,1], filtered_counts_n$counts[,2], lowess = T)
title("MA plot (normalised with TMM) of #1 & #2")
maPlot(filtered_counts_n$counts[,1], filtered_counts_n$counts[,3], lowess = T)
title("MA plot (normalised with TMM) of #1 & #3")
maPlot(filtered_counts_n$counts[,2], filtered_counts_n$counts[,3], lowess = T)
title("MA plot (normalised with TMM) of #2 & #3")
maPlot(filtered_counts_n$counts[,4], filtered_counts_n$counts[,5], lowess = T)
title("MA plot (normalised with TMM) of #4 & #5")
maPlot(filtered_counts_n$counts[,4], filtered_counts_n$counts[,6], lowess = T)
title("MA plot (normalised with TMM) of #4 & #6")
maPlot(filtered_counts_n$counts[,5], filtered_counts_n$counts[,6], lowess = T)
title("MA plot (normalised with TMM) of #5 & #6")


par(mfrow = c(1, 2))
# Get log2 counts per million
logcounts <- cpm(filtered_counts,log=TRUE)
# Check distributions of samples using boxplots
boxplot(logcounts, xlab="", ylab="Log2 counts per million",las=2)
# Let's add a blue horizontal line that corresponds to the median logCPM
abline(h=median(logcounts),col="blue")
title("Boxplots of logCPMs (unnormalised)")
# Get log2 counts per million
logcounts <- cpm(filtered_counts_n,log=TRUE)
# Check distributions of samples using boxplots
boxplot(logcounts, xlab="", ylab="Log2 counts per million",las=2)
# Let's add a blue horizontal line that corresponds to the median logCPM
abline(h=median(logcounts),col="blue")
title("Boxplots of logCPMs (TMM normalised)")



# Using limma

# Create design matrix
design <- model.matrix(~ 0 + group)

# Fit limma
logCPM <- cpm(filtered_counts_n, log=TRUE, prior.count=1)
fit <- lmFit(logCPM, design)
fit <- eBayes(fit, trend=TRUE)

# Using DESeq2

filtered_counts_n$counts <- round(filtered_counts_n$counts)

# You can easily convert data format between edgeR and DESeq2
dds <- as.DESeqDataSet(filtered_counts_n)
res <- DESeq(dds)
res <- results(res)
summary(res)

resSig <- res[ which(res$padj < 0.1 ), ]

DEgene_list <- rownames(resSig)
write.table(DEgene_list, file = "DEgene_list.tsv", row.names = FALSE, sep = '\t', col.names = F)




PermissionError: [WinError 32] 另一个程序正在使用此文件，进程无法访问。: 'C:\\Users\\SONGDO~1\\AppData\\Local\\Temp\\tmpefxxz94o\\Rplots001.png'

In [28]:
!Rscript RNA_seq.R $WORKDIR

Error in library(gplots) : 不存在叫'gplots'这个名字的程辑包
停止执行


## Mint-ChIP analysis

### Quality Control

The input Mint-ChIP files are BAM file already. Use **fastqc** to do quality control.

In [8]:
%%bash
sbatch fastqc.sh

#!/bin/bash
#SBATCH -p general
#SBATCH -J fastqc
#SBATCH -n 4
#SBATCH -N 1
#SBATCH -t 0-10:00
#SBATCH --mem 8000
#SBATCH -o fastqc.out
#SBATCH -e fastqc.err
#SBATCH --mail-type=ALL
#SBATCH --mail-user=dsong@hsph.harvard.edu

cd /n/home08/songdongyuan/BST281/

source new-modules.sh
module load fastqc/0.11.5-fasrc01

fastqc -o ~/BST281/fastqc_output -t 16 
~/BST281/chip/Alignment_Post_Processing_15005.bam 
~/BST281/chip/Alignment_Post_Processing_15009.bam 
~/BST281/chip/Alignment_Post_Processing_15022.bam 
~/BST281/chip/Alignment_Post_Processing_15175.bam 
~/BST281/chip/Alignment_Post_Processing_15180.bam 
~/BST281/chip/Alignment_Post_Processing_15193.bam 
~/BST281/chip/Alignment_Post_Processing_15223.bam 
~/BST281/chip/Alignment_Post_Processing_15280.bam

-bash: line 13: cd: /n/home08/songdongyuan/BST281/: No such file or directory
-bash: line 15: new-modules.sh: No such file or directory
-bash: line 16: module: command not found
-bash: line 18: fastqc: command not found
-bash: line 19: /home/songdongyuan/BST281/chip/Alignment_Post_Processing_15005.bam: No such file or directory
-bash: line 20: /home/songdongyuan/BST281/chip/Alignment_Post_Processing_15009.bam: No such file or directory
-bash: line 21: /home/songdongyuan/BST281/chip/Alignment_Post_Processing_15022.bam: No such file or directory
-bash: line 22: /home/songdongyuan/BST281/chip/Alignment_Post_Processing_15175.bam: No such file or directory
-bash: line 23: /home/songdongyuan/BST281/chip/Alignment_Post_Processing_15180.bam: No such file or directory
-bash: line 24: /home/songdongyuan/BST281/chip/Alignment_Post_Processing_15193.bam: No such file or directory
-bash: line 25: /home/songdongyuan/BST281/chip/Alignment_Post_Processing_15223.bam: No such file or directory
-bash: lin

Show the fastqc reports.

In [15]:
FileLinks(os.path.join('./fastqc_output'), included_suffixes=['.html'])

The reports show that the quality is fine. Use the BAM file for next step.

### Peak Calling

Use MACS2 do peak calling. Notice some parameters: file type is BAMPE, q = 0.01.

In [17]:
%%bash
sbatch MACS2.sh

#!/bin/bash
#SBATCH -p general
#SBATCH -J macs2
#SBATCH -n 4
#SBATCH -N 1
#SBATCH -t 0-10:00
#SBATCH --mem 8000
#SBATCH -o macs2.out
#SBATCH -e macs2.err
#SBATCH --mail-type=ALL
#SBATCH --mail-user=dsong@hsph.harvard.edu

cd /n/home08/songdongyuan/BST281/

source new-modules.sh
module load macs2/2.1.1.20160309-fasrc01

macs2 callpeak -t ~/BST281/chip/Alignment_Post_Processing_15005.bam --outdir ~/BST281/macs2_output -f BAMPE -g hs -n 15005 -q 0.01
macs2 callpeak -t ~/BST281/chip/Alignment_Post_Processing_15009.bam --outdir ~/BST281/macs2_output -f BAMPE -g hs -n 15009 -q 0.01
macs2 callpeak -t ~/BST281/chip/Alignment_Post_Processing_15022.bam --outdir ~/BST281/macs2_output -f BAMPE -g hs -n 15022 -q 0.01
macs2 callpeak -t ~/BST281/chip/Alignment_Post_Processing_15175.bam --outdir ~/BST281/macs2_output -f BAMPE -g hs -n 15175 -q 0.01
macs2 callpeak -t ~/BST281/chip/Alignment_Post_Processing_15180.bam --outdir ~/BST281/macs2_output -f BAMPE -g hs -n 15180 -q 0.01
macs2 callpeak -t ~/BST281/chip/Alignment_Post_Processing_15193.bam --outdir ~/BST281/macs2_output -f BAMPE -g hs -n 15193 -q 0.01
macs2 callpeak -t ~/BST281/chip/Alignment_Post_Processing_15223.bam --outdir ~/BST281/macs2_output -f BAMPE -g hs -n 15223 -q 0.01
macs2 callpeak -t ~/BST281/chip/Alignment_Post_Processing_15280.bam --outdir ~/BST281/macs2_output -f BAMPE -g hs -n 15280 -q 0.01

-bash: line 1: sbatch: command not found
-bash: line 15: cd: /n/home08/songdongyuan/BST281/: No such file or directory
-bash: line 17: new-modules.sh: No such file or directory
-bash: line 18: module: command not found
-bash: line 20: macs2: command not found
-bash: line 21: macs2: command not found
-bash: line 22: macs2: command not found
-bash: line 23: macs2: command not found
-bash: line 24: macs2: command not found
-bash: line 25: macs2: command not found
-bash: line 26: macs2: command not found
-bash: line 27: macs2: command not found


In [18]:
FileLinks(os.path.join('./macs2_output'), included_suffixes=['.xls'])

### Differential Binding Analysis