-
Notifications
You must be signed in to change notification settings - Fork 0
/
HMP data pull and processing.R
142 lines (97 loc) · 3.2 KB
/
HMP data pull and processing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
###############
# Data selection and extraction
###############
install.packages("BiocManager") #one-time installation
BiocManager::install("HMP16SData") #one-time installation
library(HMP16SData)
V35_stool <-
V35() %>%
subset(select = HMP_BODY_SUBSITE == "Stool")
V35_stool_phyloseq <- as_phyloseq(V35_stool)
knitr::opts_chunk$set(message = FALSE)
library(HMP16SData)
library(phyloseq)
library(magrittr)
library(ggplot2)
library(tibble)
library(dplyr)
library(dendextend)
library(circlize)
library(curatedMetagenomicData)
library(gridExtra)
library(cowplot)
library(readr)
library(haven)
###################
#This should export the subset data into the file types I need
V35_participant_data <-
V35() %>%
colData() %>%
as.data.frame() %>%
rownames_to_column(var = "PSN")
V35_OTU_counts <-
V35() %>%
assay() %>%
t() %>%
as.data.frame() %>%
rownames_to_column(var = "PSN")
V35_data <-
merge.data.frame(V35_participant_data, V35_OTU_counts, by = "PSN")
V35_dictionary <-
V35() %>%
rowData() %>%
t.data.frame() %>%
as.data.frame()
colnames(V35_data) <-
colnames(V35_data) %>%
gsub(pattern = "\\.", replacement = "_", x = .)
colnames(V35_dictionary) <-
colnames(V35_dictionary) %>%
gsub(pattern = "\\.", replacement = "_", x = .)
#Saving as external files
library(foreign)
write.foreign(V35_data, "hmpdata.txt", "hmpdata.sas", package="SAS")
write.foreign(V35_dictionary, "hmptax.txt", "hmptax.sas", package="SAS")
##################
##############
#HMP Data Introduction
#######################
library(readxl)
library(data.table)
library(magrittr)
library(plyr)
library(dplyr)
####Read in HMP data
hmp <- read_excel("hmpdata.txt")
setDT(hmp)
##get total numbers of observations. original number of observations was 4743
tmp = hmp[HMP_BODY_SUBSITE=="Stool"]
#219 stool samples total
test <- count(tmp, vars="RSID", wt_var="VISITNO")
m1 <- tmp %>% group_by(RSID) %>% filter(row_number()==1)
#in total, there are now 210 FIRST stool samples (which can be from visit 1 or visit 2)
#16384 varialbes, minus the demographics--so now there were
# > table(m1$VISITNO)
#
# 1 2
# 191 19
# % DETECT--we want 10%+ detect
x <- colnames(tmp)
m1[m1 == 0] <- NA
#this checks % non-detect in each column. we are going for anything 90% or less non-detect (or 10%+ detect)
m2 <- m1[lapply( m1[1:210,], function(x) sum(is.na(x)) / length(x) ) < 0.9 ]
m2[is.na(m2)]=0
#to double check that the % detect is working, and it is checking %detect at 10%+
test_detect <- m1[lapply( m1[1:210,], function(x) sum(!is.na(x)) / length(x) ) > 0.1 ]
m2 %>%
select(everything()) %>% # replace to your needs
summarise_all(funs(sum(is.na(.))))
##maybe test this, above, with the remaining 10% to see if they actually are >90% nondetect
##find just the character vars to figure out exactly how many OTUs there are
tmp %>%
select_if(is.character) %>%
head(2)
#5, below, and I think also PSID and RSID so 7 total--so 16384-7=16377
# SEX RUN_CENTER HMP_BODY_SITE HMP_BODY_SUBSITE SRS_SAMPLE_ID
#export dataset
write.csv(m2, file=, row.names=FALSE)