In [1]:
import pandas as pd
import os 
import csv
import re

In [2]:
#reading in bacteria list

with open('bac_list.csv') as f:
    reader = csv.reader(f)
    blist = list(reader)

bacteria = []

for bac in blist[2:]:
    if len(bac) > 0:
         bacteria += bac[0].split(' ')

In [3]:
# fxn to truncate at references section

def refclean(doc):
    
    citepat1 = re.compile('(1\.\s [A-Z].*\n.*\n)')
    citepat2 = re.compile('(\nREFERENCES\n)', re.IGNORECASE)

    temp = citepat2.split(doc, maxsplit=1)

    if len(temp) > 1:
        cleaned = temp[0]
        
    else:
        temp2 = citepat1.split(doc)

        if len(temp2) > 1:
            if len(temp2) > 2:
                cleaned = ' '.join(temp[:-1])
            else:
                cleaned = temp2[0]
        else:
            cleaned = temp2
            
    return cleaned

#fxn to remove urls

def urlclean(doc):
    
    urlpat = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    
    try:
        cleaned = re.sub(urlpat, ' ', doc)
    except:
        cleaned = doc
        
    return cleaned
    

# fxn to break into paragraphs

def paras(doc): 
    
    paras = doc.split('\n\n')
    paras = [para for para in paras if (len(para) > 50)]
    
    return paras
    

# fxn to only keep paragraphs w/terms appearing in bacteria list
    
def bacfilter(paras, blist):
    
    tokeep = []
    
    for para in paras:
        temp = para.lower()
        if any(check in temp.split(' ') for check in blist):
            tokeep.append(para)
    
    return tokeep

In [4]:
# Reading in and processing raw .txt files then writing filtered & cleaned to file
txtfiles = os.listdir("TxtData")


for txtfile in txtfiles:
    
    with open("TxtData/"+ txtfile, "r") as f:
        doc = f.read()
    
    
    clean1 = refclean(doc)
    clean2 = urlclean(clean1)

    try:
        topara = paras(clean2)
    except:
        print(txtfile + 'no paras')
        continue
        
  
    filtered = bacfilter(topara, bacteria)
    
    filtdoc = "\n".join(filtered)
    
    path = "Filtered0/" + txtfile[:-4] + "_FILTERED.txt"
    
    with open(path, "w") as f:
        f.write(filtdoc)


inline-supplementary-material-1.txtno paras
ML_16S rRNA gene sequencing using a k-mer based representation of shallow sub-samples.txtno paras
The gut microbiome of healthy long‐living people.txtno paras
Pre-Eclampsia- microbiota possibly playing a role.txtno paras
Estrogen-mediated gut microbiome alterations influence sexual dimorphism in metabolic syndrome in mice.txtno paras
METAGENOMICS- Genomic Analysis of Microbial Communities.txtno paras
nihms900158_A Gradient Boosting Machine for Hierarchically Clustered Data.txtno paras
Nature Medicine Volume issue 2019 [doi 10.1038_s41591-019-0483-6] Dominguez-Bello, Maria Gloria -- Gestational shaping of the maternal vaginal microbiome (1).txtno paras
