In [1]:
import xml.etree.ElementTree as et
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.wsd import lesk
import re, os, string, json

In [2]:
# Tokens to discard outright: English stop words plus every single ASCII
# punctuation character.
stopwords_list = [*stopwords.words('english'), *string.punctuation]
lemmatizer = WordNetLemmatizer()
# Matches a run of two or more punctuation characters; callers use
# `reg.match`, so the run has to sit at the start of the token.
reg = re.compile('(([%s]){2,})' % re.escape(string.punctuation))

# i2b2 Data Preprocessing

In [3]:
# Parse the i2b2 training XML and group every RECORD's cleaned text under its
# lower-cased SMOKING/STATUS value: data[status] -> [{'id': ..., 'text': ...}].
i2b2_training_data_file = './data/i2b2/smokers_surrogate_train_all_version2.xml'
tree = et.parse(i2b2_training_data_file)
root = tree.getroot()
data = {}
statuses = ['smoker', 'non-smoker']
for record in root.findall('RECORD'):
    patient_id = record.get('ID')
    raw_lines = record.find('TEXT').text.split('\n')
    status = record.find('SMOKING').get('STATUS').lower()

    cleaned_rows = []
    for raw_line in raw_lines:
        kept_tokens = []
        for token in word_tokenize(raw_line):
            lowered = token.lower()
            # Skip stop words / lone punctuation, and tokens that begin with
            # a run of two or more punctuation characters.
            if lowered in stopwords_list or reg.match(token):
                continue
            kept_tokens.append(lemmatizer.lemmatize(lowered))
        cleaned_row = ' '.join(kept_tokens)
        # Lines that end up empty contribute nothing, not even a newline.
        if cleaned_row:
            cleaned_rows.append(cleaned_row + '\n')

    data.setdefault(status, []).append({
        'id': patient_id,
        'text': ''.join(cleaned_rows)
    })

In [4]:
# Create the output root and one sub-directory per smoking status found in
# `data`. The sub-directories are created unconditionally (not only when the
# root is missing), so re-running the notebook with an existing root still
# yields a folder for every status before the write cell runs.
i2b2_output_dir = './data/i2b2/smokers/'
os.makedirs(i2b2_output_dir, exist_ok=True)
for status in data:
    # exist_ok makes the cell idempotent and avoids a check-then-create race.
    os.makedirs(os.path.join(i2b2_output_dir, status), exist_ok=True)

In [5]:
# Write each record's cleaned text to <i2b2_output_dir>/<status>/<id>.txt.
for status in data:
    for record in data[status]:
        output_path = os.path.join(i2b2_output_dir, status, record['id'] + '.txt')
        # `with` closes the handle even if the write raises.
        with open(output_path, 'w') as output_file:
            output_file.write(record['text'])

# Reuters Data Preprocessing

In [6]:
# Location of the raw Reuters documents and the name of the sub-directory
# (inside `reuters_dir`) that receives the processed copies.
reuters_dir = './data/reuters/'
reuters_output_dir = 'processed_data'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it is
# idempotent, free of the check-then-create race, and also creates missing
# parent directories.
os.makedirs(os.path.join(reuters_dir, reuters_output_dir), exist_ok=True)

In [7]:
# Clean every document in each category directory under `reuters_dir` and
# write the result to the mirrored path below the processed-data directory,
# with '.txt' appended to the original file name.
for category in os.listdir(reuters_dir):
    category_dir = os.path.join(reuters_dir, category)
    # Skip the output directory itself and any stray plain files: calling
    # os.listdir on a non-directory would raise NotADirectoryError.
    if category == reuters_output_dir or not os.path.isdir(category_dir):
        continue

    category_output_dir = os.path.join(reuters_dir, reuters_output_dir, category)
    os.makedirs(category_output_dir, exist_ok=True)

    for filename in os.listdir(category_dir):
        # `with` guarantees the handle is released even if reading fails.
        with open(os.path.join(category_dir, filename), 'r') as source_file:
            sections = source_file.read().split('\n\n')

        # The first blank-line-delimited section is treated as the header and
        # reduced to its first line containing 'Subject:'. When no such line
        # exists, fall back to an empty string; leaving the list of header
        # lines in place would make the join below raise TypeError.
        header_lines = sections[0].split('\n')
        sections[0] = next((line for line in header_lines if 'Subject:' in line), '')
        corpus = ' '.join(sections)

        processed_rows = []
        for sent in sent_tokenize(corpus):
            kept_tokens = [
                lemmatizer.lemmatize(word)
                for word in word_tokenize(sent)
                # Stop words are stored lower-case, so compare case-insensitively
                # (as the i2b2 cell does); otherwise "The", "I", ... slip through.
                if word.lower() not in stopwords_list and not reg.match(word)
            ]
            # One output line per sentence, even when nothing survives filtering.
            processed_rows.append(' '.join(kept_tokens) + '\n')

        with open(os.path.join(category_output_dir, filename + '.txt'), 'w') as output_file:
            output_file.write(''.join(processed_rows))

# Reddit Data Preprocessing

In [5]:
# Destination for the filtered posts, and the raw JSON dump they are read from.
reddit_output_file = './data/reddit/processed_reddit_data.json'
reddit_data_file = './data/reddit/reddit_data.json'

In [6]:
# Load the raw Reddit dump and show its top-level JSON type as a sanity check.
with open(reddit_data_file) as reddit_json:
    reddit_data = json.load(reddit_json)
print(type(reddit_data))

<class 'list'>


In [7]:
# keyword -> fragments of WordNet synset names. A post is kept only when the
# synset that `lesk` picks for the keyword has a name containing one of the
# fragments listed for that keyword.
keywords_accepted_synset = dict(
    smoke=['n.02', 'n.07', 'v.01'],
    smoker=['n.01'],
    smoking=['n.01', 'v.01'],
)

In [11]:
# Bodies shorter than this are not analysed; the title is used instead.
# Threshold kept from the original check; its rationale is not visible
# here -- TODO confirm.
MIN_BODY_LENGTH = 6


def _clean_text(raw):
    """Return `raw` cleaned sentence by sentence, one sentence per line.

    Each sentence is tokenized, stop words / lone punctuation and tokens that
    start with a run of two or more punctuation characters are dropped, and
    the remaining tokens are lemmatized and joined with single spaces.
    Sentences that end up empty are omitted entirely.
    """
    rows = []
    for sent in sent_tokenize(raw):
        row = ' '.join(
            lemmatizer.lemmatize(word)
            # Tokenize the current sentence only: tokenizing the whole text
            # here would emit the full document once per sentence.
            for word in word_tokenize(sent)
            # Stop words are stored lower-case, so compare case-insensitively.
            if word.lower() not in stopwords_list and not reg.match(word)
        )
        if row:
            rows.append(row + '\n')
    return ''.join(rows)


def _mentions_accepted_sense(context):
    """Return True if a keyword occurs in `context` in an accepted sense.

    A keyword counts when it is a substring of `context` and the synset that
    `lesk` selects for it has a name containing one of the fragments listed
    for that keyword in `keywords_accepted_synset`.
    """
    for keyword, accepted_fragments in keywords_accepted_synset.items():
        if keyword not in context:
            continue
        synset = lesk(context.split(), keyword)
        # lesk returns None when it finds no candidate synset; treat that as
        # "not an accepted sense" instead of crashing on synset.name().
        if synset is None:
            continue
        name = synset.name()
        if any(fragment in name for fragment in accepted_fragments):
            return True
    return False


def _make_record(post, title, body):
    """Build one output record from `post` with the given title and body."""
    return {
        'subreddit': post['subreddit'],
        'title': title,
        'url': post['url'],
        'date_created': post['date_created'],
        'body': body
    }


# Keep only posts whose body (or, for very short bodies, title) mentions a
# smoking keyword in an accepted WordNet sense, and store them cleaned.
processed_reddit_data = []
for post in reddit_data:
    title = post['title']
    # Treat a missing (null) body like an empty one so len() cannot fail.
    text = post['body'] or ''
    url = post['url']

    if len(text) >= MIN_BODY_LENGTH:
        if _mentions_accepted_sense(text):
            processed_reddit_data.append(
                _make_record(post, post['title'], _clean_text(text))
            )
    elif _mentions_accepted_sense(title):
        if url and not text:
            # NOTE(review): the body is set to the literal string 'url', not
            # to the post's URL; kept as in the original -- confirm this
            # placeholder is intended.
            processed_reddit_data.append(_make_record(post, post['title'], 'url'))
        else:
            processed_reddit_data.append(
                _make_record(post, _clean_text(title), post['body'])
            )

In [12]:
with open(reddit_output_file, 'w') as output:
    json.dump(processed_reddit_data, output)