In [1]:
import xml.etree.ElementTree as et
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.wsd import lesk
import re, os, string, json

In [26]:
# Tokens dropped during cleaning: NLTK's English stopword list followed by
# every character in string.punctuation.
stopwords_list = [*stopwords.words('english'), *string.punctuation]

# i2b2 Data Preprocessing

In [2]:
# Parse the i2b2 training XML. Each RECORD element is read for its TEXT child
# and for the STATUS attribute of its SMOKING child.
i2b2_training_data_file = './data/i2b2/smokers_surrogate_train_all_version2.xml'
tree = et.parse(i2b2_training_data_file)
root = tree.getroot()
stopwords_list = stopwords.words('english') + list(string.punctuation)


def _clean_sentences(raw_text):
    """Return ``raw_text`` lower-cased, one stopword-free sentence per line.

    The first newline in ``raw_text`` is removed before tokenising. Every
    sentence found by ``sent_tokenize`` becomes one output line (terminated
    by a newline) holding its remaining tokens joined by single spaces.
    """
    lowered = raw_text.replace('\n', '', 1).lower()
    cleaned_lines = []
    for sentence in sent_tokenize(lowered):
        kept_tokens = [tok for tok in word_tokenize(sentence) if tok not in stopwords_list]
        cleaned_lines.append(' '.join(kept_tokens) + '\n')
    return ''.join(cleaned_lines)


# One dict per record: lower-cased smoking status plus the cleaned text.
data = []
for record in root.findall('RECORD'):
    cleaned_text = _clean_sentences(record.find('TEXT').text)
    smoking_status = record.find('SMOKING').get('STATUS').lower()
    data.append({'status': smoking_status, 'text': cleaned_text})

In [3]:
# Directory that receives the numbered per-record text files and labels.txt.
i2b2_output_dir = './data/i2b2/smokers/'
# makedirs(exist_ok=True) also creates missing parent directories and does
# nothing when the directory already exists, so this cell is safe to re-run
# and has no check-then-create race.
os.makedirs(i2b2_output_dir, exist_ok=True)

In [4]:
# Write each cleaned record to <n>.txt (numbered from 1 in the order of
# `data`) and append its status as the matching line of labels.txt.
# The `with` blocks close both files even if a write raises part-way through.
with open(os.path.join(i2b2_output_dir, 'labels.txt'), 'w') as labels_file:
    for record_number, record in enumerate(data, start=1):
        record_path = os.path.join(i2b2_output_dir, str(record_number) + '.txt')
        with open(record_path, 'w') as output_file:
            output_file.write(record['text'])
        labels_file.write(record['status'] + '\n')

# Reuters Data Preprocessing

In [5]:
# Root of the Reuters corpus and the name of the sub-folder (inside that
# root) that receives the cleaned copies.
reuters_dir = './data/reuters/'
reuters_output_dir = 'processed_data'
# makedirs(exist_ok=True) also creates missing parent directories and does
# nothing when the directory already exists, so this cell is safe to re-run.
os.makedirs(os.path.join(reuters_dir, reuters_output_dir), exist_ok=True)

In [6]:
# Mirror every sub-directory of reuters_dir into the processed_data folder,
# writing a cleaned copy of each document as <original name>.txt.
processed_root = os.path.join(reuters_dir, reuters_output_dir)
for category in os.listdir(reuters_dir):
    category_dir = os.path.join(reuters_dir, category)
    # Skip our own output folder, and skip stray non-directory entries
    # (e.g. hidden files), which would make the inner os.listdir raise.
    if category == reuters_output_dir or not os.path.isdir(category_dir):
        continue

    category_output_dir = os.path.join(processed_root, category)
    os.makedirs(category_output_dir, exist_ok=True)

    for filename in os.listdir(category_dir):
        # `with` guarantees the handles are closed even if tokenising fails.
        with open(os.path.join(category_dir, filename), 'r') as source_file:
            corpus = source_file.read().lower().replace('\n', ' ')

        # One output line per sentence, stopwords and punctuation removed.
        processed_corpus = ''.join(
            ' '.join(word for word in word_tokenize(sent) if word not in stopwords_list) + '\n'
            for sent in sent_tokenize(corpus)
        )

        output_path = os.path.join(category_output_dir, filename + '.txt')
        with open(output_path, 'w') as output_file:
            output_file.write(processed_corpus)

# Reddit Data Preprocessing

In [18]:
# Input (raw posts) and output (filtered, cleaned posts) JSON paths.
reddit_data_file, reddit_output_file = (
    './data/reddit/reddit_data.json',
    './data/reddit/processed_reddit_data.json',
)

In [19]:
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which can mis-decode or reject non-ASCII post text.
with open(reddit_data_file, encoding='utf-8') as reddit_file:
    reddit_data = json.load(reddit_file)
# Sanity check on the top-level JSON type; the loop below iterates over it.
print(type(reddit_data))

<class 'list'>


In [31]:
# Keyword -> sense tags accepted for that keyword. A post is kept when one of
# these tags occurs as a substring of the synset name that Lesk picks for the
# keyword in that post.
keywords_accepted_synset = dict(
    smoke=['n.02', 'n.07', 'v.01'],
    smoker=['n.01'],
    smoking=['n.01', 'v.01'],
)

In [32]:
# Bodies shorter than this many characters are ignored and the post title is
# analysed instead (threshold carried over unchanged).
min_body_chars = 6


def _mentions_accepted_sense(passage):
    """Return True if ``passage`` uses any keyword in one of its accepted senses.

    For every keyword that occurs as a substring of ``passage``, Lesk picks a
    synset from the whitespace-split passage; the keyword counts when one of
    its accepted sense tags is a substring of that synset's name.

    NOTE(review): unlike the i2b2 and Reuters cells, the passage is not
    lower-cased before matching or stopword filtering -- confirm this is
    intended.
    """
    context_tokens = passage.split()
    for keyword, accepted_tags in keywords_accepted_synset.items():
        if keyword not in passage:
            continue
        synset = lesk(context_tokens, keyword)
        # lesk() returns None when it finds no candidate synset; treat that
        # as "no accepted sense" instead of crashing on synset.name().
        if synset is None:
            continue
        synset_name = synset.name()
        if any(tag in synset_name for tag in accepted_tags):
            return True
    return False


def _strip_stopwords(passage):
    """Return the tokens of ``passage`` that are not in stopwords_list, space-joined."""
    return ' '.join(word for word in word_tokenize(passage) if word not in stopwords_list)


# Keep only posts whose body (or title, when the body is too short) uses a
# smoking keyword in an accepted sense. The field that was analysed is stored
# with stopwords removed; the other field is copied through unchanged.
processed_reddit_data = []
for post in reddit_data:
    body = post['body'] or ''  # tolerate a null body in the JSON
    use_body = len(body) >= min_body_chars
    passage = body if use_body else post['title']
    if not _mentions_accepted_sense(passage):
        continue
    cleaned_passage = _strip_stopwords(passage)
    processed_reddit_data.append({
        'subreddit': post['subreddit'],
        'title': post['title'] if use_body else cleaned_passage,
        'url': post['url'],
        'date_created': post['date_created'],
        'body': cleaned_passage if use_body else post['body'],
    })

In [33]:
# Persist the filtered posts as a single JSON array.
with open(reddit_output_file, 'w') as output:
    output.write(json.dumps(processed_reddit_data))