In [1]:
from nltk import pos_tag
from nltk.corpus import sentiwordnet as swn
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from gensim.corpora import Dictionary
from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.similarities import SparseTermSimilarityMatrix, SparseMatrixSimilarity, SoftCosineSimilarity
from gensim.matutils import softcossim
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from statistics import stdev, mean
import pandas as pd
import numpy as np
import os, json

In [2]:
# Relative paths to the per-subreddit SocialSent lexicon files (tab-separated).
socialsent_smoke_filepath = './lexicons/subreddits/electronic_cigarette.tsv'
socialsent_askscience_filepath = './lexicons/subreddits/askscience.tsv'
socialsent_science_filepath = './lexicons/subreddits/science.tsv'

# Load the electronic_cigarette lexicon. Column names are supplied via `names`,
# so the first line of the file is read as data rather than as a header.
socialsent_df = pd.read_csv(socialsent_smoke_filepath, sep='\t', names=['word', 'avg_sent_score', 'std_sent_score'])
# Sanity check: is the token 'burnt' present in the lexicon's word column?
print('burnt' in socialsent_df['word'].values)

True


In [3]:
def read_files(directory):
    """Read every file below each top-level entry of ``directory``.

    Each entry returned by ``os.listdir(directory)`` becomes one subset key.

    Args:
        directory: Path whose immediate children are the subsets.

    Returns:
        A ``(word2vec_texts, doc2vec_texts)`` tuple of dicts keyed by subset:
        * ``word2vec_texts[subset]`` is a list of token lists, one per
          non-empty line of every file (tokenized with ``word_tokenize``).
        * ``doc2vec_texts[subset]`` is a list with one string per file, with
          newlines replaced by spaces.
    """
    word2vec_texts = {}
    doc2vec_texts = {}
    for subset in os.listdir(directory):
        word2vec_texts[subset] = []
        doc2vec_texts[subset] = []
        for root, _, files in os.walk(os.path.join(directory, subset)):
            for filename in files:
                # `with` guarantees the handle is closed even if read() fails.
                with open(os.path.join(root, filename), 'r') as handle:
                    text = handle.read()
                for sentence in text.split('\n'):
                    # Tokenize once and reuse the result for the emptiness check.
                    tokens = word_tokenize(sentence)
                    if len(tokens) > 0:
                        word2vec_texts[subset].append(tokens)
                doc2vec_texts[subset].append(text.replace('\n', ' '))
    return word2vec_texts, doc2vec_texts

In [4]:
def read_raw_files(directory):
    """Read and tokenize every file below each top-level entry of ``directory``.

    Args:
        directory: Path whose immediate children (as listed by ``os.listdir``)
            are the subsets.

    Returns:
        Dict mapping each subset name to a list of token lists, one per
        non-empty line of every file found under that subset.
    """
    raw_texts = {}
    for subset in os.listdir(directory):
        raw_texts[subset] = []
        for root, _, files in os.walk(os.path.join(directory, subset)):
            for filename in files:
                # `with` guarantees the handle is closed even if read() fails.
                with open(os.path.join(root, filename), 'r') as handle:
                    text = handle.read()
                for sentence in text.split('\n'):
                    # Tokenize once and reuse the result for the emptiness check.
                    tokens = word_tokenize(sentence)
                    if len(tokens) > 0:
                        raw_texts[subset].append(tokens)
    return raw_texts

In [5]:
def build_unigram_model(texts):
    """Flatten each subset's sentences and return its unigrams.

    Args:
        texts: Dict mapping a subset name to a list of token sequences.

    Returns:
        Dict mapping each subset name to the list produced by
        ``ngrams(tokens, 1)`` over all tokens of that subset, in order.
    """
    def flatten(sentences):
        tokens = []
        for sentence in sentences:
            tokens.extend(sentence)
        return tokens

    return {name: list(ngrams(flatten(sentences), 1)) for name, sentences in texts.items()}

In [6]:
def evaluate_unigram_model(unigrams, tagged_words, top_n=100):
    """Compare the most frequent words of every pair of subsets.

    Args:
        unigrams: Dict mapping subset name to a list of 1-tuples ``(word,)``.
        tagged_words: Dict mapping subset name to a list of hashable tagged
            words; must contain every key of ``unigrams``.
        top_n: How many of the most frequent items to keep per subset
            (defaults to 100, the previously hard-coded value).

    Returns:
        ``(common_words, common_tagged_words, senti_scores)`` where
        * ``common_words[s1][s2]`` is a one-element list holding the words of
          s1's top list that also appear in s2's top list (s1's order kept);
        * ``common_tagged_words[s1][s2]`` is the same for tagged words;
        * ``senti_scores[s1]`` is the first value returned by
          ``sentiwordnet_scores(common_tagged_words[s1], True)``.
    """
    # Build each subset's top lists exactly once instead of once per pair.
    top_words = {}
    top_tagged = {}
    for subset in unigrams:
        top_words[subset] = [gram[0] for gram, _ in Counter(unigrams[subset]).most_common(top_n)]
        top_tagged[subset] = [item for item, _ in Counter(tagged_words[subset]).most_common(top_n)]
    # Sets give O(1) membership tests; items are hashable (they were Counter keys).
    top_words_lookup = {subset: set(words) for subset, words in top_words.items()}
    top_tagged_lookup = {subset: set(items) for subset, items in top_tagged.items()}

    common_words = {}
    common_tagged_words = {}
    for subset1 in unigrams:
        common_words[subset1] = {}
        common_tagged_words[subset1] = {}
        for subset2 in unigrams:
            # The extra outer list is part of the existing return shape.
            common_words[subset1][subset2] = [
                [word for word in top_words[subset1] if word in top_words_lookup[subset2]]
            ]
            common_tagged_words[subset1][subset2] = [
                [item for item in top_tagged[subset1] if item in top_tagged_lookup[subset2]]
            ]

    senti_scores = {}
    for subset in common_tagged_words:
        senti_scores[subset], _ = sentiwordnet_scores(common_tagged_words[subset], True)
    return common_words, common_tagged_words, senti_scores

In [7]:
def get_wordnet_pos(tag):
    """Map a POS tag to a one-letter WordNet POS code by its first letter.

    Tags starting with J, V, N or R map to 'a', 'v', 'n' and 'r'
    respectively; any other tag yields the empty string.
    """
    prefix_to_code = (('J', 'a'), ('V', 'v'), ('N', 'n'), ('R', 'r'))
    for prefix, code in prefix_to_code:
        if tag.startswith(prefix):
            return code
    return ''

In [8]:
def get_mpqa_pos(tag):
    """Map a POS tag to an MPQA part-of-speech label by its first letter.

    Tags starting with J, V, N or R map to 'adj', 'verb', 'noun' and
    'adverb' respectively; any other tag yields 'anypos'.
    """
    labels_by_prefix = (('J', 'adj'), ('V', 'verb'), ('N', 'noun'), ('R', 'adverb'))
    return next(
        (label for prefix, label in labels_by_prefix if tag.startswith(prefix)),
        'anypos',
    )

In [9]:
def sentiwordnet_scores(texts, tagged=False):
    """Aggregate SentiWordNet scores per subset.

    Every token whose POS tag maps to a WordNet POS (via ``get_wordnet_pos``)
    and that has at least one SentiWordNet synset is scored with its FIRST
    synset. The token is classified as positive, negative or objective
    according to which of the three scores is largest (ties resolve in that
    order) and only the winning score is accumulated.

    Args:
        texts: Dict mapping subset name to a list of lines. Each line is a
            list of tokens, or a list of ``(token, tag)`` pairs when
            ``tagged`` is true.
        tagged: When false, each line is tagged with ``pos_tag`` first.

    Returns:
        ``(senti_scores, word_list)``: per-subset statistics dict and the
        per-subset list of ``(token, tag)`` pairs that were scored.

    Notes:
        * Averages and ratios divide by ``total_num_words``; they stay 0 when
          a subset has no scorable word (previously a ZeroDivisionError).
        * A standard deviation is reported once a class has at least two
          scores, the minimum ``statistics.stdev`` needs (the former ``> 2``
          guard silently dropped the two-score case).
        * ``total_num_objective_posts`` is never updated here and stays 0.
        * NOTE(review): ``overall_score`` is negative minus positive average;
          confirm this sign convention is intended.
    """
    senti_scores = {}
    word_list = {}
    for subset in texts:
        stats = {
            'total_positive_score': 0,
            'total_negative_score': 0,
            'total_objective_score': 0,
            'average_positive_score': 0,
            'std_dev_positive_score': 0,
            'average_negative_score': 0,
            'std_dev_negative_score': 0,
            'average_objective_score': 0,
            'std_dev_objective_score': 0,
            'overall_score': 0,
            'total_num_objective_posts': 0,
            'total_num_words': 0,
            'positive_count': 0,
            'negative_count': 0,
            'objective_count': 0,
            'positive_ratio': 0,
            'negative_ratio': 0
        }
        senti_scores[subset] = stats
        pos_scores = []
        neg_scores = []
        obj_scores = []
        word_list[subset] = []
        for line in texts[subset]:
            tagged_line = line if tagged else pos_tag(line)
            for tag in tagged_line:
                wordnet_pos = get_wordnet_pos(tag[1])
                if wordnet_pos == '':
                    continue  # POS has no WordNet equivalent
                synsets_list = list(swn.senti_synsets(tag[0], wordnet_pos))
                if len(synsets_list) == 0:
                    continue  # word unknown to SentiWordNet
                word_list[subset].append(tag)
                stats['total_num_words'] += 1
                synset = synsets_list[0]
                pos_score = synset.pos_score()
                neg_score = synset.neg_score()
                obj_score = synset.obj_score()
                top_score = max(pos_score, neg_score, obj_score)
                if pos_score == top_score:
                    stats['positive_count'] += 1
                    stats['total_positive_score'] += pos_score
                    pos_scores.append(pos_score)
                elif neg_score == top_score:
                    stats['negative_count'] += 1
                    stats['total_negative_score'] += neg_score
                    neg_scores.append(neg_score)
                elif obj_score == top_score:
                    stats['objective_count'] += 1
                    stats['total_objective_score'] += obj_score
                    obj_scores.append(obj_score)

        total_words = stats['total_num_words']
        if total_words > 0:
            stats['average_positive_score'] = round(stats['total_positive_score'] / total_words, 4)
            stats['average_negative_score'] = round(stats['total_negative_score'] / total_words, 4)
            stats['average_objective_score'] = round(stats['total_objective_score'] / total_words, 4)
            stats['positive_ratio'] = round(stats['positive_count'] / total_words, 4)
            stats['negative_ratio'] = round(stats['negative_count'] / total_words, 4)
        # stdev needs at least two data points.
        stats['std_dev_positive_score'] = round(stdev(pos_scores), 4) if len(pos_scores) >= 2 else 0
        stats['std_dev_negative_score'] = round(stdev(neg_scores), 4) if len(neg_scores) >= 2 else 0
        stats['std_dev_objective_score'] = round(stdev(obj_scores), 4) if len(obj_scores) >= 2 else 0
        stats['overall_score'] = stats['average_negative_score'] - stats['average_positive_score']

    return senti_scores, word_list

In [10]:
def mpqa_scores(texts, tagged=False, mpqa_filepath='./lexicons/mpqa/mpqa.json'):
    """Aggregate MPQA subjectivity-lexicon polarity counts per subset.

    The lexicon is a JSON object mapping a word to a dict of
    ``{part_of_speech: polarity}``. A token is scored when the lexicon has an
    entry for its POS (from ``get_mpqa_pos``) or an ``'anypos'`` entry.
    Polarities ``'positive'``, ``'negative'`` and ``'neutral'`` contribute a
    score of 1 to the matching class; ``'both'`` is always treated as
    positive.

    Args:
        texts: Dict mapping subset name to a list of lines. Each line is a
            list of tokens, or of ``(token, tag)`` pairs when ``tagged``.
        tagged: When false, each line is tagged with ``pos_tag`` first.
        mpqa_filepath: Location of the MPQA JSON lexicon.

    Returns:
        Dict mapping subset name to its statistics dict.

    Notes:
        * Per-class averages divide by that class's count and stay 0 when the
          class is empty (previously a ZeroDivisionError); ratios stay 0 when
          no word matched.
        * Standard deviations are 0 when a class has fewer than two scores
          (previously ``statistics.stdev`` raised).
        * A matched entry whose polarity is none of the four known labels is
          counted in ``total_num_words`` only; it used to be counted as
          positive.
        * ``total_num_objective_posts`` is never updated here and stays 0.
        * NOTE(review): because every score is 1, each non-empty class
          average is 1.0, so ``overall_score`` carries little information.
    """
    with open(mpqa_filepath, 'r') as lexicon_file:
        mpqa = json.load(lexicon_file)
    scores_by_subset = {}
    for subset in texts:
        stats = {
            'total_positive_score': 0,
            'total_negative_score': 0,
            'total_objective_score': 0,
            'average_positive_score': 0,
            'std_dev_positive_score': 0,
            'average_negative_score': 0,
            'std_dev_negative_score': 0,
            'average_objective_score': 0,
            'std_dev_objective_score': 0,
            'overall_score': 0,
            'total_num_objective_posts': 0,
            'total_num_words': 0,
            'positive_count': 0,
            'negative_count': 0,
            'objective_count': 0,
            'positive_ratio': 0,
            'negative_ratio': 0
        }
        scores_by_subset[subset] = stats
        pos_scores = []
        neg_scores = []
        obj_scores = []
        for line in texts[subset]:
            tagged_line = line if tagged else pos_tag(line)
            for tag in tagged_line:
                word = tag[0]
                pos = get_mpqa_pos(tag[1])
                if word not in mpqa:
                    continue
                entry = mpqa[word]
                if pos not in entry and 'anypos' not in entry:
                    continue
                stats['total_num_words'] += 1
                # Polarity recorded for any POS and for this specific POS
                # (None when the respective key is absent).
                labels = (entry.get('anypos'), entry.get(pos))
                if 'positive' in labels or ('negative' not in labels and 'neutral' not in labels and 'both' in labels):
                    # 'both' is only reached when no other label matched and is
                    # always labelled positive.
                    stats['total_positive_score'] += 1
                    stats['positive_count'] += 1
                    pos_scores.append(1)
                elif 'negative' in labels:
                    stats['total_negative_score'] += 1
                    stats['negative_count'] += 1
                    neg_scores.append(1)
                elif 'neutral' in labels:
                    stats['total_objective_score'] += 1
                    stats['objective_count'] += 1
                    obj_scores.append(1)

        if stats['positive_count'] > 0:
            stats['average_positive_score'] = round(stats['total_positive_score'] / stats['positive_count'], 4)
        if stats['negative_count'] > 0:
            stats['average_negative_score'] = round(stats['total_negative_score'] / stats['negative_count'], 4)
        if stats['objective_count'] > 0:
            stats['average_objective_score'] = round(stats['total_objective_score'] / stats['objective_count'], 4)
        # stdev needs at least two data points.
        stats['std_dev_positive_score'] = round(stdev(pos_scores), 4) if len(pos_scores) >= 2 else 0
        stats['std_dev_negative_score'] = round(stdev(neg_scores), 4) if len(neg_scores) >= 2 else 0
        stats['std_dev_objective_score'] = round(stdev(obj_scores), 4) if len(obj_scores) >= 2 else 0
        stats['overall_score'] = stats['average_negative_score'] - stats['average_positive_score']
        if stats['total_num_words'] > 0:
            stats['positive_ratio'] = round(stats['positive_count'] / stats['total_num_words'], 4)
            stats['negative_ratio'] = round(stats['negative_count'] / stats['total_num_words'], 4)

    return scores_by_subset

In [11]:
def socialsent_scores(texts, collection='smoke'):
    """Aggregate SocialSent (subreddit lexicon) scores per subset.

    The lexicon's ``avg_sent_score`` is divided by 10. A token whose scaled
    magnitude is below 0.1 counts as objective (score 1); otherwise it is
    positive or negative with the scaled magnitude as its score.

    Args:
        texts: Dict mapping subset name to a list of token lists.
        collection: Which lexicon to use: ``'smoke'``
            (electronic_cigarette), ``'science'`` or ``'askscience'``.

    Returns:
        Dict mapping subset name to its statistics dict.

    Raises:
        ValueError: If ``collection`` is not a known lexicon name
            (previously this surfaced as an UnboundLocalError).

    Notes:
        * Per-class averages divide by that class's count and stay 0 when the
          class is empty (previously a ZeroDivisionError); ratios stay 0 when
          no token matched.
        * Standard deviations are 0 when a class has fewer than two scores
          (previously ``statistics.stdev`` raised).
        * ``total_num_objective_posts`` is never updated here and stays 0.
    """
    lexicon_paths = {
        'smoke': './lexicons/subreddits/electronic_cigarette.tsv',
        'askscience': './lexicons/subreddits/askscience.tsv',
        'science': './lexicons/subreddits/science.tsv',
    }
    if collection not in lexicon_paths:
        raise ValueError(
            'unknown collection %r; expected one of %s' % (collection, sorted(lexicon_paths))
        )
    socialsent_df = pd.read_csv(lexicon_paths[collection], sep='\t', names=['word', 'avg_sent_score', 'std_sent_score'])

    # One dict lookup per token instead of two full scans of the lexicon.
    # setdefault keeps the first row when a word is listed more than once.
    score_by_word = {}
    for lexicon_word, lexicon_score in zip(socialsent_df['word'], socialsent_df['avg_sent_score']):
        score_by_word.setdefault(lexicon_word, lexicon_score)

    scores_by_subset = {}
    for subset in texts:
        stats = {
            'total_positive_score': 0,
            'total_negative_score': 0,
            'total_objective_score': 0,
            'average_positive_score': 0,
            'std_dev_positive_score': 0,
            'average_negative_score': 0,
            'std_dev_negative_score': 0,
            'average_objective_score': 0,
            'std_dev_objective_score': 0,
            'overall_score': 0,
            'total_num_objective_posts': 0,
            'total_num_words': 0,
            'positive_count': 0,
            'negative_count': 0,
            'objective_count': 0,
            'positive_ratio': 0,
            'negative_ratio': 0
        }
        scores_by_subset[subset] = stats
        pos_scores = []
        neg_scores = []
        obj_scores = []
        for line in texts[subset]:
            for word in line:
                if word not in score_by_word:
                    continue
                stats['total_num_words'] += 1
                sent_score = score_by_word[word]
                if abs(sent_score / 10) < 0.1:
                    stats['total_objective_score'] += 1
                    stats['objective_count'] += 1
                    obj_scores.append(1)
                elif sent_score > 0:
                    pos_score = sent_score / 10
                    stats['total_positive_score'] += pos_score
                    stats['positive_count'] += 1
                    pos_scores.append(pos_score)
                elif sent_score < 0:
                    neg_score = abs(sent_score / 10)
                    stats['total_negative_score'] += neg_score
                    stats['negative_count'] += 1
                    neg_scores.append(neg_score)

        if stats['positive_count'] > 0:
            stats['average_positive_score'] = round(stats['total_positive_score'] / stats['positive_count'], 4)
        if stats['negative_count'] > 0:
            stats['average_negative_score'] = round(stats['total_negative_score'] / stats['negative_count'], 4)
        if stats['objective_count'] > 0:
            stats['average_objective_score'] = round(stats['total_objective_score'] / stats['objective_count'], 4)
        # stdev needs at least two data points.
        stats['std_dev_positive_score'] = round(stdev(pos_scores), 4) if len(pos_scores) >= 2 else 0
        stats['std_dev_negative_score'] = round(stdev(neg_scores), 4) if len(neg_scores) >= 2 else 0
        stats['std_dev_objective_score'] = round(stdev(obj_scores), 4) if len(obj_scores) >= 2 else 0
        stats['overall_score'] = stats['average_negative_score'] - stats['average_positive_score']
        if stats['total_num_words'] > 0:
            stats['positive_ratio'] = round(stats['positive_count'] / stats['total_num_words'], 4)
            stats['negative_ratio'] = round(stats['negative_count'] / stats['total_num_words'], 4)

    return scores_by_subset

In [12]:
def build_word2vec_model_and_vocabulary(texts):
    """Train one Word2Vec model on all subsets and collect per-subset vocabularies.

    Args:
        texts: Dict mapping subset name to a list of token lists.

    Returns:
        ``(model, vocabulary)`` where ``model`` is a ``Word2Vec`` trained on
        the sentences of every subset combined, and ``vocabulary[subset]`` is
        the list of distinct tokens of that subset in first-seen order.
    """
    whole_text = []
    vocabulary = {}

    for subset in texts:
        seen = set()  # O(1) membership; the list below keeps first-seen order
        ordered_vocabulary = []
        for line in texts[subset]:
            whole_text.append(line)
            for word in line:
                if word not in seen:
                    seen.add(word)
                    ordered_vocabulary.append(word)
        vocabulary[subset] = ordered_vocabulary
    # Leave one core free, but never ask for fewer than one worker;
    # os.cpu_count() may return None or 1.
    cores = max(1, (os.cpu_count() or 1) - 1)
    # `size` / `iter` are the parameter names this notebook's gensim version uses.
    model = Word2Vec(sentences=whole_text,
                     min_count=1,
                     window=2,
                     size=300,
                     iter=40,
                     workers=cores)
    return model, vocabulary

In [13]:
def build_doc2vec_model(texts):
    """Train a Doc2Vec model in which each document is tagged with its subset.

    Args:
        texts: Dict mapping subset name to a list of document strings.

    Returns:
        The trained ``Doc2Vec`` model. Every document of a subset shares the
        single tag ``subset``.
    """
    tagged_data = []
    for subset in texts:
        tagged_data += [TaggedDocument(words=word_tokenize(sentence), tags=[subset]) for sentence in texts[subset]]
    # Leave one core free, but never ask for fewer than one worker;
    # os.cpu_count() may return None or 1.
    cores = max(1, (os.cpu_count() or 1) - 1)
    model = Doc2Vec(size=300, alpha=0.025, min_alpha=0.00025, min_count=1, workers=cores)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=40)
    return model

In [14]:
def compute_word2vec_wmdistance(model, texts):
    """Compute the Word Mover's Distance between every pair of subsets.

    Args:
        model: Model exposing ``init_sims`` and ``wv.wmdistance``. Note that
            ``init_sims(replace=True)`` is called on it before any distance
            is computed.
        texts: Dict mapping subset name to a list of strings; each string is
            tokenized with ``word_tokenize`` and all tokens of a subset are
            concatenated.

    Returns:
        Nested dict ``wmdistances[subset1][subset2]`` covering all ordered
        pairs, including each subset with itself.
    """
    model.init_sims(replace=True)
    tokens_by_subset = {
        name: [token for sentence in sentences for token in word_tokenize(sentence)]
        for name, sentences in texts.items()
    }
    return {
        left: {
            right: model.wv.wmdistance(left_tokens, right_tokens)
            for right, right_tokens in tokens_by_subset.items()
        }
        for left, left_tokens in tokens_by_subset.items()
    }

In [15]:
def compute_word2vec_soft_cosine_similarity(model, texts, subsets1, subsets2):
    """Soft cosine similarity between subsets, using pre-tokenized texts.

    Each subset is treated as one document made of all its tokens. A
    ``Dictionary`` and a ``SparseTermSimilarityMatrix`` are built over all
    subsets in ``texts``; the similarity of a pair is the normalized inner
    product of their bag-of-words vectors under that matrix.

    Args:
        model: Model whose ``wv`` supplies the word embeddings.
        texts: Dict mapping subset name to a list of token lists.
        subsets1: Subset names forming the rows of the result.
        subsets2: Subset names forming the columns of the result.

    Returns:
        Nested dict ``similarities[subset1][subset2]``.
    """
    subsets1 = list(subsets1)
    subsets2 = list(subsets2)
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    data = {}
    for subset in texts:
        tokens = []
        for sentence in texts[subset]:
            tokens += sentence
        data[subset] = tokens

    dictionary = Dictionary(list(data.values()))
    similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)  # construct similarity matrix

    # Convert each requested subset to bag-of-words once, not once per pair.
    bows = {}
    for subset in subsets1 + subsets2:
        if subset not in bows:
            bows[subset] = dictionary.doc2bow(data[subset])

    soft_cosine_similarities = {}
    for subset1 in subsets1:
        soft_cosine_similarities[subset1] = {}
        for subset2 in subsets2:
            soft_cosine_similarities[subset1][subset2] = similarity_matrix.inner_product(bows[subset1], bows[subset2], normalized=True)
    return soft_cosine_similarities

In [16]:
def compute_doc2vec_soft_cosine_similarity(model, texts, subsets1, subsets2):
    """Soft cosine similarity between subsets, using raw document strings.

    Each subset is treated as one document made of the ``word_tokenize``
    tokens of all its strings. A ``Dictionary`` and a
    ``SparseTermSimilarityMatrix`` are built over all subsets in ``texts``;
    the similarity of a pair is the normalized inner product of their
    bag-of-words vectors under that matrix.

    Args:
        model: Model whose ``wv`` supplies the word embeddings.
        texts: Dict mapping subset name to a list of strings.
        subsets1: Subset names forming the rows of the result.
        subsets2: Subset names forming the columns of the result.

    Returns:
        Nested dict ``similarities[subset1][subset2]``.
    """
    subsets1 = list(subsets1)
    subsets2 = list(subsets2)
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    data = {}
    for subset in texts:
        tokens = []
        for sentence in texts[subset]:
            tokens += word_tokenize(sentence)
        data[subset] = tokens

    dictionary = Dictionary(list(data.values()))
    similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)  # construct similarity matrix

    # Convert each requested subset to bag-of-words once, not once per pair.
    bows = {}
    for subset in subsets1 + subsets2:
        if subset not in bows:
            bows[subset] = dictionary.doc2bow(data[subset])

    soft_cosine_similarities = {}
    for subset1 in subsets1:
        soft_cosine_similarities[subset1] = {}
        for subset2 in subsets2:
            soft_cosine_similarities[subset1][subset2] = similarity_matrix.inner_product(bows[subset1], bows[subset2], normalized=True)
    return soft_cosine_similarities

In [17]:
def compute_word2vec_cosine_similarity(model, vocabulary, subsets1, subsets2):
    """Cosine similarity between the concatenated word vectors of subsets.

    For every subset, the vectors of all its vocabulary words known to the
    model are concatenated into one flat list. For each pair the shorter
    list is right-padded with zeros to the length of the longer one before
    ``cosine_similarity`` is applied.

    Args:
        model: Model exposing ``wv.vocab`` and ``wv[word]``.
        vocabulary: Dict mapping subset name to a list of words.
        subsets1: Subset names forming the rows of the result.
        subsets2: Subset names forming the columns of the result.

    Returns:
        Nested dict ``cosine_similarities[subset1][subset2]``.
    """
    flat_vectors = {}
    for name, words in vocabulary.items():
        for word in words:
            if word not in model.wv.vocab:
                continue
            flat_vectors.setdefault(name, [])
            flat_vectors[name] += list(model.wv[word])

    cosine_similarities = {}
    for left in subsets1:
        row = cosine_similarities[left] = {}
        for right in subsets2:
            left_vector = flat_vectors[left]
            right_vector = flat_vectors[right]
            width = max(len(left_vector), len(right_vector))
            # Pad copies rather than the stored lists, so nothing needs restoring.
            left_padded = left_vector + [0] * (width - len(left_vector))
            right_padded = right_vector + [0] * (width - len(right_vector))
            row[right] = cosine_similarity([left_padded], [right_padded], dense_output=False)[0][0]
    return cosine_similarities

In [18]:
def compute_word2vec_cosine_similarity_test(model, vocabulary, subsets1, subsets2):
    """Cosine similarity between the mean word vectors of subsets.

    For every subset, the vectors of all its vocabulary words known to the
    model are averaged; each pair is then compared with
    ``cosine_similarity``.

    Args:
        model: Model exposing ``wv.vocab`` and ``wv[word]``.
        vocabulary: Dict mapping subset name to a list of words.
        subsets1: Subset names forming the rows of the result.
        subsets2: Subset names forming the columns of the result.

    Returns:
        Nested dict ``cosine_similarities[subset1][subset2]``.

    Raises:
        KeyError: If a requested subset has no word known to the model.
    """
    # Keep a running sum and count per subset. This avoids the repeated
    # np.vstack and also handles a subset with a single known word, which
    # used to be stored as a 1-D array and crashed the averaging step.
    vector_sums = {}
    vector_counts = {}
    for subset in vocabulary:
        for word in vocabulary[subset]:
            if word in model.wv.vocab:
                vector = model.wv[word]
                if subset not in vector_sums:
                    vector_sums[subset] = np.zeros(len(vector))
                    vector_counts[subset] = 0
                vector_sums[subset] = np.add(vector_sums[subset], vector)
                vector_counts[subset] += 1
    averages = {subset: vector_sums[subset] / vector_counts[subset] for subset in vector_sums}

    cosine_similarities = {}
    for subset1 in subsets1:
        cosine_similarities[subset1] = {}
        for subset2 in subsets2:
            cosine_similarities[subset1][subset2] = cosine_similarity([averages[subset1]], [averages[subset2]], dense_output=False)[0][0]
    return cosine_similarities

In [19]:
def compute_doc2vec_cosine_similarity(model, texts, subsets1, subsets2):
    """Similarity between an inferred subset vector and trained document tags.

    For each ``subset1`` a vector is inferred from all of its tokens and
    compared against every document vector of the model via
    ``docvecs.most_similar``; the similarity reported for ``subset2`` is the
    one listed for the tag of that name.

    Args:
        model: Model exposing ``infer_vector`` and ``docvecs``.
        texts: Dict mapping subset name to a list of strings (tokenized with
            ``word_tokenize``).
        subsets1: Subset names whose vectors are inferred (rows).
        subsets2: Document tags to look up (columns).

    Returns:
        Nested dict ``cosine_similarities[subset1][subset2]``.

    Raises:
        KeyError: If a ``subset2`` tag is absent from the model's results
            (previously a bare StopIteration escaped from ``next``).
    """
    cosine_similarities = {}
    for subset1 in subsets1:
        tokens = []
        for sentence in texts[subset1]:
            tokens += word_tokenize(sentence)
        inferred_vector = model.infer_vector(tokens)
        # Index the ranking by tag; setdefault keeps the first hit per tag.
        similarity_by_tag = {}
        for tag, similarity in model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs)):
            similarity_by_tag.setdefault(tag, similarity)
        cosine_similarities[subset1] = {}
        for subset2 in subsets2:
            if subset2 not in similarity_by_tag:
                raise KeyError('no document vector tagged %r in the model' % (subset2,))
            cosine_similarities[subset1][subset2] = similarity_by_tag[subset2]
    return cosine_similarities

In [20]:
def write_sentiwordnet_scores(collection, senti_scores):
    """Write per-subset SentiWordNet statistics to a CSV file.

    Args:
        collection: Name used as the file-name prefix.
        senti_scores: Dict mapping subset name to a statistics dict; each
            subset becomes one row.

    The file is written to
    ``./results/sentiwordnet_scores/<collection>_sentiwordnet_scores.csv``.
    """
    sentiwordnet_scores_directory = './results/sentiwordnet_scores/'
    # makedirs also creates a missing './results' parent and tolerates an
    # already existing directory.
    os.makedirs(sentiwordnet_scores_directory, exist_ok=True)
    df = pd.DataFrame.from_dict(senti_scores, orient='index')
    df.to_csv(os.path.join(sentiwordnet_scores_directory, collection + '_sentiwordnet_scores.csv'))

In [21]:
def write_mpqa_scores(collection, mpqa_scores):
    """Write per-subset MPQA statistics to a CSV file.

    Args:
        collection: Name used as the file-name prefix.
        mpqa_scores: Dict mapping subset name to a statistics dict; each
            subset becomes one row.

    The file is written to ``./results/mpqa_scores/<collection>_mpqa_scores.csv``.
    """
    mpqa_scores_directory = './results/mpqa_scores/'
    # makedirs also creates a missing './results' parent and tolerates an
    # already existing directory.
    os.makedirs(mpqa_scores_directory, exist_ok=True)
    df = pd.DataFrame.from_dict(mpqa_scores, orient='index')
    df.to_csv(os.path.join(mpqa_scores_directory, collection + '_mpqa_scores.csv'))

In [22]:
def write_socialsent_scores(collection, socialsent_scores):
    """Write per-subset SocialSent statistics to a CSV file.

    Args:
        collection: Name used as the file-name prefix.
        socialsent_scores: Dict mapping subset name to a statistics dict;
            each subset becomes one row.

    The file is written to
    ``./results/socialsent_scores/<collection>_socialsent_scores.csv``.
    """
    socialsent_scores_directory = './results/socialsent_scores/'
    # makedirs also creates a missing './results' parent and tolerates an
    # already existing directory.
    os.makedirs(socialsent_scores_directory, exist_ok=True)
    df = pd.DataFrame.from_dict(socialsent_scores, orient='index')
    df.to_csv(os.path.join(socialsent_scores_directory, collection + '_socialsent_scores.csv'))

In [23]:
def write_similarity_measures(collection, similarities, similarity_type, filename_postfix, model_type='word2vec'):
    """Write a pairwise similarity table to a CSV file.

    Args:
        collection: Name used as the file-name prefix.
        similarities: Nested dict ``similarities[row][column]``.
        similarity_type: Name of the sub-directory below ``./results/``.
        filename_postfix: Trailing part of the file name (before ``.csv``).
        model_type: ``'word2vec'`` labels the file ``_word2vec_``; any other
            value labels it ``_doc2vec_``.

    The file is written to
    ``./results/<similarity_type>/<collection>_<model>_<filename_postfix>.csv``.
    """
    similarity_directory = './results/'+ similarity_type + '/'
    # makedirs also creates a missing './results' parent and tolerates an
    # already existing directory.
    os.makedirs(similarity_directory, exist_ok=True)
    df = pd.DataFrame.from_dict(similarities, orient='index')
    model_label = '_word2vec_' if model_type == 'word2vec' else '_doc2vec_'
    df.to_csv(os.path.join(similarity_directory, collection + model_label + filename_postfix + '.csv'))

In [24]:
def write_unigrams_sentiwordnet_scores(collection, senti_scores):
    """Write pairwise SentiWordNet statistics of common unigrams to a CSV file.

    Args:
        collection: Name used as the file-name prefix.
        senti_scores: Nested dict ``senti_scores[subset1][subset2][metric]``.
            Every ``subset1`` entry must contain every subset as ``subset2``.

    The table has one row per ``subset1`` and two-level columns
    ``(metric, subset2)``. It is written to
    ``./results/sentiwordnet_scores/<collection>_unigrams_sentiwordnet_scores.csv``.

    Raises:
        ValueError: If ``senti_scores`` is empty.
    """
    if not senti_scores:
        raise ValueError('senti_scores is empty; nothing to write')
    sentiwordnet_scores_directory = './results/sentiwordnet_scores/'
    # makedirs also creates a missing './results' parent and tolerates an
    # already existing directory.
    os.makedirs(sentiwordnet_scores_directory, exist_ok=True)

    subsets = list(senti_scores.keys())
    first_row = senti_scores[subsets[0]]
    # Metric names are taken from the first inner statistics dict.
    metrics = list(first_row[next(iter(first_row))].keys())

    # Fill the cells in exactly the order of the column labels
    # (metric-major, then subset) so values always sit under their own header.
    data = []
    for subset1 in subsets:
        data.append([senti_scores[subset1][subset2][metric] for metric in metrics for subset2 in subsets])

    mux = pd.MultiIndex.from_product([metrics, subsets])
    df = pd.DataFrame(data, index=subsets, columns=mux)
    df.to_csv(os.path.join(sentiwordnet_scores_directory, collection + '_unigrams_sentiwordnet_scores.csv'))

In [25]:
def write_unigrams_basic_analysis(collection, common_words):
    """Write a pairwise common-word table to a CSV file.

    Args:
        collection: Name used as the file-name prefix.
        common_words: Nested dict ``common_words[subset1][subset2]``; each
            ``subset1`` becomes one row.

    The file is written to ``./results/unigrams/<collection>_unigrams_scores.csv``.
    """
    unigrams_scores_directory = './results/unigrams/'
    # makedirs also creates a missing './results' parent and tolerates an
    # already existing directory.
    os.makedirs(unigrams_scores_directory, exist_ok=True)
    df = pd.DataFrame.from_dict(common_words, orient='index')
    df.to_csv(os.path.join(unigrams_scores_directory, collection + '_unigrams_scores.csv'))

In [26]:
# Pre-processed corpora: one sub-directory per subset.
i2b2_directory = './data/i2b2/smokers'
reuters_directory = './data/reuters/processed_data'
reddit_directory = './data/reddit/processed_data'

# Raw counterparts of the corpora above.
i2b2_raw_directory = './data/i2b2/smokers_raw'
reuters_raw_directory = './data/reuters/raw_data'
reddit_raw_directory = './data/reddit/raw_data'

# Root folder for every CSV written by the write_* helpers.
results_directory = './results'
# exist_ok makes the cell safe to re-run without a separate existence check.
os.makedirs(results_directory, exist_ok=True)

In [27]:
# Load the pre-processed corpora. Each call returns tokenized lines (word2vec input)
# and one whitespace-joined string per file (doc2vec input), both keyed by subset.
i2b2_word2vec_texts, i2b2_doc2vec_texts = read_files(i2b2_directory)
reuters_word2vec_texts, reuters_doc2vec_texts = read_files(reuters_directory)
reddit_word2vec_texts, reddit_doc2vec_texts = read_files(reddit_directory)

In [28]:
# Load the raw corpora as tokenized lines keyed by subset.
i2b2_raw_texts = read_raw_files(i2b2_raw_directory)
reuters_raw_texts = read_raw_files(reuters_raw_directory)
reddit_raw_texts = read_raw_files(reddit_raw_directory)

In [29]:
# MPQA lexicon statistics on the pre-processed, tokenized texts
# (untagged input, so mpqa_scores POS-tags each line itself).
i2b2_mpqa_scores = mpqa_scores(i2b2_word2vec_texts)
reuters_mpqa_scores = mpqa_scores(reuters_word2vec_texts)
reddit_mpqa_scores = mpqa_scores(reddit_word2vec_texts)

In [30]:
# Persist the MPQA statistics, one CSV per collection.
write_mpqa_scores('i2b2', i2b2_mpqa_scores)
write_mpqa_scores('reuters', reuters_mpqa_scores)
write_mpqa_scores('reddit', reddit_mpqa_scores)

In [31]:
# SocialSent statistics on the raw texts. i2b2 and reddit use the 'smoke'
# (electronic_cigarette) lexicon, reuters uses the 'science' lexicon.
i2b2_socialsent_scores = socialsent_scores(i2b2_raw_texts, 'smoke')
reuters_socialsent_scores = socialsent_scores(reuters_raw_texts, 'science')
reddit_socialsent_scores = socialsent_scores(reddit_raw_texts, 'smoke')

In [32]:
# Persist the SocialSent statistics, one CSV per collection.
write_socialsent_scores('i2b2', i2b2_socialsent_scores)
write_socialsent_scores('reuters', reuters_socialsent_scores)
write_socialsent_scores('reddit', reddit_socialsent_scores)

In [33]:
# SentiWordNet statistics on the pre-processed texts. The second return value
# is the per-subset list of (token, tag) pairs that were actually scored.
i2b2_senti_scores, i2b2_tagged_vocabulary = sentiwordnet_scores(i2b2_word2vec_texts)
reuters_senti_scores, reuters_tagged_vocabulary = sentiwordnet_scores(reuters_word2vec_texts)
reddit_senti_scores, reddit_tagged_vocabulary = sentiwordnet_scores(reddit_word2vec_texts)

In [34]:
# Persist the SentiWordNet statistics, one CSV per collection.
write_sentiwordnet_scores('i2b2', i2b2_senti_scores)
write_sentiwordnet_scores('reuters', reuters_senti_scores)
write_sentiwordnet_scores('reddit', reddit_senti_scores)

In [35]:
# Per-subset unigram lists built from the pre-processed, tokenized texts.
i2b2_unigrams = build_unigram_model(i2b2_word2vec_texts)
reuters_unigrams = build_unigram_model(reuters_word2vec_texts)
reddit_unigrams = build_unigram_model(reddit_word2vec_texts)

In [36]:
# For every pair of subsets: the words (and tagged words) shared by their most
# frequent items, plus SentiWordNet statistics of the shared tagged words.
i2b2_common_words, i2b2_common_tagged_words, i2b2_unigrams_senti_scores = evaluate_unigram_model(i2b2_unigrams, i2b2_tagged_vocabulary)
reuters_common_words, reuters_common_tagged_words, reuters_unigrams_senti_scores = evaluate_unigram_model(reuters_unigrams, reuters_tagged_vocabulary)
reddit_common_words, reddit_common_tagged_words, reddit_unigrams_senti_scores = evaluate_unigram_model(reddit_unigrams, reddit_tagged_vocabulary)

In [37]:
# Inspect the tagged words shared by the top lists of the 'current smoker' and 'non-smoker' subsets.
print(i2b2_common_tagged_words['current smoker']['non-smoker'])

[[('patient', 'NN'), ('history', 'NN'), ('discharge', 'NN'), ('day', 'NN'), ('patient', 'JJ'), ('admission', 'NN'), ('mg', 'NN'), ('pain', 'NN'), ('left', 'VBD'), ('normal', 'JJ'), ('report', 'NN'), ('chest', 'NN'), ('po', 'NN'), ('blood', 'NN'), ('date', 'NN'), ('hospital', 'NN'), ('time', 'NN'), ('medication', 'NN'), ('status', 'NN'), ('year', 'NN'), ('diagnosis', 'NN'), ('showed', 'VBD'), ('revealed', 'VBD'), ('examination', 'NN'), ('disease', 'NN'), ('well', 'RB'), ('course', 'NN'), ('medical', 'JJ'), ('pulmonary', 'JJ'), ('pressure', 'NN'), ('also', 'RB'), ('right', 'JJ'), ('postoperative', 'JJ'), ('past', 'JJ'), ('negative', 'JJ'), ('procedure', 'NN'), ('lung', 'NN'), ('week', 'NN'), ('allergy', 'NN'), ('physical', 'JJ'), ('signed', 'VBN'), ('room', 'NN'), ('present', 'JJ'), ('rate', 'NN'), ('condition', 'NN'), ('cardiac', 'JJ'), ('end', 'NN'), ('post', 'NN'), ('dis', 'NN'), ('artery', 'NN'), ('stable', 'JJ'), ('heart', 'NN'), ('home', 'NN'), ('principal', 'JJ'), ('summary', 'NN'

In [38]:
# deepcopy is no longer needed by this cell; the import is kept because later
# cells of the notebook may still rely on the name being in scope.
from copy import deepcopy


def _count_common_words(common_words):
    """Return a {subset1: {subset2: count}} mapping for a common-words structure.

    `common_words` maps subset1 -> subset2 -> a sequence whose first element is
    the collection of words shared by the two subsets; only the length of that
    first element is kept. The input is left untouched, so there is no need to
    deep-copy the (potentially large) word lists just to replace them by a number.
    """
    return {
        subset1: {subset2: len(entry[0]) for subset2, entry in per_subset.items()}
        for subset1, per_subset in common_words.items()
    }


i2b2_num_of_common_words = _count_common_words(i2b2_common_words)
reuters_num_of_common_words = _count_common_words(reuters_common_words)
reddit_num_of_common_words = _count_common_words(reddit_common_words)

In [39]:
# Pass the per-subset-pair common-word counts of each corpus to
# write_unigrams_basic_analysis, labelled with the corpus name.
for corpus_label, common_word_counts in (
    ('i2b2', i2b2_num_of_common_words),
    ('reuters', reuters_num_of_common_words),
    ('reddit', reddit_num_of_common_words),
):
    write_unigrams_basic_analysis(corpus_label, common_word_counts)

In [40]:
# Pass the unigram sentiment scores of each corpus to
# write_unigrams_sentiwordnet_scores, labelled with the corpus name.
for corpus_label, unigram_scores in (
    ('i2b2', i2b2_unigrams_senti_scores),
    ('reuters', reuters_unigrams_senti_scores),
    ('reddit', reddit_unigrams_senti_scores),
):
    write_unigrams_sentiwordnet_scores(corpus_label, unigram_scores)

In [41]:
# build_word2vec_model_and_vocabulary returns a (model, vocabulary) pair per corpus;
# the corpora are processed in the order i2b2, reuters, reddit.
(
    (i2b2_word2vec_model, i2b2_vocabulary),
    (reuters_word2vec_model, reuters_vocabulary),
    (reddit_word2vec_model, reddit_vocabulary),
) = map(
    build_word2vec_model_and_vocabulary,
    (i2b2_word2vec_texts, reuters_word2vec_texts, reddit_word2vec_texts),
)

In [42]:
# One doc2vec model per corpus, built from that corpus's doc2vec-style texts.
i2b2_doc2vec_model, reuters_doc2vec_model, reddit_doc2vec_model = map(
    build_doc2vec_model,
    (i2b2_doc2vec_texts, reuters_doc2vec_texts, reddit_doc2vec_texts),
)

In [43]:
# Word2vec cosine similarity of every subset against every subset, per corpus.
# The subset names are passed twice, each time as a freshly built list.
(
    i2b2_word2vec_cosine_similarities,
    reuters_word2vec_cosine_similarities,
    reddit_word2vec_cosine_similarities,
) = (
    compute_word2vec_cosine_similarity(
        w2v_model,
        w2v_vocabulary,
        list(texts_by_subset.keys()),
        list(texts_by_subset.keys()),
    )
    for w2v_model, w2v_vocabulary, texts_by_subset in (
        (i2b2_word2vec_model, i2b2_vocabulary, i2b2_word2vec_texts),
        (reuters_word2vec_model, reuters_vocabulary, reuters_word2vec_texts),
        (reddit_word2vec_model, reddit_vocabulary, reddit_word2vec_texts),
    )
)

In [44]:
# Doc2vec cosine similarity of every subset against every subset, per corpus.
# The subset names are passed twice, each time as a freshly built list.
(
    i2b2_doc2vec_cosine_similarities,
    reuters_doc2vec_cosine_similarities,
    reddit_doc2vec_cosine_similarities,
) = (
    compute_doc2vec_cosine_similarity(
        d2v_model,
        texts_by_subset,
        list(texts_by_subset.keys()),
        list(texts_by_subset.keys()),
    )
    for d2v_model, texts_by_subset in (
        (i2b2_doc2vec_model, i2b2_doc2vec_texts),
        (reuters_doc2vec_model, reuters_doc2vec_texts),
        (reddit_doc2vec_model, reddit_doc2vec_texts),
    )
)

In [45]:
# Word2vec soft-cosine similarity of every subset against every subset, per corpus.
# The subset names are passed twice, each time as a freshly built list.
(
    i2b2_word2vec_soft_cosine_similarities,
    reuters_word2vec_soft_cosine_similarities,
    reddit_word2vec_soft_cosine_similarities,
) = (
    compute_word2vec_soft_cosine_similarity(
        w2v_model,
        texts_by_subset,
        list(texts_by_subset.keys()),
        list(texts_by_subset.keys()),
    )
    for w2v_model, texts_by_subset in (
        (i2b2_word2vec_model, i2b2_word2vec_texts),
        (reuters_word2vec_model, reuters_word2vec_texts),
        (reddit_word2vec_model, reddit_word2vec_texts),
    )
)

In [46]:
# Doc2vec soft-cosine similarity of every subset against every subset, per corpus.
# The subset names are passed twice, each time as a freshly built list.
(
    i2b2_doc2vec_soft_cosine_similarities,
    reuters_doc2vec_soft_cosine_similarities,
    reddit_doc2vec_soft_cosine_similarities,
) = (
    compute_doc2vec_soft_cosine_similarity(
        d2v_model,
        texts_by_subset,
        list(texts_by_subset.keys()),
        list(texts_by_subset.keys()),
    )
    for d2v_model, texts_by_subset in (
        (i2b2_doc2vec_model, i2b2_doc2vec_texts),
        (reuters_doc2vec_model, reuters_doc2vec_texts),
        (reddit_doc2vec_model, reddit_doc2vec_texts),
    )
)

In [47]:
# Write the word2vec cosine similarities of each corpus (no explicit model argument).
for corpus_label, similarities in (
    ('i2b2', i2b2_word2vec_cosine_similarities),
    ('reuters', reuters_word2vec_cosine_similarities),
    ('reddit', reddit_word2vec_cosine_similarities),
):
    write_similarity_measures(corpus_label, similarities, 'cosine_similarity', 'cosine_similarity')

In [48]:
# Write the doc2vec cosine similarities of each corpus ('doc2vec' passed as fifth argument).
for corpus_label, similarities in (
    ('i2b2', i2b2_doc2vec_cosine_similarities),
    ('reuters', reuters_doc2vec_cosine_similarities),
    ('reddit', reddit_doc2vec_cosine_similarities),
):
    write_similarity_measures(corpus_label, similarities, 'cosine_similarity', 'cosine_similarity', 'doc2vec')

In [49]:
# Write the word2vec soft-cosine similarities of each corpus (no explicit model argument).
for corpus_label, similarities in (
    ('i2b2', i2b2_word2vec_soft_cosine_similarities),
    ('reuters', reuters_word2vec_soft_cosine_similarities),
    ('reddit', reddit_word2vec_soft_cosine_similarities),
):
    write_similarity_measures(corpus_label, similarities, 'soft_cosine_similarity', 'soft_cosine_similarity')

In [50]:
# Write the doc2vec soft-cosine similarities of each corpus ('doc2vec' passed as fifth argument).
for corpus_label, similarities in (
    ('i2b2', i2b2_doc2vec_soft_cosine_similarities),
    ('reuters', reuters_doc2vec_soft_cosine_similarities),
    ('reddit', reddit_doc2vec_soft_cosine_similarities),
):
    write_similarity_measures(corpus_label, similarities, 'soft_cosine_similarity', 'soft_cosine_similarity', 'doc2vec')

In [51]:
import random

# Fixed seed so the two sampled word lists are the same on every run.
random.seed(10)
all_synsets = swn.all_senti_synsets()

# Collect the lemma part of every SentiWordNet synset whose positive (resp. negative)
# score is exactly 1. The lists keep first-seen order (random.sample depends on it);
# the companion sets make the duplicate check O(1) instead of scanning the list
# for every synset.
pos_list = []
neg_list = []
seen_pos_words = set()
seen_neg_words = set()
for synset in all_synsets:
    # The wrapped WordNet synset is named '<lemma>.<pos>.<nn>'; keep the text before
    # the first dot (same value the previous str()-parsing produced).
    word = synset.synset.name().split('.')[0]
    if synset.pos_score() == 1:
        if word not in seen_pos_words:
            seen_pos_words.add(word)
            pos_list.append(word)
    elif synset.neg_score() == 1:
        if word not in seen_neg_words:
            seen_neg_words.add(word)
            neg_list.append(word)

# Ten words drawn without replacement from each list.
pos_list_words = random.sample(pos_list, 10)
neg_list_words = random.sample(neg_list, 10)

In [52]:
# Register the sampled positive / negative word lists as two extra entries
# ('pos_list', 'neg_list') in every per-corpus structure.
# NOTE(review): the other entries of the *_word2vec_texts dicts are lists of token
# lists (see the printed i2b2_word2vec_texts), whereas a flat list of words is
# stored here -- confirm that the similarity helpers expect this shape.
for texts in [i2b2_word2vec_texts, reuters_word2vec_texts, reddit_word2vec_texts]:
    texts['pos_list'] = pos_list_words
    texts['neg_list'] = neg_list_words

# NOTE(review): a single joined string is stored here; confirm against how the
# doc2vec helpers read the other entries.
for texts in [i2b2_doc2vec_texts, reuters_doc2vec_texts, reddit_doc2vec_texts]:
    texts['pos_list'] = ' '.join(pos_list_words)
    texts['neg_list'] = ' '.join(neg_list_words)

for vocabulary in [i2b2_vocabulary, reuters_vocabulary, reddit_vocabulary]:
    vocabulary['pos_list'] = pos_list_words
    vocabulary['neg_list'] = neg_list_words

# Word2Vec expects an iterable of sentences, each sentence being a list of tokens.
# Passing the flat word list made gensim iterate every word as a "sentence" of single
# characters, so none of the sampled words was ever trained. Each word list is
# therefore wrapped as a one-sentence corpus.
sentiment_sentences = [pos_list_words, neg_list_words]
for model in [i2b2_word2vec_model, reuters_word2vec_model, reddit_word2vec_model]:
    # Words missing from the model's vocabulary are skipped by train(), so extend the
    # vocabulary first. min_count=1 because each sampled word occurs only once here
    # and would otherwise be pruned if the model uses a higher threshold.
    model.build_vocab(sentiment_sentences, update=True, min_count=1)
    for sentence in sentiment_sentences:
        model.train([sentence], total_examples=1, epochs=40)

In [53]:
# Train every doc2vec model on the two sentiment word lists, each wrapped as a single
# TaggedDocument carrying its own tag (positive list first, then negative list).
sentiment_documents = (
    ('pos_list', pos_list_words),
    ('neg_list', neg_list_words),
)
for model in (i2b2_doc2vec_model, reuters_doc2vec_model, reddit_doc2vec_model):
    for doc_tag, doc_words in sentiment_documents:
        tagged_document = TaggedDocument(words=doc_words, tags=[doc_tag])
        model.train([tagged_document], total_examples=1, epochs=40)

In [54]:
# Show the sampled positive and negative word lists, one list per line.
print(pos_list_words, neg_list_words, sep='\n')

['balmy', 'happiness', 'excellent', 'admirability', 'unsurpassable', 'good', 'bliss', 'praise', 'homologic', 'estimable']
['dominated', 'abduction', 'scut_work', 'scrimy', 'unfortunate', 'deplorable', 'cad', 'mislead', 'disrespect', 'worst']


In [55]:
def _corpus_subsets(texts):
    """Return the subset names of `texts` without the injected sentiment lists.

    The 'pos_list' / 'neg_list' entries are excluded by name rather than by slicing
    off the last two keys, so the result does not depend on those two entries being
    the most recently inserted keys of the dict. A new list is built on every call.
    """
    return [subset for subset in texts.keys() if subset not in ('pos_list', 'neg_list')]


# Similarity of the two sentiment word lists against every original subset of each
# corpus, for each model / similarity-measure combination.
i2b2_word2vec_modified_cosine_similarities = compute_word2vec_cosine_similarity(i2b2_word2vec_model, i2b2_vocabulary, ['pos_list', 'neg_list'], _corpus_subsets(i2b2_word2vec_texts))
reuters_word2vec_modified_cosine_similarities = compute_word2vec_cosine_similarity(reuters_word2vec_model, reuters_vocabulary, ['pos_list', 'neg_list'], _corpus_subsets(reuters_word2vec_texts))
reddit_word2vec_modified_cosine_similarities = compute_word2vec_cosine_similarity(reddit_word2vec_model, reddit_vocabulary, ['pos_list', 'neg_list'], _corpus_subsets(reddit_word2vec_texts))

i2b2_doc2vec_modified_cosine_similarities = compute_doc2vec_cosine_similarity(i2b2_doc2vec_model, i2b2_doc2vec_texts, ['pos_list', 'neg_list'], _corpus_subsets(i2b2_doc2vec_texts))
reuters_doc2vec_modified_cosine_similarities = compute_doc2vec_cosine_similarity(reuters_doc2vec_model, reuters_doc2vec_texts, ['pos_list', 'neg_list'], _corpus_subsets(reuters_doc2vec_texts))
reddit_doc2vec_modified_cosine_similarities = compute_doc2vec_cosine_similarity(reddit_doc2vec_model, reddit_doc2vec_texts, ['pos_list', 'neg_list'], _corpus_subsets(reddit_doc2vec_texts))

i2b2_word2vec_modified_soft_cosine_similarities = compute_word2vec_soft_cosine_similarity(i2b2_word2vec_model, i2b2_word2vec_texts, ['pos_list', 'neg_list'], _corpus_subsets(i2b2_word2vec_texts))
reuters_word2vec_modified_soft_cosine_similarities = compute_word2vec_soft_cosine_similarity(reuters_word2vec_model, reuters_word2vec_texts, ['pos_list', 'neg_list'], _corpus_subsets(reuters_word2vec_texts))
reddit_word2vec_modified_soft_cosine_similarities = compute_word2vec_soft_cosine_similarity(reddit_word2vec_model, reddit_word2vec_texts, ['pos_list', 'neg_list'], _corpus_subsets(reddit_word2vec_texts))

i2b2_doc2vec_modified_soft_cosine_similarities = compute_doc2vec_soft_cosine_similarity(i2b2_doc2vec_model, i2b2_doc2vec_texts, ['pos_list', 'neg_list'], _corpus_subsets(i2b2_doc2vec_texts))
reuters_doc2vec_modified_soft_cosine_similarities = compute_doc2vec_soft_cosine_similarity(reuters_doc2vec_model, reuters_doc2vec_texts, ['pos_list', 'neg_list'], _corpus_subsets(reuters_doc2vec_texts))
reddit_doc2vec_modified_soft_cosine_similarities = compute_doc2vec_soft_cosine_similarity(reddit_doc2vec_model, reddit_doc2vec_texts, ['pos_list', 'neg_list'], _corpus_subsets(reddit_doc2vec_texts))

In [56]:
# Argument tuples for write_similarity_measures, in the order the results are written:
# word2vec cosine, doc2vec cosine, word2vec soft cosine, doc2vec soft cosine -- each
# for i2b2, reuters and reddit. The doc2vec rows carry the extra 'doc2vec' argument;
# the word2vec rows deliberately omit it so the helper's own default applies.
modified_similarity_outputs = [
    ('i2b2', i2b2_word2vec_modified_cosine_similarities, 'cosine_similarity', 'pos_neg_list_cosine_similarity'),
    ('reuters', reuters_word2vec_modified_cosine_similarities, 'cosine_similarity', 'pos_neg_list_cosine_similarity'),
    ('reddit', reddit_word2vec_modified_cosine_similarities, 'cosine_similarity', 'pos_neg_list_cosine_similarity'),
    ('i2b2', i2b2_doc2vec_modified_cosine_similarities, 'cosine_similarity', 'pos_neg_list_cosine_similarity', 'doc2vec'),
    ('reuters', reuters_doc2vec_modified_cosine_similarities, 'cosine_similarity', 'pos_neg_list_cosine_similarity', 'doc2vec'),
    ('reddit', reddit_doc2vec_modified_cosine_similarities, 'cosine_similarity', 'pos_neg_list_cosine_similarity', 'doc2vec'),
    ('i2b2', i2b2_word2vec_modified_soft_cosine_similarities, 'soft_cosine_similarity', 'pos_neg_list_soft_cosine_similarity'),
    ('reuters', reuters_word2vec_modified_soft_cosine_similarities, 'soft_cosine_similarity', 'pos_neg_list_soft_cosine_similarity'),
    ('reddit', reddit_word2vec_modified_soft_cosine_similarities, 'soft_cosine_similarity', 'pos_neg_list_soft_cosine_similarity'),
    ('i2b2', i2b2_doc2vec_modified_soft_cosine_similarities, 'soft_cosine_similarity', 'pos_neg_list_soft_cosine_similarity', 'doc2vec'),
    ('reuters', reuters_doc2vec_modified_soft_cosine_similarities, 'soft_cosine_similarity', 'pos_neg_list_soft_cosine_similarity', 'doc2vec'),
    ('reddit', reddit_doc2vec_modified_soft_cosine_similarities, 'soft_cosine_similarity', 'pos_neg_list_soft_cosine_similarity', 'doc2vec'),
]
for write_args in modified_similarity_outputs:
    write_similarity_measures(*write_args)

In [57]:
# Debug dump of the whole i2b2 corpus dict; the saved output shows subset names mapped
# to lists of token lists.
# NOTE(review): this prints every sentence of the corpus -- consider printing a small
# slice per subset instead to keep the saved notebook output manageable.
print(i2b2_word2vec_texts)

{'current smoker': [['eh'], ['atypical', 'chest', 'pain'], ['dis'], ['admission', 'date'], ['report', 'status'], ['discharge', 'date'], ['discharge', 'order'], ['maagtlandbelb', 'cioleboycena'], ['room'], ['service'], ['car'], ['discharge', 'patient'], ['memorial', 'day', 'pm'], ['contingent', 'upon'], ['applicable'], ['order', 'used', 'summary'], ['yes'], ['attending'], ['huyse', 'glaydsry'], ['code', 'status'], ['full', 'code'], ['disposition'], ['home'], ['discharge', 'medication'], ['ecasa', 'aspirin', 'enteric', 'coated', 'mg', 'po', 'qd', 'lisinopril', 'mg', 'po', 'qd'], ['override', 'notice'], ['override', 'added', 'weertsbreunkays', 'lie', 'shirlh', 'order', 'kcl', 'immediate', 'release', 'po', 'ref'], ['potentially', 'serious', 'interaction'], ['lisinopril', 'potassium', 'chloride'], ['reason', 'override'], ['aware', 'metformin', 'mg', 'po', 'bid', 'prilosec', 'omeprazole', 'mg', 'po', 'bid', 'atenolol', 'mg', 'po', 'qd', 'pravachol', 'pravastatin', 'mg', 'po', 'qhs'], ['food'