In [1]:
from nltk import pos_tag
from nltk.corpus import sentiwordnet as swn
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import os

In [2]:
def read_files(directory):
    texts = {}
    for dir in os.listdir(directory):
        texts[dir] = []
        for root, _, files in os.walk(os.path.join(directory, dir)):
            for file in files:
                open_file = open(os.path.join(root, file), 'r')
                text = open_file.read().split('\n')
                open_file.close()
                texts[dir] += text
    return texts

In [3]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return 'a'
    elif tag.startswith('V'):
        return 'v'
    elif tag.startswith('N'):
        return 'n'
    elif tag.startswith('R'):
        return 'r'
    else:
        return ''

In [4]:
def sentiwordnet_scores(texts):
    senti_scores = {}
    for subset in texts:
        senti_scores[subset] = {
            'total_positive_score': 0,
            'total_negative_score': 0,
            'total_objective_score': 0,
            'average_positive_score': 0,
            'average_negative_score': 0,
            'average_objective_score': 0,
            'total_num_words': 0,
            'positive_count': 0,
            'negative_count': 0,
            'objective_count': 0
        }
        for line in texts[subset]:
            tagged = pos_tag(word_tokenize(line))
            for tag in tagged:
                synsets = swn.senti_synsets(tag[0], get_wordnet_pos(tag[1])) if get_wordnet_pos(tag[1]) != '' else []
                synsets_list = list(synsets)
                if len(synsets_list) > 0:
                    senti_scores[subset]['total_num_words'] += 1
                    synset = synsets_list[0]
                    pos_score = synset.pos_score()
                    neg_score = synset.neg_score()
                    obj_score = synset.obj_score()
                    senti_scores[subset]['total_positive_score'] += pos_score
                    senti_scores[subset]['total_negative_score'] += neg_score
                    senti_scores[subset]['total_objective_score'] += obj_score
                    if pos_score > neg_score: 
                        senti_scores[subset]['positive_count'] += 1
                    elif pos_score < neg_score:
                        senti_scores[subset]['negative_count'] += 1
                    else:
                        senti_scores[subset]['objective_count'] += 1
        senti_scores[subset]['average_positive_score'] = senti_scores[subset]['total_positive_score'] / senti_scores[subset]['total_num_words']
        senti_scores[subset]['average_negative_score'] = senti_scores[subset]['total_negative_score'] / senti_scores[subset]['total_num_words']
        senti_scores[subset]['average_objective_score'] = senti_scores[subset]['total_objective_score'] / senti_scores[subset]['total_num_words']
    return senti_scores

In [5]:
def build_word2vec_model_and_vocabulary(texts):
    whole_text = []
    vocabulary = {}
    model = Word2Vec(size=100, window=5, min_count=1, negative=10, workers=4)
    for subset in texts:
        vocabulary[subset] = []
        for line in texts[subset]:
            sentence = word_tokenize(line)
            whole_text.append(sentence)
            for word in sentence:
                if word not in vocabulary[subset]:
                    vocabulary[subset].append(word)
    # model.build_vocab(sentence, update=True if len(model.wv.vocab) > 0 else False)
    # model.train(whole_text, total_words=model.corpus_count, epochs=30)
    model = Word2Vec(whole_text, size=100, window=5, min_count=1, negative=10, workers=4)
    return model, vocabulary

In [6]:
def compute_similarity(model, vocabulary):
    cosine_similarities = {}
    # jaccard_similarities = {}
    vector_lists = {}
    for subset in vocabulary:
        vector_lists[subset] = [model.wv[word] for word in vocabulary[subset]]

    for subset1 in vector_lists:
        cosine_similarities[subset1] = {}
        # jaccard_similarities[subset1] = {}
        for subset2 in vector_lists:
            cosine_similarities[subset1][subset2] = sum(sum(cosine_similarity(vector_lists[subset1], vector_lists[subset2]))) / (len(vocabulary[subset1]) * len(vocabulary[subset2]))
            # jaccard_similarities[subset1][subset2] = jaccard_score(vector_lists[subset1], vector_lists[subset2], average=None)
    return cosine_similarities

In [7]:
def write_sentiwordnet_scores(collection, senti_scores):
    sentiwordnet_scores_directory = './results/sentiwordnet_scores/'
    if not os.path.exists(sentiwordnet_scores_directory):
        os.mkdir(sentiwordnet_scores_directory)
    df = pd.DataFrame.from_dict(senti_scores, orient='index')
    df.to_csv(sentiwordnet_scores_directory + collection + '_sentiwordnet_scores.csv')

In [8]:
def write_similarity_measures(collection, similarities):
    consine_similarity_directory = './results/cosine_similarity/'
    if not os.path.exists(consine_similarity_directory):
        os.mkdir(consine_similarity_directory)
    df = pd.DataFrame.from_dict(similarities, orient='index')
    df.to_csv(consine_similarity_directory + collection + '_cosine_similarity.csv')

In [9]:
i2b2_directory = './data/i2b2/smokers'
reuters_directory = './data/reuters/processed_data'
reddit_directory = './data/reddit/processed_reddit_data.json'

results_directory = './results'
if not os.path.exists(results_directory):
    os.mkdir(results_directory)

In [10]:
i2b2_texts = read_files(i2b2_directory)
reuters_texts = read_files(reuters_directory)

In [11]:
i2b2_senti_scores = sentiwordnet_scores(i2b2_texts)
reuters_senti_scores = sentiwordnet_scores(reuters_texts)

In [12]:
write_sentiwordnet_scores('i2b2', i2b2_senti_scores)
write_sentiwordnet_scores('reuters', reuters_senti_scores)

In [13]:
i2b2_model, i2b2_vocabulary = build_word2vec_model_and_vocabulary(i2b2_texts)
reuters_model, reuters_vocabulary = build_word2vec_model_and_vocabulary(reuters_texts)

In [14]:
i2b2_cosine_similarities = compute_similarity(i2b2_model, i2b2_vocabulary)
reuters_cosine_similarities = compute_similarity(reuters_model, reuters_vocabulary)

In [15]:
write_similarity_measures('i2b2', i2b2_cosine_similarities)
write_similarity_measures('reuters', reuters_cosine_similarities)

In [16]:
print(reuters_cosine_similarities)

{'sci.crypt': {'sci.crypt': 0.8997260484860864, 'sci.electronics': 0.9077978524471398, 'sci.med': 0.9023777317886426, 'sci.space': 0.8907135498967576}, 'sci.electronics': {'sci.crypt': 0.9077978695981004, 'sci.electronics': 0.9168579432246009, 'sci.med': 0.9109681880108809, 'sci.space': 0.8985917812697923}, 'sci.med': {'sci.crypt': 0.9023777506377786, 'sci.electronics': 0.9109681974425523, 'sci.med': 0.9053187351112734, 'sci.space': 0.8932724838899536}, 'sci.space': {'sci.crypt': 0.8907135420077301, 'sci.electronics': 0.8985917496231003, 'sci.med': 0.8932724878596008, 'sci.space': 0.8818624898572911}}


In [17]:
print(i2b2_cosine_similarities)

{'current smoker': {'current smoker': 0.7914213235085851, 'non-smoker': 0.7849269804711855, 'past smoker': 0.8039560093472362, 'smoker': 0.8046947242782889, 'unknown': 0.7373772263799789}, 'non-smoker': {'current smoker': 0.7849269700027701, 'non-smoker': 0.7785348474700405, 'past smoker': 0.7973867372449124, 'smoker': 0.7981133008404323, 'unknown': 0.7313544750768218}, 'past smoker': {'current smoker': 0.8039560206495121, 'non-smoker': 0.7973867445008836, 'past smoker': 0.8167993058761264, 'smoker': 0.817640259767548, 'unknown': 0.749039657832825}, 'smoker': {'current smoker': 0.8046947183185761, 'non-smoker': 0.7981132935480467, 'past smoker': 0.8176402318952525, 'smoker': 0.8188015597781435, 'unknown': 0.7496677159127629}, 'unknown': {'current smoker': 0.7373772390738277, 'non-smoker': 0.7313544899584205, 'past smoker': 0.749039651084836, 'smoker': 0.7496677173900353, 'unknown': 0.687085862376483}}
