In [415]:
import nltk
from nltk import stem
snowball = stem.snowball.EnglishStemmer()

class Splitter(object):
    def __init__(self):
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """
        input format: a paragraph of text
        output format: a list of lists of words.
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        """
        sentences = self.nltk_splitter.tokenize(text)
        tokenized_sentences = [self.nltk_tokenizer.tokenize(sent) for sent in sentences]
        return tokenized_sentences


class POSTagger(object):
    def __init__(self):
        pass
        
    def pos_tag(self, sentences):
        """
        input format: list of lists of words
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        output format: list of lists of tagged tokens. Each tagged tokens has a
        form, a lemma, and a list of tags
            e.g: [[('this', 'this', ['DT']), ('is', 'be', ['VB']), ('a', 'a', ['DT']), ('sentence', 'sentence', ['NN'])],
                    [('this', 'this', ['DT']), ('is', 'be', ['VB']), ('another', 'another', ['DT']), ('one', 'one', ['CARD'])]]
        """

        pos = [nltk.pos_tag(sentence) for sentence in sentences]
        #adapt format
        pos = [[(word, word, [postag]) for (word, postag) in sentence] for sentence in pos]
        return pos

In [417]:
text ="What can I say about this place. \
The staff of the restaurant is nice and the eggplant is not bad. \
Apart from that, very uninspired food, lack of atmosphere and too expensive. \
I am a staunch vegetarian and was sorely dissapointed with the veggie options on the menu. \
Will be the last time I visit, I recommend others to avoid."

[-0.15625, 1.1607142857142858, 0.4166666666666667, 0.5514705882352942, -0.08928571428571429]

In [418]:
splitter = Splitter()
postagger = POSTagger()

In [419]:
splitted_sentences = splitter.split(text)
pos_tagged_sentences = postagger.pos_tag(splitted_sentences)

In [1]:
# lexicons = []
# with open("SentiWordNet.txt") as f:
#     for i in f.readlines():
#         lexicons.append(i)
# lexicons = lexicons[27:]

In [129]:
from collections import defaultdict, OrderedDict

def create_dict(text):
    pos_dict = defaultdict(int) 
    neg_dict = defaultdict(int)
    for i in range(len(text)):
        word = text[i].split("\t")[4].split("#")[0]
        if word == '':
            pass
        else:
            pos_dict[word] = float(text[i].split("\t")[2])
            neg_dict[word] = float(text[i].split("\t")[3])     
        
    return pos_dict, neg_dict

In [130]:
pos_dict = create_dict(lexicons)[0]
neg_dict = create_dict(lexicons)[1]

In [443]:
y= {"nicely": 0.25}
dict([(snowball.stem(k), v) for k, v in y.items()])

{u'nice': 0.25}

In [438]:
test_dict = dict([(snowball.stem(k), v) for k, v in pos_dict.items()])


In [444]:
test_dict_B = {snowball.stem(k): v for k, v in pos_dict.items()}


In [448]:
from collections import defaultdict

stem_words = defaultdict(list)

for k, v in pos_dict.items():
    stem_words[snowball.stem(k)].append((k, v))

stem_words['nice']

[(u'nicely', 0.25), (u'niceness', 0.625), (u'nice', 0.0)]

In [449]:
#pos_dict_sorted = OrderedDict(sorted(pos_dict.items(), key=lambda t: t[0]))
# neg_dict_sorted = OrderedDict(sorted(neg_dict.items(), key=lambda t: t[0]))

In [136]:
import json
json.dump(pos_dict, open("dicts/pos_dict", 'w'))
json.dump(neg_dict, open("dicts/neg_dict", 'w'))

# reading
#test = json.load(open('pos_dict'))

In [260]:
#files = [open(path, 'r') for path in ['dicts/pos_dict', 'dicts/neg_dict']]
#dictionaries = [json.load(open(dict_file)) for dict_file in files]
pos_dict = json.load(open('dicts/pos_dict'))
neg_dict = json.load(open('dicts/neg_dict'))
dictionaries = [pos_dict, neg_dict]

# the problem is that I have two values, degree of positive and negative, for each word.
# instead of just having positive and negative as labels.
map(lambda x: x.close(), files)
dictionary = neg_dict
max_key_size = 0

for key in dictionary:
    max_key_size = max(max_key_size, len(key))

In [451]:
import re

def tokenize(word):
    snowball = stem.snowball.EnglishStemmer()
    return snowball.stem(word.lower())

score = []

for sent in pos_tagged_sentences:
    pos_score = []
    neg_score = []
    total = 0 
    
    total += len(sent)
    for token in sent:
        if pos_dict.get(token[0]) == None:
            pass
        else:
            token[2].append(dictionary[token[0]])
            pos_score.append(dictionary[token[0]])
    for token in sent2:
        if neg_dict.get(token[0]) == None:
            pass
        else:
            token[2].append(dictionary[token[0]])
            neg_score.append(dictionary[token[0]])
    sent_score = (sum(pos_score) - sum(neg_score)) / float(total) 
    score.append(sent_score)
    
print score
senti_score = sum(score) / float(len(score))

[-0.15625, 0.0, -0.05, -0.03676470588235294, -0.08928571428571429]


In [452]:
senti_score

-0.06646008403361345