In [2]:
import nltk
from nltk import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

In [5]:
# load dictionary
f = open('subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff', "r", encoding='utf-8')
dictionary = {}
raw = f.read().split('\n')[:-1]
for line in raw:
    bar = line.split()
    # print(bar)
    word = bar[2][6:]
    # key:word -> value:strength & polarity
    if word not in dictionary:
        if bar[0][5:] == 'weaksubj':
            if bar[5][14:] == 'negative':
                dictionary[word] = -5
            elif bar[5][14:] == 'positive':
                dictionary[word] = 5
        if bar[0][5:] == 'strongsubj':
            if bar[5][14:] == 'negative':
                dictionary[word] = -10
            elif bar[5][14:] == 'positive':
                dictionary[word] = 10

# evidence of whether a word is negated
negation_set = ('no', 'not', "hasn't", "haven't", "doesn't", "don't", "isn't", "wasn't", "weren't", 'never', 'seldom', 'barely', 'rarely', 
                'hardly', 'overly', 'excessively')

In [0]:
def read_data(source, collect, corpus, neg_switch):
    f = open(source, "r", encoding='latin-1')
    raw = f.read().split('\n')[:-1]
    num = len(raw)
    if neg_switch == False:
        for rev in raw:
            words = word_tokenize(rev)
            corpus.append(words)
            for word in words:
                if word not in collect:
                    collect.append(word)
    # words in negation_set will not be considered when forming feature vector.
    # Instead, they are used to negate polarities later
    else:
        for rev in raw:
            words = word_tokenize(rev)
            corpus.append(words)
            for word in words:
                if word not in collect and word not in negation_set:
                    collect.append(word)
    return num # number of examples in current corpus

In [0]:
# pol_switch == True: only consider words from MPQA
# neg switch == True: consider if words are negated
# occ_switch == True: word occurence instead of frequency is used to build feature vectors
def body(pol_switch, neg_switch, occ_switch, dictionary):
    # label of examples
    label = []
    # row vector of BOW
    collect = []
    # tokenized reviews
    corpus = []
    neg_num = read_data('corpus2/rt-polarity.neg', collect, corpus, neg_switch)
    label.extend([-1 for i in range(neg_num)])
    pos_num = read_data('corpus2/rt-polarity.pos', collect, corpus, neg_switch)
    label.extend([1 for i in range(pos_num)])
    
    print(f'length of feature vector: {len(collect)}')
    if pol_switch == True:
        keys = list(dictionary.keys())
        dollect = []
        print(len(keys))
        for word in collect:
            if word in keys:
                dollect.append(word)
        print(f'# of words in MPQR: {len(dollect)}')

    # gather feature vectors
    rev_word_mat = [[0 for i in range(len(collect))] for j in range(len(corpus))]
    if pol_switch == False and neg_switch == False and occ_switch == False:
        for i in range(len(corpus)):
            for word in corpus[i]:
                if word in collect:
                    rev_word_mat[i][collect.index(word)] += 1
    if pol_switch == False and neg_switch == False and occ_switch == True:
        for i in range(len(corpus)):
            for word in corpus[i]:
                if word in collect:
                    rev_word_mat[i][collect.index(word)] = 1
    elif pol_switch == True and neg_switch == False and occ_switch == True:
        keys = list(dictionary.keys())
        for i in range(len(corpus)):
            for word in corpus[i]:
                if word in collect:
                    if word not in keys:
                        rev_word_mat[i][collect.index(word)] = 1
                    else: # words from MPQR, use weights in map
                        rev_word_mat[i][collect.index(word)] = dictionary[word]
    elif pol_switch == False and neg_switch == True and occ_switch == True:
        for i in range(len(corpus)):
            for j in range(len(corpus[i])):
                if corpus[i][j] in collect:
                    word = corpus[i][j]
                    if j>0 and corpus[i][j-1] in negation_set: # current word is negated, so flip polarity 
                        rev_word_mat[i][collect.index(word)] = -1
                    else:
                        rev_word_mat[i][collect.index(word)] = 1
    elif pol_switch == True and neg_switch == True and occ_switch == True:
        keys = list(dictionary.keys())
        for i in range(len(corpus)):
            for j in range(len(corpus[i])):
                if corpus[i][j] in collect:
                    word = corpus[i][j]
                    if j>0 and corpus[i][j-1] in negation_set: # current word is negated, so flip polarity
                        if corpus[i][j] in keys: # words from MPQR, use weights in map
                            rev_word_mat[i][collect.index(word)] = -dictionary[word]
                        else:
                            rev_word_mat[i][collect.index(word)] = -1
                    else:
                        if corpus[i][j] in keys: # words from MPQR, use weights in map
                            rev_word_mat[i][collect.index(word)] = dictionary[word]
                        else:
                            rev_word_mat[i][collect.index(word)] = 1

    mat_scaled = preprocessing.scale(rev_word_mat)
    clf = LogisticRegression(random_state=0, max_iter=250)
    scores = cross_val_score(clf, mat_scaled, label, cv=5)
    print(scores)

In [12]:
body(False, False, False, dictionary)

length of feature vector: 20304
[0.73230192 0.74261603 0.73123827 0.73545966 0.72748593]


In [13]:
body(False, False, True, dictionary)

length of feature vector: 20304
[0.73323957 0.73933427 0.72795497 0.73733583 0.7260788 ]


In [14]:
body(False, True, True, dictionary)

length of feature vector: 20295
[0.72433193 0.74636662 0.72420263 0.73405253 0.73780488]


In [15]:
body(True, False, True, dictionary)

length of feature vector: 20304
6452
# of words in MPQR: 3278
[0.73323957 0.73933427 0.72795497 0.73733583 0.7260788 ]


In [16]:
body(True, True, True, dictionary)

length of feature vector: 20295
6452
# of words in MPQR: 3275
[0.72433193 0.74636662 0.72420263 0.73405253 0.73780488]


In [11]:
neg_count = 0
word_count = 0
MPQR_count = 0
keys = list(dictionary.keys())
corpus = []
f = open('corpus2/rt-polarity.neg', "r", encoding='latin-1')
raw = f.read().split('\n')[:-1]
corpus.extend(raw)
f = open('corpus2/rt-polarity.pos', "r", encoding='latin-1')
raw = f.read().split('\n')[:-1]
corpus.extend(raw)
print(f'# of examples: {len(corpus)}')
for rev in corpus:
    words = word_tokenize(rev)
    word_count += len(words)
    for word in words:
        if word in negation_set:
            neg_count += 1
        if word in keys:
            MPQR_count += 1

print(f'# of negations in corpus: {neg_count}; # of words in corpus: {word_count}; # of words in corpus from MPQR: {MPQR_count}')
print(f'average sentence length: {word_count/len(corpus)} average # of words from MPQR in each sentence: {MPQR_count/len(corpus)}')

# of examples: 10662
# of negations in corpus: 1601; # of words in corpus: 230219; # of words in corpus from MPQR: 29632
average sentence length: 21.59247795910711 average # of words from MPQR in each sentence: 2.7792159069592945
