In [1]:
import os
import re
import nltk
import pickle
import fnmatch
import numpy as np
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from prettytable import PrettyTable

# Locations of the polarity lexicon, the review corpus and the pickled ranks.
path_to_polarity_dictionary = '/home/fraga/Escritorio/ML-SentiCon/senticon.es.xml'
path_to_reviews = '/home/fraga/Escritorio/corpusCine/corpusCriticasCine/'
path_to_ranks = '/home/fraga/Escritorio/Machine-Learning/Algorithms/Ordinal Logistic Regression/yfile'

# Load the pickled ranks. The context manager closes the file even when
# unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code; only point
# path_to_ranks at a file you produced yourself.
with open(path_to_ranks, 'rb') as ranksfile:
    ranks = pickle.load(ranksfile)

In [2]:
pos_format = ".review.pos"

# How many .pos files are contained? This count is used as the (exclusive)
# upper bound of the numeric file ids probed below.
# NOTE(review): '*.pos' counts every .pos file in the directory, not only
# '*.review.pos', and the loop stops at amount_of_pos - 1 -- confirm this
# bound covers the highest review id.
amount_of_pos = len(fnmatch.filter(os.listdir(path_to_reviews), '*.pos'))

# One list of lemmas per review that could be read, in increasing id order.
local_voc = []

for i in range(1, amount_of_pos):
    # Building the path to the file
    review_path = path_to_reviews + str(i) + pos_format

    try:
        # The context manager closes the file even if reading fails.
        with open(review_path, "r", encoding="ISO-8859-1") as a:
            content = a.readlines()
    except OSError:
        # No readable review file for this id: skip it, but let unrelated
        # errors (and KeyboardInterrupt) propagate instead of hiding them.
        continue

    # Getting the lemmatized word of each line (this word is in the second
    # position of each line); lines with fewer than two fields are skipped.
    aux = []
    for line in content:
        fields = line.split(' ')
        if len(fields) > 1:
            aux.append(fields[1])

    local_voc.append(aux)

# The printed figure is the number of reviews loaded (one entry per file).
print("Vocabulary length:", len(local_voc))

Vocabulary length: 3878


In [3]:
def removeStopwords(vocabulary, stop_words=None):
    """Filter a list of words down to the ones worth scoring.

    A word is kept only when it is not a stop word, is not one of the
    listed punctuation marks, is not an integer literal, and starts with a
    lowercase ASCII letter (``a``-``z``).

    Args:
        vocabulary: iterable of word strings.
        stop_words: optional iterable of words to discard. When omitted,
            the NLTK Spanish stop word list is used.

    Returns:
        A new list with the surviving words, in their original order.
    """
    T = re.compile(r'^[-+]?([1-9]\d*|0)$')
    O = re.compile('[a-z]')
    punctation_marks = ('.', ',', ':', ';',
                        '¿', '?', '!', '¡',
                        '(', ')', '[', ']',
                        '{', '}', '"', '""',
                        "'", "''")

    # Build the stop word collection once; evaluating stopwords.words()
    # inside the comprehension repeats the call and a list scan per word.
    if stop_words is None:
        stop_words = stopwords.words('spanish')
    stop_words = frozenset(stop_words)

    return [word for word in vocabulary
            if word not in stop_words
            and word not in punctation_marks
            and T.match(word) is None
            and O.match(word) is not None]

In [4]:
# Filter every review's word list, keeping one cleaned list per review
# in the same order as local_voc.
clean_voc = [removeStopwords(review_words) for review_words in local_voc]

In [5]:
# Persist the cleaned vocabulary. The file is opened with 'wb' so each run
# replaces the previous dump: in append mode every re-run adds another
# pickle to the file and pickle.load would keep returning the first one.
with open('clean vocabulary', 'wb') as vocfile:
    pickle.dump(clean_voc, vocfile)

In [6]:
# Read the polarity lexicon and map every lemma to its polarity score
with open(path_to_polarity_dictionary) as lexicon_file:
    content = lexicon_file.read()

soup = BeautifulSoup(content, 'lxml')

lemmas_lines = soup.findAll('lemma')
print("There are", len(lemmas_lines), "lemmas values")

# The key is the tag text without its first and last character
# (presumably surrounding whitespace -- confirm against the XML);
# the value is the tag's 'pol' attribute as a float.
polarity_dictionary = {
    lemma.text[1:-1]: float(lemma['pol'])
    for lemma in lemmas_lines
}

keys = list(polarity_dictionary.keys())

There are 11542 lemmas values


In [7]:
# Persist the polarity dictionary. The file is opened with 'wb' so each run
# replaces the previous dump: in append mode every re-run adds another
# pickle to the file and pickle.load would keep returning the first one.
with open('polarity values', 'wb') as lemmafile:
    pickle.dump(polarity_dictionary, lemmafile)

In [8]:
# Getting the position of the ranks
def _positions_with_rank(rank_value):
    """Return the indices where `ranks` equals `rank_value` as a 1-D array."""
    hits = np.asarray(np.where(ranks == rank_value))
    return hits.reshape((hits.shape[1], ))


ranks_one, ranks_two, ranks_three, ranks_four, ranks_five = (
    _positions_with_rank(rank_value) for rank_value in (1, 2, 3, 4, 5)
)

In [9]:
# Getting the value of the rank one reviews: the summed polarity of the
# distinct words used across those reviews, divided by the number of reviews.
sum_one = 0
amount_one = len(ranks_one)

# Distinct words in first-seen order. dict.fromkeys dedupes by hashing
# instead of rescanning a growing list for every word.
voc_one = list(dict.fromkeys(
    word for position in ranks_one for word in clean_voc[position]))

# Accumulate in vocabulary order. Membership is tested on the dictionary
# itself; testing against the `keys` list is a linear scan per word.
for word in voc_one:
    if word in polarity_dictionary:
        sum_one += polarity_dictionary[word]

value_rank_one = sum_one / amount_one

In [10]:
# Getting the value of the rank two reviews: the summed polarity of the
# distinct words used across those reviews, divided by the number of reviews.
sum_two = 0
amount_two = len(ranks_two)

# Distinct words in first-seen order. dict.fromkeys dedupes by hashing
# instead of rescanning a growing list for every word.
voc_two = list(dict.fromkeys(
    word for position in ranks_two for word in clean_voc[position]))

# Accumulate in vocabulary order. Membership is tested on the dictionary
# itself; testing against the `keys` list is a linear scan per word.
for word in voc_two:
    if word in polarity_dictionary:
        sum_two += polarity_dictionary[word]

value_rank_two = sum_two / amount_two

In [11]:
# Getting the value of the rank three reviews: the summed polarity of the
# distinct words used across those reviews, divided by the number of reviews.
sum_three = 0
amount_three = len(ranks_three)

# Distinct words in first-seen order. dict.fromkeys dedupes by hashing
# instead of rescanning a growing list for every word.
voc_three = list(dict.fromkeys(
    word for position in ranks_three for word in clean_voc[position]))

# Accumulate in vocabulary order. Membership is tested on the dictionary
# itself; testing against the `keys` list is a linear scan per word.
for word in voc_three:
    if word in polarity_dictionary:
        sum_three += polarity_dictionary[word]

value_rank_three = sum_three / amount_three

In [12]:
# Getting the value of the rank four reviews: the summed polarity of the
# distinct words used across those reviews, divided by the number of reviews.
sum_four = 0
amount_four = len(ranks_four)

# Distinct words in first-seen order. dict.fromkeys dedupes by hashing
# instead of rescanning a growing list for every word.
voc_four = list(dict.fromkeys(
    word for position in ranks_four for word in clean_voc[position]))

# Accumulate in vocabulary order. Membership is tested on the dictionary
# itself; testing against the `keys` list is a linear scan per word.
for word in voc_four:
    if word in polarity_dictionary:
        sum_four += polarity_dictionary[word]

value_rank_four = sum_four / amount_four

In [13]:
# Getting the value of the rank five reviews: the summed polarity of the
# distinct words used across those reviews, divided by the number of reviews.
sum_five = 0
amount_five = len(ranks_five)

# Distinct words in first-seen order. dict.fromkeys dedupes by hashing
# instead of rescanning a growing list for every word.
voc_five = list(dict.fromkeys(
    word for position in ranks_five for word in clean_voc[position]))

# Accumulate in vocabulary order. Membership is tested on the dictionary
# itself; testing against the `keys` list is a linear scan per word.
for word in voc_five:
    if word in polarity_dictionary:
        sum_five += polarity_dictionary[word]

value_rank_five = sum_five / amount_five

In [14]:
# Summary table: one row per rank with its computed value
t = PrettyTable(['Rank', 'Value'])

rank_rows = (
    ('One', value_rank_one),
    ('Two', value_rank_two),
    ('Three', value_rank_three),
    ('Four', value_rank_four),
    ('Five', value_rank_five),
)
for rank_label, rank_value in rank_rows:
    t.add_row([rank_label, rank_value])

print(t)

+-------+---------------------+
|  Rank |        Value        |
+-------+---------------------+
|  One  |  0.1253361823361831 |
|  Two  | 0.08458396533044492 |
| Three | 0.07270470869912267 |
|  Four | 0.08966179775280968 |
|  Five | 0.21144468546637862 |
+-------+---------------------+
