In [5]:
import nltk
import re
import os
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import random
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [6]:
# Spanish stop-word list from NLTK, kept as a list under its original name.
conj_stopwords = stopwords.words('spanish')
# Frozen copy of the same words: membership tests become hash lookups instead
# of a linear scan over the list. It is rebuilt whenever this cell is re-run.
_conj_stopwords_set = frozenset(conj_stopwords)


def removeStopWords(words):
    """Return the items of ``words`` that are not in the Spanish stop-word list.

    Args:
        words: iterable of token strings. The comparison is exact
            (case-sensitive), so callers should normalise case beforehand.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    return [word for word in words if word not in _conj_stopwords_set]

In [7]:
def clearTokens(tokens):
    """Strip non-letter characters from every token and lowercase what is left.

    Only unaccented Latin letters and the Spanish letters á, é, í, ó, ú, ñ, ü
    (in either case) are kept. Tokens that end up empty are dropped.

    Args:
        tokens: iterable of strings.

    Returns:
        A list with the cleaned, lowercased tokens in their original order.
    """
    letter = r'[a-záéíóúñüA-ZÁÉÍÓÚÑÜ]'
    # findall yields the accepted characters one by one; joining them rebuilds
    # the token without digits, punctuation or any other symbol.
    stripped = (''.join(re.findall(letter, token)) for token in tokens)
    return [kept.lower() for kept in stripped if kept]

In [8]:
def getReviews(folder):
    """Load the reviews stored in ``folder`` as a list of dicts.

    Files are grouped by the part of their name before the first '.', and the
    part after the last '.' selects how the file is read:

    * ``pos``: every line is split on single spaces and the second column is
      collected; the collected tokens go through ``clearTokens`` and
      ``removeStopWords`` and are joined with single spaces into ``'text'``.
      Lines with fewer than two columns are skipped.
    * ``xml``: the digits that follow ``rank="`` on the first line are stored
      as the integer ``'rank'``.

    Files with any other extension are ignored (they are neither opened nor
    given an entry in the result).

    Args:
        folder: path of the directory that holds the corpus files.

    Returns:
        A list of ``{'id': int, 'rank': int, 'text': str}`` dicts. ``'id'`` is
        assigned in order of first appearance, so on Python 3.7+ (insertion
        ordered dicts) it equals the dict's position in the returned list.
        ``'rank'`` stays 0 when no ``xml`` file was seen for a title and
        ``'text'`` stays ``''`` when no ``pos`` file was seen.

    Raises:
        ValueError: if the first line of an ``xml`` file has no
            ``rank="<digits>`` attribute.
    """
    reviews = dict()
    cnt = 0
    # NOTE(review): os.listdir returns entries in arbitrary order; if a title
    # has more than one .pos file, the one listed last overwrites the text of
    # the earlier ones. Confirm this is the intended behaviour for the corpus.
    for fileName in os.listdir(folder):
        parts = fileName.split('.')
        title, ext = parts[0], parts[-1]
        if ext not in ('pos', 'xml'):
            # Unrelated file: do not create an empty review for it.
            continue
        if title not in reviews:
            # 'text' defaults to an empty string (not a list) so every entry
            # has the same type, whether or not a .pos file exists.
            reviews[title] = {'id': cnt, 'rank': 0, 'text': ''}
            cnt += 1
        # NOTE(review): files are opened with the platform default encoding;
        # confirm it matches the encoding of the corpus.
        with open(os.path.join(folder, fileName)) as f:
            if ext == 'pos':
                tokens = []
                for line in f:
                    columns = line.rstrip('\n').split(' ')
                    if len(columns) < 2:
                        continue
                    tokens.append(columns[1])
                tokens = removeStopWords(clearTokens(tokens))
                reviews[title]['text'] = ' '.join(tokens)
            else:
                # Read all the digits of the attribute, not only the first one.
                match = re.search(r'rank="(\d+)', f.readline())
                if match is None:
                    raise ValueError(
                        'no rank="..." attribute on the first line of %s' % fileName
                    )
                reviews[title]['rank'] = int(match.group(1))
    return list(reviews.values())

In [9]:
def getFeatureMatrix(reviews):
    """Fit a TF-IDF vectorizer on the review texts.

    Args:
        reviews: sequence of dicts, each holding its document string under
            the 'text' key.

    Returns:
        A ``(vectorizer, X)`` pair: the fitted ``TfidfVectorizer`` and the
        matrix returned by its ``fit_transform`` for the review texts.
    """
    corpus = [entry['text'] for entry in reviews]
    tfidf = TfidfVectorizer(
        ngram_range=(1, 3),  # unigrams, bigrams and trigrams
        min_df=0.2,          # as a float: minimum share of documents a term must appear in
        max_df=0.8,          # as a float: maximum share of documents a term may appear in
        use_idf=True,
    )
    return tfidf, tfidf.fit_transform(corpus)

In [10]:
def k_means(feature_matrix, num_clusters, num_iter, random_state=None):
    """Cluster the rows of ``feature_matrix`` with k-means.

    Args:
        feature_matrix: samples to cluster, in any form accepted by
            ``KMeans.fit``.
        num_clusters: number of clusters to form.
        num_iter: maximum number of iterations of the k-means run.
        random_state: optional seed forwarded to ``KMeans``. The default
            ``None`` keeps the previous unseeded behaviour; pass an int to
            make the centroid initialisation, and therefore the resulting
            labels, repeatable across runs.

    Returns:
        The fitted ``KMeans`` estimator.
    """
    km = KMeans(
        n_clusters=num_clusters,
        max_iter=num_iter,
        init='k-means++',
        n_init=1,  # a single initialisation, so the seed fully determines the result
        random_state=random_state,
    )
    km.fit(feature_matrix)
    return km

In [11]:
# Load every review found in the corpus directory (path relative to the working directory).
reviews = getReviews(folder='corpusCriticasCine')

In [12]:
# Fit the TF-IDF vectorizer on the loaded reviews; keep both the fitted vectorizer and the matrix.
vectorizer, feature_matrix = getFeatureMatrix(reviews)

In [13]:
# Five clusters, one per possible rank value used below; at most 1000 iterations.
km_obj = k_means(feature_matrix, num_clusters=5, num_iter=1000)

In [14]:
# Cluster index assigned by k-means to each review, as a plain Python list.
clusters = km_obj.labels_.tolist()
# Zero-based rank of each review, in the same order as `reviews`.
labels = [review['rank']-1 for review in reviews]
# Vocabulary of the fitted vectorizer. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() when it exists and
# convert its array to a list to keep the type the old method returned.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()

In [15]:
# real_groups[r] collects the ids of the reviews whose zero-based rank is r.
real_groups = [list() for _ in range(5)]
for entry in reviews:
    bucket = real_groups[entry['rank'] - 1]
    bucket.append(entry['id'])

In [16]:
# k_groups[c] collects the row indices that k-means assigned to cluster c.
# `clusters` is the list form of km_obj.labels_, so it carries the same labels.
k_groups = [list() for _ in range(5)]
for row_idx, cluster_idx in enumerate(clusters):
    k_groups[cluster_idx].append(row_idx)

In [39]:
# info[r][c] = number of reviews with zero-based rank r that fell into cluster c.
rank_sets = [set(group) for group in real_groups]
cluster_sets = [set(group) for group in k_groups]
info = [
    [len(rank_sets[r] & cluster_sets[c]) for c in range(5)]
    for r in range(5)
]

In [40]:
# Print the zero-based rank of every review assigned to cluster 0.
# The loop variable is deliberately not named `id`: a module-level `id` would
# shadow the built-in id() for every later cell of the kernel session.
for review_idx in k_groups[0]:
    print(reviews[review_idx]['rank']-1)

3
1
3
2
3
3
1
1
2
3
2
2
1
1
4
2
4
3
4
4
4
3
2
1
4
1
4
0
3
3
2
1
2
2
0
2
2
4
2
3
1
3
3
2
1
1
3
1
2
1
3
2
2
2
2
0
1
0
1
2
2
1
2
2
3
2
1
3
4
2
0
2
1
3
2
1
2
2
2
0
3
4
4
1
3
4
1
3
2
2
2
3
1
3
2
1
4
3
0
3
0
1
2
3
1
4
3
4
4
4
2
2
2
3
2
2
2
2
2
3
3
4
4
3
3
3
1
3
3
4
1
2
2
3
3
1
1
0
0
4
3
1
3
1
3
4
2
1
3
1
2
1
4
2
1
4
1
4
4
4
1
3
1
2
4
2
2
1
2
2
2
3
2
3
0
1
3
1
2
4
4
2
4
1
1
1
1
4
3
0
3
2
2
3
2
2
2
4
4
3
4
1
2
2
3
4
2
1
1
2
2
3
3
1
4
3
1
1
2
4
1
4
3
3
4
2
2
2
3
2
1
4
2
4
3
1
4
1
1
2
3
4
3
2
2
2
3
4
1
3
3
1
2
4
4
1
2
3
4
4
2
1
1
4
1
1
0
3
2
2
3
3
2
1
2
0
2
1
2
4
3
1
2
3
3
0
2
3
3
3
3
3
1
3
3
0
2
1
4
0
3
1
3
3
4
2
2
2
3
2
1
3
1
1
2
3
1
2
4
3
2
2
1
3
1
4
4
4
0
3
0
3
2
3
2
1
2
1
3
2
4
2
1
2
2
3
3
2
2
0
3
1
3
0
1
2
2
1
1
3
3
2
1
1
1
3
4
1
1
4
1
1
3
2
0
1
3
1
3
2
2
3
3
3
3
3
4
4
0
2
3
2
1
2
3
2
2
2
2
3
3
3
2
3
3
1
1
3
3
2
3
0
3
2
0
4
1
2
0
3
1
2
1
3
2
0
4
1
2
3
2
3
1
3
2
2
1
0
2
2
2
2
2
4
1
2
3
2
3
4
0
2
4
3
0
2
1
2
2
1
2
1
0
1
2
1
0
2
2
2
0
0
2
2
2
3
1
3
2
1
3
3
4
2
3
3
3
2
1
3
2
1
1
1
2
2
3
1
3
0


In [41]:
# Rows are zero-based ranks, columns are k-means cluster indices; each cell counts the reviews in both groups.
info

[[119, 24, 40, 124, 44],
 [338, 46, 119, 304, 116],
 [437, 88, 199, 354, 175],
 [378, 41, 148, 219, 104],
 [183, 16, 79, 110, 73]]

In [42]:
# Homogeneity of the k-means clusters measured against the rank labels.
metrics.homogeneity_score(labels_true=labels, labels_pred=clusters)

0.004788018258848739

In [43]:
# Completeness of the k-means clusters measured against the rank labels.
metrics.completeness_score(labels_true=labels, labels_pred=clusters)

0.0050413433392722225

In [46]:
from prettytable import PrettyTable

# Render the rank-by-cluster counts held in `info` as a text table.
x = PrettyTable()

x.field_names = ["Rank/Clouster","A", "B", "C","D","E"]
for i, line in enumerate(info):
    # Build a fresh row (1-based rank label followed by the counts) instead of
    # inserting the label into `line`: the rows of `info` stay untouched, so
    # this cell can be re-run without growing them or breaking add_row.
    x.add_row([i+1] + line)


In [47]:
# Show the rank-by-cluster table as plain text.
print(x)

+---------------+-----+----+-----+-----+-----+
| Rank/Clouster |  A  | B  |  C  |  D  |  E  |
+---------------+-----+----+-----+-----+-----+
|       1       | 119 | 24 |  40 | 124 |  44 |
|       2       | 338 | 46 | 119 | 304 | 116 |
|       3       | 437 | 88 | 199 | 354 | 175 |
|       4       | 378 | 41 | 148 | 219 | 104 |
|       5       | 183 | 16 |  79 | 110 |  73 |
+---------------+-----+----+-----+-----+-----+
