# Códigos Tidene - Similaridades 

Referências:
https://radimrehurek.com/gensim/tutorial.html

In [1]:
import numpy as np
import pandas as pd
import nltk
import sklearn
import gensim
import pickle

### Utilizando o sklearn para ler o vetorizador tfidf (aquele criado no Tidene_Vectorizers) para calcular semelhanças

In [2]:
import csv

class readCorpus(object):
    def __init__(self,csvfile,list_of_fields_to_read=[],tokenizer=None,encoding='utf8'):
        self.csvfile = csvfile
        self.fields = list_of_fields_to_read
        self.tokenizer = tokenizer
        self.encoding = encoding
    
    def __iter__(self):
        f = open(self.csvfile,encoding=self.encoding, errors='ignore')
        reader = csv.reader(f, delimiter=',', quoting=csv.QUOTE_MINIMAL) #separador dos campos\n",
        headers = next(reader, None)
        if (len(self.fields) <= 0):
            self.fields = headers
        selected_field_indexes = []
        for idx,field in enumerate(headers):
            if field in self.fields:
                selected_field_indexes.append(idx)

        for line in reader:
            if line:
                yield [line[idx] for idx in selected_field_indexes] if (len(selected_field_indexes)>1) else (line[selected_field_indexes[0]] if not self.tokenizer else tokenizer.tokenize(line[selected_field_indexes[0]]))
                        

In [3]:
# copiei aqui as classes definidas quando foram criados os vetorizadores... ela poderia ser importada do notebook no qual foi definida
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import *    #https://www.nltk.org/api/nltk.tokenize.html

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.tokenizer = nltk.tokenize.RegexpTokenizer("[a-zA-Z']+")
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in self.tokenizer.tokenize(doc) if (len(t)>2)]

class StemTokenizer(object):
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.tokenizer = nltk.tokenize.RegexpTokenizer("[a-zA-Z']+")
    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in self.tokenizer.tokenize(doc) if (len(t)>2)]


#### Lê os arquivos já gravados - vetorizador e matriz de representação dos documentos toy.csv


In [4]:
# COMO O VETORIZADOR FOI TREINADO COM O PREPROCESSADOR PERSONALIZADO, EH NECESSARIO QUE ELE TENHA ACESSO A CLASSE PREPROCESS

tfidf_vectorizer = pickle.load(open("tfidf_vectorizer.pickle", "rb"))
X_train_tfidf = pickle.load(open("X_train_tfidf.pickle", "rb"))

X_train_tfidf.shape   # matriz de features

(17, 943)

#### Testando o modelo com textos novos

In [5]:
test_docs = readCorpus("testtoy.csv",list_of_fields_to_read=['data'])          # ou pd.read_csv('testtoy.csv',encoding='utf8')['data']     #.values.tolist()
test_classes = pd.read_csv('testtoy.csv',encoding='utf8')['subgroup'].values.tolist()

train_classes = pd.read_csv('toy.csv',encoding='utf8')['subgroup'].values.tolist()

 #representa os documentos com o padrao treinado pelo vetorizador tfidf
X_test_tfidf = tfidf_vectorizer.transform(test_docs)


In [6]:
X_test_tfidf.shape

(8, 943)

#### Calcula similaridade cosseno entre os textos novos (testes) e os de treinamento 

In [7]:
#http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html
from sklearn.metrics.pairwise import cosine_similarity

# calcula similaridade entre o primeiro documento e cada um dos documentos da matrix

predicts=cosine_similarity(X_test_tfidf,X_train_tfidf)
print(predicts)

[[0.2068203  0.0711777  0.05750602 0.05738363 0.14375667 0.10602145
  0.07112414 0.12744874 0.12383886 0.07777014 0.02062435 0.03861613
  0.04193319 0.01694143 0.0355781  0.03688384 0.01893789]
 [0.09663616 0.02306838 0.03425474 0.58463809 0.11010998 0.38324208
  0.65953449 0.47873438 0.02590699 0.02482298 0.01001992 0.00882248
  0.02319466 0.01494955 0.01531237 0.00832083 0.01979019]
 [0.09453301 0.10078997 0.01836139 0.07562716 0.07960121 0.05771268
  0.0951439  0.07650608 0.09320655 0.02265438 0.0152423  0.01842366
  0.01283329 0.04642557 0.02327274 0.00546506 0.00712939]
 [0.13198484 0.13759623 0.01996505 0.24123981 0.05561248 0.23976371
  0.25745172 0.19669266 0.11531122 0.1460988  0.02831212 0.06047842
  0.04348693 0.03607556 0.03192598 0.02781585 0.0284788 ]
 [0.0151623  0.02260558 0.01107175 0.02835495 0.01567109 0.01753823
  0.02387818 0.03039803 0.0149737  0.36056705 0.26862345 0.26421248
  0.1771464  0.05959627 0.09912659 0.05379681 0.01791285]
 [0.0103377  0.00794832 0.0196

In [8]:
print("Similaridade Cosseno, entre cada doc novo e o conjunto inteiro")
for i,p in enumerate(predicts):
    print("real: ",test_classes[i])
    for idx,score in sorted(enumerate(predicts[0]),key = lambda x: x[1],reverse=True)[:17]:
        print ("pred: ", train_classes[idx]," Score:",score)
    print("")

Similaridade Cosseno, entre cada doc novo e o conjunto inteiro
real:  B03B00562
pred:  B03B00402  Score: 0.20682030293822073
pred:  B03B00512  Score: 0.14375666667585926
pred:  B03B00562  Score: 0.12744874080129123
pred:  B03B00566  Score: 0.12383885994365144
pred:  B03B00562  Score: 0.1060214483264906
pred:  H03F00126  Score: 0.0777701373315815
pred:  B03B00500  Score: 0.07117769780568488
pred:  B03B00562  Score: 0.07112413895173579
pred:  B03B00546  Score: 0.057506018920709036
pred:  B03B00510  Score: 0.05738363138704032
pred:  H03M00730  Score: 0.041933187698688784
pred:  H03F00102  Score: 0.03861613150985254
pred:  H03M00740  Score: 0.03688384202121075
pred:  H03M00740  Score: 0.035578097924264085
pred:  H03F00130  Score: 0.020624345842700148
pred:  H03M00746  Score: 0.018937890901341205
pred:  H03M00730  Score: 0.016941427973571697

real:  B03B00510
pred:  B03B00402  Score: 0.20682030293822073
pred:  B03B00512  Score: 0.14375666667585926
pred:  B03B00562  Score: 0.1274487408012912

#### alguns artigos para leitura
http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/
http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/
https://www.programcreek.com/python/example/100424/sklearn.metrics.pairwise.cosine_similarity
http://techinpink.com/2017/08/04/implementing-similarity-measures-cosine-similarity-versus-jaccard-similarity/
https://www.kernix.com/blog/similarity-measure-of-textual-documents_p12


#### Lendo a matriz ja montada dos textos (treinamento e teste)

In [9]:
X_train_w2v_glove_wiki_gigaword_100 = pickle.load(open("X_train_w2v_glove_wiki_gigaword_100.pickle", "rb"))
print("Formato matriz treinamento:",X_train_w2v_glove_wiki_gigaword_100.shape)   # matriz de features

X_test_w2v_glove_wiki_gigaword_100 = pickle.load(open("X_test_w2v_glove_wiki_gigaword_100.pickle", "rb"))
print("Formato matriz teste:",X_test_w2v_glove_wiki_gigaword_100.shape)   # matriz de features



Formato matriz treinamento: (17, 100)
Formato matriz teste: (8, 100)


In [30]:
#### Calculando similaridade cosseno com as representações word2vec

In [10]:
#http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html
from sklearn.metrics.pairwise import cosine_similarity

# calcula similaridade entre o primeiro documento e cada um dos documentos da matrix

predicts_w2v=cosine_similarity(X_test_w2v_glove_wiki_gigaword_100,X_train_w2v_glove_wiki_gigaword_100)
print(predicts_w2v)

[[0.9674649  0.96386214 0.95017826 0.95223056 0.90954931 0.96330483
  0.9518419  0.95093699 0.95865872 0.89490345 0.89492231 0.91426139
  0.92803791 0.91118234 0.89484721 0.8673771  0.90463559]
 [0.96067921 0.94481012 0.95306651 0.95565021 0.92225828 0.95119399
  0.95158658 0.94787    0.93549561 0.89018459 0.90755116 0.91680554
  0.92667357 0.91669357 0.90520308 0.87760071 0.92567079]
 [0.95297535 0.96128278 0.95065833 0.94092882 0.89779579 0.94720104
  0.93965872 0.94356632 0.95331578 0.87413012 0.88104602 0.90835149
  0.91659791 0.89422095 0.88923665 0.84038941 0.88534181]
 [0.93756292 0.94579747 0.92362952 0.9477134  0.89880528 0.94501417
  0.9445157  0.9337434  0.92952605 0.89058654 0.88768438 0.88685279
  0.90543784 0.90297954 0.89714067 0.87211199 0.89572481]
 [0.93391418 0.91628955 0.91629852 0.93503787 0.92086666 0.93040786
  0.94360253 0.93796038 0.89017803 0.9588876  0.9599715  0.96234023
  0.96338985 0.94796183 0.94326851 0.92500469 0.93332646]
 [0.93356924 0.92570938 0.9295

In [11]:
print("Similaridade Cosseno W2V, entre cada doc novo e o conjunto inteiro")
for i,p in enumerate(predicts_w2v):
    print("real: ",test_classes[i])
    for idx,score in sorted(enumerate(predicts_w2v[0]),key = lambda x: x[1],reverse=True)[:3]:
        print ("pred: ", train_classes[idx]," Score:",score)
    print("")

Similaridade Cosseno W2V, entre cada doc novo e o conjunto inteiro
real:  B03B00562
pred:  B03B00402  Score: 0.9674648961197683
pred:  B03B00500  Score: 0.9638621406412274
pred:  B03B00562  Score: 0.9633048329743776

real:  B03B00510
pred:  B03B00402  Score: 0.9674648961197683
pred:  B03B00500  Score: 0.9638621406412274
pred:  B03B00562  Score: 0.9633048329743776

real:  B03B00700
pred:  B03B00402  Score: 0.9674648961197683
pred:  B03B00500  Score: 0.9638621406412274
pred:  B03B00562  Score: 0.9633048329743776

real:  B03B00104
pred:  B03B00402  Score: 0.9674648961197683
pred:  B03B00500  Score: 0.9638621406412274
pred:  B03B00562  Score: 0.9633048329743776

real:  H03F00100
pred:  B03B00402  Score: 0.9674648961197683
pred:  B03B00500  Score: 0.9638621406412274
pred:  B03B00562  Score: 0.9633048329743776

real:  H03F00102
pred:  B03B00402  Score: 0.9674648961197683
pred:  B03B00500  Score: 0.9638621406412274
pred:  B03B00562  Score: 0.9633048329743776

real:  H03M00514
pred:  B03B00402