# TF-IDF

<p>É necessário instalar o pacote nltk para tokenizar os textos. Para fazer isso, execute por linha de comando:</p>
<ul>
    <li>pip install nltk</li>
    <li>python</li>
    <li>import nltk</li>
    <li>nltk.download()</li>
</ul>

In [1]:
import pandas as pd  # Manipulação de Dados
import nltk  # Processamento de Texto
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np  # Arrays, Matrizes e Funções Matemáticas
from scipy.spatial import distance  # Computação Científica

In [2]:
# Carregando dataset (CSV) com o pandas
apps = pd.read_csv("googleplaystore.csv")
apps.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


## Exemplo da tokenização do primeiro app

In [3]:
word_tokenize(apps.iloc[0]['App'])

['Photo', 'Editor', '&', 'Candy', 'Camera', '&', 'Grid', '&', 'ScrapBook']

# Calculando IDF de cada termo

### Primeiro crio o vocabulário

In [4]:
vocabulario = Counter()
for i in range(len(apps)):
    nome_app = apps.iloc[i]['App'].lower()
    for token in word_tokenize(nome_app):
        vocabulario[token] += 1

In [5]:
vocabulario.most_common()

[('-', 1627),
 (':', 998),
 ('&', 979),
 (',', 890),
 ('for', 562),
 ('free', 515),
 ('app', 333),
 (')', 321),
 ('(', 317),
 ('and', 283),
 ('the', 270),
 ('mobile', 222),
 ('video', 198),
 ('news', 198),
 ('live', 194),
 ('pro', 192),
 ('chat', 184),
 ('photo', 170),
 ('by', 163),
 ('of', 161),
 ('!', 150),
 ('my', 145),
 ('–', 142),
 ('camera', 138),
 ('2018', 138),
 ('dating', 134),
 ('editor', 131),
 ('game', 130),
 ('theme', 129),
 ('3d', 123),
 ("'s", 123),
 ('.', 120),
 ('tv', 120),
 ('games', 118),
 ('learn', 111),
 ('to', 109),
 ('2', 108),
 ('simulator', 107),
 ('with', 106),
 ('calculator', 106),
 ('manager', 103),
 ('tracker', 97),
 ('hd', 96),
 ('online', 93),
 ('launcher', 92),
 ('google', 92),
 ('android', 90),
 ('a', 87),
 ('in', 85),
 ('go', 83),
 ('kids', 81),
 ('wallpapers', 79),
 ('english', 77),
 ('car', 71),
 ('wallpaper', 71),
 ('browser', 70),
 ('maker', 69),
 ('sports', 68),
 ('weather', 68),
 ('number', 65),
 ('keyboard', 65),
 ('text', 64),
 ('new', 61),
 ('

In [6]:
len(vocabulario)

9545

### Agora cálculo do IDF

In [7]:
numero_documentos_termo = Counter()
for i in range(len(apps)):
    nome_app = apps.iloc[i]['App'].lower()
    termos = list(set(word_tokenize(nome_app)))
    for termo in termos:
        numero_documentos_termo[termo] += 1

In [8]:
# Podemos obter o numero de documentos em que cada token aparece (independente do número de vezes)
numero_documentos_termo["soccer"]

28

In [9]:
numero_documentos_termo.most_common()

[('-', 1605),
 (':', 997),
 ('&', 960),
 (',', 559),
 ('for', 551),
 ('free', 504),
 ('app', 327),
 (')', 319),
 ('(', 315),
 ('and', 278),
 ('the', 262),
 ('mobile', 222),
 ('live', 192),
 ('pro', 192),
 ('video', 185),
 ('news', 183),
 ('chat', 169),
 ('photo', 164),
 ('by', 162),
 ('of', 158),
 ('my', 145),
 ('–', 142),
 ('!', 140),
 ('2018', 138),
 ('dating', 134),
 ('game', 130),
 ('camera', 128),
 ('editor', 128),
 ('theme', 128),
 ("'s", 122),
 ('3d', 121),
 ('games', 117),
 ('tv', 111),
 ('learn', 109),
 ('2', 108),
 ('to', 107),
 ('simulator', 107),
 ('with', 106),
 ('manager', 103),
 ('calculator', 100),
 ('.', 97),
 ('hd', 96),
 ('tracker', 94),
 ('online', 93),
 ('google', 92),
 ('android', 90),
 ('launcher', 84),
 ('in', 84),
 ('go', 81),
 ('wallpapers', 79),
 ('a', 79),
 ('kids', 75),
 ('english', 72),
 ('wallpaper', 69),
 ('browser', 68),
 ('sports', 68),
 ('car', 66),
 ('maker', 63),
 ('weather', 63),
 ('text', 61),
 ('new', 61),
 ('keyboard', 60),
 ('i', 60),
 ('number

In [10]:
numero_documentos = len(apps)
print(numero_documentos)

10841


In [11]:
idf = {}
for token in vocabulario.keys():
    idf[token] = np.log(numero_documentos / numero_documentos_termo[token])

In [12]:
idf

{'photo': 4.191224093837094,
 'editor': 4.439060257741675,
 '&': 2.42415723719941,
 'candy': 6.113036691313346,
 'camera': 4.439060257741675,
 'grid': 7.681652609227191,
 'scrapbook': 9.291090521661292,
 'coloring': 5.553420903377924,
 'book': 5.28375733642882,
 'moana': 8.192478232993182,
 'u': 5.857103317176145,
 'launcher': 4.860273722817978,
 'lite': 5.2306475111148725,
 '–': 4.335263464060031,
 'free': 3.0685142535899232,
 'live': 4.03359514963351,
 'cool': 7.345180372605978,
 'themes': 6.113036691313346,
 ',': 2.9649410485061924,
 'hide': 7.0938659443250724,
 'apps': 6.246568083937869,
 'sketch': 8.192478232993182,
 '-': 1.9102114860971755,
 'draw': 6.072214696793091,
 'paint': 6.806183871873292,
 'pixel': 5.794582960194812,
 'number': 5.213553077755573,
 'art': 5.653504361934906,
 'paper': 8.597943341101347,
 'flowers': 7.681652609227191,
 'instructions': 8.597943341101347,
 'smoke': 8.597943341101347,
 'effect': 7.0938659443250724,
 'maker': 5.147955795269759,
 'infinite': 9.29

## IDF é global para o dataset! TF depende de cada documento

# Agora TF-IDF

### primeiro define-se um índice para cada termo

In [13]:
token2int = {}
indice = 0
for token in vocabulario.keys():
    token2int[token] = indice
    indice += 1

### então faz o TF e depois o TF-IDF de cada documento

In [14]:
tfs = {}
for i in range(len(apps)):  # i é o índice da linha na tabela   
    tf = np.zeros((len(vocabulario)))  # Criand um vetor de zeros do tamanho do vocabulário
    tokens_nome = word_tokenize(apps.iloc[i]['App'].lower())  # Tokenizando o nome do app da linha i
    for token in tokens_nome:
        try:
            tf[token2int[token]] += 1
        except KeyError:
            # Caso o token não exista, não faz nada
            # Se nenhum token do documento for reconhecido, ele será representado por um vetor que contém apenas zeros
            pass
    tf = tf / len(tokens_nome)  # Normalizando por meio da divisão pela quantidade de termos no nome do app
    tfs[i] = tf

In [15]:
indice_app = 988  # Mude o índice
print("Nome:", apps.iloc[i]['App'])
print(tfs[indice_app])

Nome: iHoroscope - 2018 Daily Horoscope & Astrology
[0.         0.         0.14285714 ... 0.         0.         0.        ]


In [16]:
tf_idfs = np.zeros((len(apps), len(vocabulario)))
for i in range(len(apps)):
    tokens_nome = word_tokenize(apps.iloc[i]['App'].lower())
    for token in list(set(tokens_nome)):
        tf_idfs[i][token2int[token]] = tfs[i][token2int[token]] * idf[token]  # TF * IDF

In [17]:
tf_idfs[988]

array([0.        , 0.        , 0.34630818, ..., 0.        , 0.        ,
       0.        ])

In [18]:
tfs[0]

array([0.11111111, 0.11111111, 0.33333333, ..., 0.        , 0.        ,
       0.        ])

In [19]:
tf_idfs[0]

array([0.46569157, 0.49322892, 0.80805241, ..., 0.        , 0.        ,
       0.        ])

In [20]:
def documento2tfidf(documento):
    # Transforma um documento textual na sua forma baf-of-words com TF-IDF
    tf = np.zeros((len(vocabulario)))
    tokens = word_tokenize(documento.lower())
    for token in tokens:
        try:
            tf[token2int[token]] += 1
        except KeyError:
            pass
    tf = tf / len(tokens)
    tf_idf = np.zeros(len(vocabulario))
    for token in list(set(tokens)):
        try:
            tf_idf[token2int[token]] = tf[token2int[token]] * idf[token]
        except KeyError:
            pass
    return tf_idf
        
def mais_similares(documento, n=5):
    # Retorna os n documentos mais similares ao passado como parâmetro
    tf_idf_documento = documento2tfidf(documento)
    distancias = {}
    for i in range(len(tf_idfs)):
        # Calcula-se a distância entre o documento e todos os nomes dos apps
        distancias[i] = distance.cosine(tf_idf_documento, tf_idfs[i])  # distância cossenoidal
    distancias = sorted(distancias.items(), key=lambda kv: kv[1])  # Ordenando os índices dos apps conforme a distância
    for tupla in distancias[:n]:  # imprime os n mais próximos
        # tupla = (indice_documento, distancia_cossenoidal)
        print("Distância: ", str(tupla[1]), "\nApp: ", apps.iloc[tupla[0]]['App'], "\n")

In [21]:
mais_similares('Tower Defense')

Distância:  0.509629700339586 
App:  Zombie Defense 

Distância:  0.5221367489624491 
App:  DS Tower Defence 

Distância:  0.5297371192910412 
App:  Galaxy Defense 

Distância:  0.5573500339255162 
App:  Tiny Defense 

Distância:  0.5755472407662653 
App:  Treasure Defense 



In [22]:
mais_similares('Trading Card Game')

Distância:  0.46821441292089416 
App:  WIZARD Card Game 

Distância:  0.5376848524654655 
App:  BJ card game blackjack 

Distância:  0.5740374406692349 
App:  DG Card 

Distância:  0.5758557515921419 
App:  NSE Mobile Trading 

Distância:  0.6383839664420554 
App:  BlackJack -21 Casino Card Game 



In [23]:
mais_similares('RPG Castle')

Distância:  0.4122705967063145 
App:  Castle Clash: RPG War and Strategy FR 

Distância:  0.5118628314988636 
App:  Castle Defense 2 

Distância:  0.5809304869798754 
App:  DB for Hustle Castle 

Distância:  0.5985022616210119 
App:  Castle Defense : Invasion 

Distância:  0.644061338595386 
App:  Hustle Castle: Fantasy Kingdom 



In [24]:
mais_similares('Dolphin')

Distância:  0.3087119463713883 
App:  Cute Dolphin Keyboard 

Distância:  0.4232680740837137 
App:  Dolphin and fish coloring book 

Distância:  0.49509751146249337 
App:  Dolphin Browser - Fast, Private & Adblock🐬 

Distância:  1.0 
App:  Photo Editor & Candy Camera & Grid & ScrapBook 

Distância:  1.0 
App:  Coloring book moana 



In [25]:
mais_similares('asdasd')

  dist = 1.0 - uv / np.sqrt(uu * vv)


Distância:  nan 
App:  Photo Editor & Candy Camera & Grid & ScrapBook 

Distância:  nan 
App:  Coloring book moana 

Distância:  nan 
App:  U Launcher Lite – FREE Live Cool Themes, Hide Apps 

Distância:  nan 
App:  Sketch - Draw & Paint 

Distância:  nan 
App:  Pixel Draw - Number Art Coloring Book 



# Salvando

In [26]:
import pickle

In [27]:
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [28]:
save_obj(token2int, "token2int")
np.save("tf_idfs.npy", tf_idfs)

# Checando proximidade com nomes de apps escolhidos aleatoriamente

In [29]:
indices = np.array(range(len(apps)))
np.random.shuffle(indices)
for indice in indices[:5]:
    print(apps.iloc[indice]['App'], "\n")
    mais_similares(apps.iloc[indice]['App'].lower())
    print("\n------------------------------------\n")

Digital TV 

Distância:  0.0 
App:  Digital TV 

Distância:  0.5427507089447556 
App:  Mobile TV 

Distância:  0.5506188794816429 
App:  Digital Alarm Clock 

Distância:  0.560171762726075 
App:  Digital Falak 

Distância:  0.5679588940746 
App:  Digital Clock AW-7 


------------------------------------

MyASUS - Service Center 

Distância:  0.0 
App:  MyASUS - Service Center 

Distância:  0.6451397761622895 
App:  My EF Center 

Distância:  0.6606582043526472 
App:  BP Service 

Distância:  0.6810041397180565 
App:  AQ Service 

Distância:  0.7067503682172188 
App:  DV Car Service 


------------------------------------

Archery Physics Objects Destruction Apple shooter 

Distância:  0.0 
App:  Archery Physics Objects Destruction Apple shooter 

Distância:  0.5981787181244416 
App:  Apple Daily Apple News 

Distância:  0.6766531066371648 
App:  Creative Destruction 

Distância:  0.7784323782022134 
App:  R. Physics Puzzle Game 

Distância:  0.7990315905798748 
App:  CK-12 Physics Sim