# Tokens and Embeddings

## Initial Setup

In [1]:
# Aditional installations
# !pip install -U sentence-transformers
# !pip install tf-keras
# !pip install gensim

In [2]:
import os
import torch

import numpy as np
import pandas as pd

import gensim.downloader as api

from typing import Dict, List
from urllib import request

from gensim.models import Word2Vec
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

2025-11-19 14:18:31.367000: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-19 14:18:31.424220: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-19 14:18:32.836145: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


In [3]:
# Set no proxy environmental variable
os.environ["no_proxy"] = ".experian.eeca,localhost,127.0.0.1,169.254.169.254,api,testserver,internal-brain-lb-platform-dev-1449535370.sa-east-1.elb.amazonaws.com"

## Downloading and Running an LLM

In [4]:
device: str = "cpu"

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map=device,
    torch_dtype="auto",
    trust_remote_code=False,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
prompt = "Write an email apologizing to Sarah for the tragic gardening mishap. Explain how it happened.<|assistant|>"

In [6]:
# Tokenize the input prompt
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

In [7]:
# Generate the text
generation_output: torch.Tensor = model.generate(
    input_ids=input_ids,
    max_new_tokens=75
)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [8]:
# Print the output.
print(tokenizer.decode(generation_output[0]))

Write an email apologizing to Sarah for the tragic gardening mishap. Explain how it happened.<|assistant|> Subject: Sincere Apologies for the Gardening Mishap


Dear Sarah,


I hope this message finds you well. I am writing to express my deepest apologies for the unfortunate incident that occurred in your garden yesterday.


As you know, I have always admired the beauty and tranquility of


In [9]:
print(input_ids)

tensor([[14350,   385,  4876, 27746,  5281,   304, 19235,   363,   278, 25305,
           293, 16423,   292,   286,   728,   481, 29889, 12027,  7420,   920,
           372,  9559, 29889, 32001]])


In [10]:
for id in input_ids[0]:
   print(f">>> id: {id};\ttoken: {tokenizer.decode(id)}")

>>> id: 14350;	token: Write
>>> id: 385;	token: an
>>> id: 4876;	token: email
>>> id: 27746;	token: apolog
>>> id: 5281;	token: izing
>>> id: 304;	token: to
>>> id: 19235;	token: Sarah
>>> id: 363;	token: for
>>> id: 278;	token: the
>>> id: 25305;	token: trag
>>> id: 293;	token: ic
>>> id: 16423;	token: garden
>>> id: 292;	token: ing
>>> id: 286;	token: m
>>> id: 728;	token: ish
>>> id: 481;	token: ap
>>> id: 29889;	token: .
>>> id: 12027;	token: Exp
>>> id: 7420;	token: lain
>>> id: 920;	token: how
>>> id: 372;	token: it
>>> id: 9559;	token: happened
>>> id: 29889;	token: .
>>> id: 32001;	token: <|assistant|>


## How Does the Tokenizer Break Down Text?

## Word Versus Subword Versus Character Versus Byte Tokens

## Comparing Trained LLM Tokenizers

In [11]:
colors_list = [
    "102;194;165", 
    "252;141;98", 
    "141;160;203",
    "231;138;195", 
    "166;216;84", 
    "255;217;47"
]

def show_tokens(sentence: str, tokenizer_name: str):
    
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    token_ids = tokenizer(sentence).input_ids
    
    for idx, t in enumerate(token_ids):
        
        print(
            f"\x1b[0;30;48;2;{colors_list[idx % len(colors_list)]}m" + tokenizer.decode(t) + "\x1b[0m",
            end=" "
        )

In [12]:
text = """
English and CAPITALIZATION
ðŸŽµ é¸Ÿ
show_tokens False None elif == >= else: two tabs:"      " Three tabs: "         "
12.0*50=600
"""

print(text)


English and CAPITALIZATION
ðŸŽµ é¸Ÿ
show_tokens False None elif == >= else: two tabs:"      " Three tabs: "         "
12.0*50=600



### BERT Base Model (uncased, 2018)

In [13]:
show_tokens(text, "bert-base-uncased")

[0;30;48;2;102;194;165m[CLS][0m [0;30;48;2;252;141;98menglish[0m [0;30;48;2;141;160;203mand[0m [0;30;48;2;231;138;195mcapital[0m [0;30;48;2;166;216;84m##ization[0m [0;30;48;2;255;217;47m[UNK][0m [0;30;48;2;102;194;165m[UNK][0m [0;30;48;2;252;141;98mshow[0m [0;30;48;2;141;160;203m_[0m [0;30;48;2;231;138;195mtoken[0m [0;30;48;2;166;216;84m##s[0m [0;30;48;2;255;217;47mfalse[0m [0;30;48;2;102;194;165mnone[0m [0;30;48;2;252;141;98meli[0m [0;30;48;2;141;160;203m##f[0m [0;30;48;2;231;138;195m=[0m [0;30;48;2;166;216;84m=[0m [0;30;48;2;255;217;47m>[0m [0;30;48;2;102;194;165m=[0m [0;30;48;2;252;141;98melse[0m [0;30;48;2;141;160;203m:[0m [0;30;48;2;231;138;195mtwo[0m [0;30;48;2;166;216;84mtab[0m [0;30;48;2;255;217;47m##s[0m [0;30;48;2;102;194;165m:[0m [0;30;48;2;252;141;98m"[0m [0;30;48;2;141;160;203m"[0m [0;30;48;2;231;138;195mthree[0m [0;30;48;2;166;216;84mtab[0m [0;30;48;2;255;217;47m##s[0m [0;30;48;2;102;194;165m:[0m [0;30;48;2;25

### BERT Base Model (cased, 2018)

In [14]:
show_tokens(text, "bert-base-cased")

[0;30;48;2;102;194;165m[CLS][0m [0;30;48;2;252;141;98mEnglish[0m [0;30;48;2;141;160;203mand[0m [0;30;48;2;231;138;195mCA[0m [0;30;48;2;166;216;84m##PI[0m [0;30;48;2;255;217;47m##TA[0m [0;30;48;2;102;194;165m##L[0m [0;30;48;2;252;141;98m##I[0m [0;30;48;2;141;160;203m##Z[0m [0;30;48;2;231;138;195m##AT[0m [0;30;48;2;166;216;84m##ION[0m [0;30;48;2;255;217;47m[UNK][0m [0;30;48;2;102;194;165m[UNK][0m [0;30;48;2;252;141;98mshow[0m [0;30;48;2;141;160;203m_[0m [0;30;48;2;231;138;195mtoken[0m [0;30;48;2;166;216;84m##s[0m [0;30;48;2;255;217;47mF[0m [0;30;48;2;102;194;165m##als[0m [0;30;48;2;252;141;98m##e[0m [0;30;48;2;141;160;203mNone[0m [0;30;48;2;231;138;195mel[0m [0;30;48;2;166;216;84m##if[0m [0;30;48;2;255;217;47m=[0m [0;30;48;2;102;194;165m=[0m [0;30;48;2;252;141;98m>[0m [0;30;48;2;141;160;203m=[0m [0;30;48;2;231;138;195melse[0m [0;30;48;2;166;216;84m:[0m [0;30;48;2;255;217;47mtwo[0m [0;30;48;2;102;194;165mta[0m [0;30;48;2;252;1

### GPT-2 (2019)

In [15]:
show_tokens(text, "gpt2")

[0;30;48;2;102;194;165m
[0m [0;30;48;2;252;141;98mEnglish[0m [0;30;48;2;141;160;203m and[0m [0;30;48;2;231;138;195m CAP[0m [0;30;48;2;166;216;84mITAL[0m [0;30;48;2;255;217;47mIZ[0m [0;30;48;2;102;194;165mATION[0m [0;30;48;2;252;141;98m
[0m [0;30;48;2;141;160;203mï¿½[0m [0;30;48;2;231;138;195mï¿½[0m [0;30;48;2;166;216;84mï¿½[0m [0;30;48;2;255;217;47m ï¿½[0m [0;30;48;2;102;194;165mï¿½[0m [0;30;48;2;252;141;98mï¿½[0m [0;30;48;2;141;160;203m
[0m [0;30;48;2;231;138;195mshow[0m [0;30;48;2;166;216;84m_[0m [0;30;48;2;255;217;47mt[0m [0;30;48;2;102;194;165mok[0m [0;30;48;2;252;141;98mens[0m [0;30;48;2;141;160;203m False[0m [0;30;48;2;231;138;195m None[0m [0;30;48;2;166;216;84m el[0m [0;30;48;2;255;217;47mif[0m [0;30;48;2;102;194;165m ==[0m [0;30;48;2;252;141;98m >=[0m [0;30;48;2;141;160;203m else[0m [0;30;48;2;231;138;195m:[0m [0;30;48;2;166;216;84m two[0m [0;30;48;2;255;217;47m tabs[0m [0;30;48;2;102;194;165m:"[0m [0;30;48;2;252;14

In [16]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
token_ids = tokenizer(text).input_ids
for token_id in token_ids:
    print(f">>> id: {token_id};\ttoken: {tokenizer.decode(token_id)}")

>>> id: 198;	token: 

>>> id: 15823;	token: English
>>> id: 290;	token:  and
>>> id: 20176;	token:  CAP
>>> id: 40579;	token: ITAL
>>> id: 14887;	token: IZ
>>> id: 6234;	token: ATION
>>> id: 198;	token: 

>>> id: 8582;	token: ï¿½
>>> id: 236;	token: ï¿½
>>> id: 113;	token: ï¿½
>>> id: 16268;	token:  ï¿½
>>> id: 116;	token: ï¿½
>>> id: 253;	token: ï¿½
>>> id: 198;	token: 

>>> id: 12860;	token: show
>>> id: 62;	token: _
>>> id: 83;	token: t
>>> id: 482;	token: ok
>>> id: 641;	token: ens
>>> id: 10352;	token:  False
>>> id: 6045;	token:  None
>>> id: 1288;	token:  el
>>> id: 361;	token: if
>>> id: 6624;	token:  ==
>>> id: 18189;	token:  >=
>>> id: 2073;	token:  else
>>> id: 25;	token: :
>>> id: 734;	token:  two
>>> id: 22524;	token:  tabs
>>> id: 11097;	token: :"
>>> id: 220;	token:  
>>> id: 220;	token:  
>>> id: 220;	token:  
>>> id: 220;	token:  
>>> id: 220;	token:  
>>> id: 366;	token:  "
>>> id: 7683;	token:  Three
>>> id: 22524;	token:  tabs
>>> id: 25;	token: :
>>> id: 366;	token

In [17]:
show_tokens("Papagaio\tVerde\t\tVerde\t\t\tAmarelo  Fim", "gpt2")

[0;30;48;2;102;194;165mP[0m [0;30;48;2;252;141;98map[0m [0;30;48;2;141;160;203maga[0m [0;30;48;2;231;138;195mio[0m [0;30;48;2;166;216;84m	[0m [0;30;48;2;255;217;47mVer[0m [0;30;48;2;102;194;165mde[0m [0;30;48;2;252;141;98m	[0m [0;30;48;2;141;160;203m	[0m [0;30;48;2;231;138;195mVer[0m [0;30;48;2;166;216;84mde[0m [0;30;48;2;255;217;47m	[0m [0;30;48;2;102;194;165m	[0m [0;30;48;2;252;141;98m	[0m [0;30;48;2;141;160;203mAm[0m [0;30;48;2;231;138;195mare[0m [0;30;48;2;166;216;84mlo[0m [0;30;48;2;255;217;47m [0m [0;30;48;2;102;194;165m F[0m [0;30;48;2;252;141;98mim[0m 

### Flan-T5 (2022)

In [18]:
show_tokens(text, "google/flan-t5-small")

[0;30;48;2;102;194;165mEnglish[0m [0;30;48;2;252;141;98mand[0m [0;30;48;2;141;160;203mCA[0m [0;30;48;2;231;138;195mPI[0m [0;30;48;2;166;216;84mTAL[0m [0;30;48;2;255;217;47mIZ[0m [0;30;48;2;102;194;165mATION[0m [0;30;48;2;252;141;98m[0m [0;30;48;2;141;160;203m<unk>[0m [0;30;48;2;231;138;195m[0m [0;30;48;2;166;216;84m<unk>[0m [0;30;48;2;255;217;47mshow[0m [0;30;48;2;102;194;165m_[0m [0;30;48;2;252;141;98mto[0m [0;30;48;2;141;160;203mken[0m [0;30;48;2;231;138;195ms[0m [0;30;48;2;166;216;84mFal[0m [0;30;48;2;255;217;47ms[0m [0;30;48;2;102;194;165me[0m [0;30;48;2;252;141;98mNone[0m [0;30;48;2;141;160;203m[0m [0;30;48;2;231;138;195me[0m [0;30;48;2;166;216;84ml[0m [0;30;48;2;255;217;47mif[0m [0;30;48;2;102;194;165m=[0m [0;30;48;2;252;141;98m=[0m [0;30;48;2;141;160;203m>[0m [0;30;48;2;231;138;195m=[0m [0;30;48;2;166;216;84melse[0m [0;30;48;2;255;217;47m:[0m [0;30;48;2;102;194;165mtwo[0m [0;30;48;2;252;141;98mtab[0m [0;30;48;2;141

### GPT-4 (2023)

In [19]:
# The official is `tiktoken` but this the same tokenizer on the HF platform
show_tokens(text, "Xenova/gpt-4")

[0;30;48;2;102;194;165m
[0m [0;30;48;2;252;141;98mEnglish[0m [0;30;48;2;141;160;203m and[0m [0;30;48;2;231;138;195m CAPITAL[0m [0;30;48;2;166;216;84mIZATION[0m [0;30;48;2;255;217;47m
[0m [0;30;48;2;102;194;165mï¿½[0m [0;30;48;2;252;141;98mï¿½[0m [0;30;48;2;141;160;203mï¿½[0m [0;30;48;2;231;138;195m ï¿½[0m [0;30;48;2;166;216;84mï¿½[0m [0;30;48;2;255;217;47mï¿½[0m [0;30;48;2;102;194;165m
[0m [0;30;48;2;252;141;98mshow[0m [0;30;48;2;141;160;203m_tokens[0m [0;30;48;2;231;138;195m False[0m [0;30;48;2;166;216;84m None[0m [0;30;48;2;255;217;47m elif[0m [0;30;48;2;102;194;165m ==[0m [0;30;48;2;252;141;98m >=[0m [0;30;48;2;141;160;203m else[0m [0;30;48;2;231;138;195m:[0m [0;30;48;2;166;216;84m two[0m [0;30;48;2;255;217;47m tabs[0m [0;30;48;2;102;194;165m:"[0m [0;30;48;2;252;141;98m     [0m [0;30;48;2;141;160;203m "[0m [0;30;48;2;231;138;195m Three[0m [0;30;48;2;166;216;84m tabs[0m [0;30;48;2;255;217;47m:[0m [0;30;48;2;102;194;165m "[

In [20]:
tokenizer = AutoTokenizer.from_pretrained("Xenova/gpt-4")
textT = "Alex Araujo"
for n_spaces in range(1, 8 + 1):
    text_ws = textT.replace(" ", " " * n_spaces)
    token_ids = tokenizer(text_ws).input_ids
    print(f">>> Using {n_spaces} spaces between words:")
    print(f"*\tText:\t'{text_ws}'")
    print(f"*\tIDs:\t{token_ids}")
    

>>> Using 1 spaces between words:
*	Text:	'Alex Araujo'
*	IDs:	[28487, 88469, 84, 7453]
>>> Using 2 spaces between words:
*	Text:	'Alex  Araujo'
*	IDs:	[28487, 220, 88469, 84, 7453]
>>> Using 3 spaces between words:
*	Text:	'Alex   Araujo'
*	IDs:	[28487, 256, 88469, 84, 7453]
>>> Using 4 spaces between words:
*	Text:	'Alex    Araujo'
*	IDs:	[28487, 262, 88469, 84, 7453]
>>> Using 5 spaces between words:
*	Text:	'Alex     Araujo'
*	IDs:	[28487, 257, 88469, 84, 7453]
>>> Using 6 spaces between words:
*	Text:	'Alex      Araujo'
*	IDs:	[28487, 415, 88469, 84, 7453]
>>> Using 7 spaces between words:
*	Text:	'Alex       Araujo'
*	IDs:	[28487, 996, 88469, 84, 7453]
>>> Using 8 spaces between words:
*	Text:	'Alex        Araujo'
*	IDs:	[28487, 286, 88469, 84, 7453]


### Galactica

In [21]:
# You need to request access before being able to use this tokenizer
show_tokens(text, "bigcode/starcoder2-15b")

[0;30;48;2;102;194;165m
[0m [0;30;48;2;252;141;98mEnglish[0m [0;30;48;2;141;160;203m and[0m [0;30;48;2;231;138;195m CAPITAL[0m [0;30;48;2;166;216;84mIZATION[0m [0;30;48;2;255;217;47m
[0m [0;30;48;2;102;194;165mï¿½[0m [0;30;48;2;252;141;98mï¿½[0m [0;30;48;2;141;160;203mï¿½[0m [0;30;48;2;231;138;195m [0m [0;30;48;2;166;216;84mï¿½[0m [0;30;48;2;255;217;47mï¿½[0m [0;30;48;2;102;194;165m
[0m [0;30;48;2;252;141;98mshow[0m [0;30;48;2;141;160;203m_[0m [0;30;48;2;231;138;195mtokens[0m [0;30;48;2;166;216;84m False[0m [0;30;48;2;255;217;47m None[0m [0;30;48;2;102;194;165m elif[0m [0;30;48;2;252;141;98m ==[0m [0;30;48;2;141;160;203m >=[0m [0;30;48;2;231;138;195m else[0m [0;30;48;2;166;216;84m:[0m [0;30;48;2;255;217;47m two[0m [0;30;48;2;102;194;165m tabs[0m [0;30;48;2;252;141;98m:"[0m [0;30;48;2;141;160;203m     [0m [0;30;48;2;231;138;195m "[0m [0;30;48;2;166;216;84m Three[0m [0;30;48;2;255;217;47m tabs[0m [0;30;48;2;102;194;165m:[0m [

In [22]:
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder2-15b")
textT = " Recurrent neural net works, long short-term memory."
token_ids = tokenizer(textT).input_ids
for token_id in token_ids:
    print(f">>> id: {token_id};\ttoken: {tokenizer.decode(token_id)}")

>>> id: 922;	token:  Re
>>> id: 1719;	token: current
>>> id: 25565;	token:  neural
>>> id: 3723;	token:  net
>>> id: 4864;	token:  works
>>> id: 49;	token: ,
>>> id: 1964;	token:  long
>>> id: 4514;	token:  short
>>> id: 50;	token: -
>>> id: 3908;	token: term
>>> id: 4137;	token:  memory
>>> id: 51;	token: .


### Phi-3 (and Llama 2)

In [23]:
show_tokens(text, "microsoft/Phi-3-mini-4k-instruct")

[0;30;48;2;102;194;165m[0m [0;30;48;2;252;141;98m
[0m [0;30;48;2;141;160;203mEnglish[0m [0;30;48;2;231;138;195mand[0m [0;30;48;2;166;216;84mC[0m [0;30;48;2;255;217;47mAP[0m [0;30;48;2;102;194;165mIT[0m [0;30;48;2;252;141;98mAL[0m [0;30;48;2;141;160;203mIZ[0m [0;30;48;2;231;138;195mATION[0m [0;30;48;2;166;216;84m
[0m [0;30;48;2;255;217;47mï¿½[0m [0;30;48;2;102;194;165mï¿½[0m [0;30;48;2;252;141;98mï¿½[0m [0;30;48;2;141;160;203mï¿½[0m [0;30;48;2;231;138;195m[0m [0;30;48;2;166;216;84mï¿½[0m [0;30;48;2;255;217;47mï¿½[0m [0;30;48;2;102;194;165mï¿½[0m [0;30;48;2;252;141;98m
[0m [0;30;48;2;141;160;203mshow[0m [0;30;48;2;231;138;195m_[0m [0;30;48;2;166;216;84mto[0m [0;30;48;2;255;217;47mkens[0m [0;30;48;2;102;194;165mFalse[0m [0;30;48;2;252;141;98mNone[0m [0;30;48;2;141;160;203melif[0m [0;30;48;2;231;138;195m==[0m [0;30;48;2;166;216;84m>=[0m [0;30;48;2;255;217;47melse[0m [0;30;48;2;102;194;165m:[0m [0;30;48;2;252;141;98mtwo[0m [0;

## Token Embeddings

### A Language Model Holds Embeddings for the Vocabulary of Its Tokenizer

### Creating Contextualized Word Embeddings with Language Models

In [24]:
# Load a tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")

# Load a language model
model = AutoModel.from_pretrained("microsoft/deberta-v3-xsmall")

# Tokenize the sentence
tokens: Dict[str, torch.Tensor] = tokenizer("Hello world", return_tensors="pt")

# Beautiful print for the dictionary
print(f"\n>>> Tokens:")
for k, v in tokens.items():
    print(f"* {k}: {v}")

# Process the tokens
output: torch.Tensor = model(**tokens)[0]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



>>> Tokens:
* input_ids: tensor([[    1, 31414,   232,     2]])
* token_type_ids: tensor([[0, 0, 0, 0]])
* attention_mask: tensor([[1, 1, 1, 1]])


In [25]:
# Show output
print(f">>> Output shape: {output.shape}")

>>> Output shape: torch.Size([1, 4, 384])


In [26]:
for token in tokens["input_ids"][0]: 
    print(tokenizer.decode(token))

[CLS]
Hello
 world
[SEP]


## Text Embeddings (for Sentences and Whole Documents)

In [27]:
# Load model
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2") 

# Convert text to text embeddings
vector: np.array = model.encode("Best movie ever!")

print(f">>> Text embedding vector shape: {vector.shape}")

>>> Text embedding vector shape: (768,)


## Word Embeddings Beyond LLMs

### Using pretrained Word Embeddings

In [28]:
# Download embeddings (66MB, glove, trained on wikipedia, vector size: 50) # Other options include "word2vec-google-news-300"
# More options at https://github.com/RaRe-Technologies/gensim-data
model = api.load("glove-wiki-gigaword-50")

In [29]:
word: str = "economic"
model.most_similar([model[word]], topn=11)

[('economic', 1.0),
 ('economy', 0.8861514329910278),
 ('global', 0.8344723582267761),
 ('crisis', 0.8282359838485718),
 ('stability', 0.8191476464271545),
 ('policy', 0.8185113072395325),
 ('impact', 0.8170558214187622),
 ('strengthening', 0.8146171569824219),
 ('growth', 0.8080626130104065),
 ('economies', 0.8065662980079651),
 ('financial', 0.8058508038520813)]

### The Word2vec Algorithm and Contrastive Training

## Embeddings for Recommendation Systems

### Recommending Songs by Embeddings

### Training a Song Embedding Model

In [30]:
# Get the playlist dataset file
data = request.urlopen('https://storage.googleapis.com/maps-premium/dataset/yes_complete/train.txt')

# Parse the playlist dataset file. Skip the first two lines as
# they only contain metadata
lines = data.read().decode("utf-8").split('\n')[2:]

# Remove playlists with only one song
playlists = [s.rstrip().split() for s in lines if len(s.split()) > 1]

# Load song metadata
songs_file = request.urlopen('https://storage.googleapis.com/maps-premium/dataset/yes_complete/song_hash.txt')
songs_file = songs_file.read().decode("utf-8").split('\n')
songs = [s.rstrip().split('\t') for s in songs_file]
songs_df = pd.DataFrame(data=songs, columns = ['id', 'title', 'artist'])
songs_df = songs_df.set_index('id')

# Dataset shape.
print(f">>> Dataset shape: {songs_df.shape}")

# Show the first few rows of the songs dataset
songs_df.head()

>>> Dataset shape: (75263, 2)


Unnamed: 0_level_0,title,artist
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Gucci Time (w\/ Swizz Beatz),Gucci Mane
1,Aston Martin Music (w\/ Drake & Chrisette Mich...,Rick Ross
2,Get Back Up (w\/ Chris Brown),T.I.
3,Hot Toddy (w\/ Jay-Z & Ester Dean),Usher
4,Whip My Hair,Willow


In [31]:
# Visualize some playlists
print(f'Playlist #1:\t{playlists[0]}')
print(f'Playlist #2:\t{playlists[1]}')

Playlist #1:	['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '2', '42', '43', '44', '45', '46', '47', '48', '20', '49', '8', '50', '51', '52', '53', '54', '55', '56', '57', '25', '58', '59', '60', '61', '62', '3', '63', '64', '65', '66', '46', '47', '67', '2', '48', '68', '69', '70', '57', '50', '71', '72', '53', '73', '25', '74', '59', '20', '46', '75', '76', '77', '59', '20', '43']
Playlist #2:	['78', '79', '80', '3', '62', '81', '14', '82', '48', '83', '84', '17', '85', '86', '87', '88', '74', '89', '90', '91', '4', '73', '62', '92', '17', '53', '59', '93', '94', '51', '50', '27', '95', '48', '96', '97', '98', '99', '100', '57', '101', '102', '25', '103', '3', '104', '105', '106', '107', '47', '108', '109', '110', '111', '112', '113', '25', '63', '62', '114', '115', '84', '116', '117', '118'

In [32]:
%%time

# Train our Word2Vec model
model = Word2Vec(
    sentences=playlists, 
    vector_size=32, 
    window=20, 
    negative=50, 
    min_count=1, 
    workers=4
)

CPU times: user 1min 16s, sys: 29.1 ms, total: 1min 16s
Wall time: 20.2 s


In [33]:
song_id: int = 2172

# Ask the model for songs similar to `song_id`
model.wv.most_similar(positive=str(song_id))

[('3167', 0.9992006421089172),
 ('2976', 0.9974461793899536),
 ('3116', 0.9966298341751099),
 ('3094', 0.9966042041778564),
 ('2640', 0.9965679049491882),
 ('5586', 0.9964846968650818),
 ('6685', 0.9964772462844849),
 ('2849', 0.9963294863700867),
 ('5549', 0.9955024123191833),
 ('6658', 0.9949690103530884)]

In [34]:
songs_df.iloc[[2172]]

Unnamed: 0_level_0,title,artist
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2172,Fade To Black,Metallica


In [35]:
def print_recommendations(song_id, topn=12):
    
    similar_songs = np.array(model.wv.most_similar(positive=str(song_id),topn=topn))[:,0]
    scores = np.array(model.wv.most_similar(positive=str(song_id),topn=topn))[:,1]

    output = songs_df.iloc[similar_songs].copy()
    output[f"similarity_score_to_{song_id}"] = scores

    return output

In [36]:
# Extract recommendations
print_recommendations(2172)

Unnamed: 0_level_0,title,artist,similarity_score_to_2172
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3167,Unchained,Van Halen,0.9992006421089172
2976,I Don't Know,Ozzy Osbourne,0.9974461793899536
3116,Communication Breakdown,Led Zeppelin,0.99662983417511
3094,Breaking The Law,Judas Priest,0.9966042041778564
2640,Red Barchetta,Rush,0.9965679049491882
5586,The Last In Line,Dio,0.9964846968650818
6685,The Trooper,Iron Maiden,0.9964772462844848
2849,Run To The Hills,Iron Maiden,0.9963294863700868
5549,November Rain,Guns N' Roses,0.9955024123191832
6658,(Bang Your Head) Metal Health,Quiet Riot,0.9949690103530884
