# Lesson 2: Embeddings

### Setup
Load the needed API keys and relevant Python libraries.

In [None]:
import os

# Load variables from the local .env file before anything reads the environment.
from dotenv import find_dotenv, load_dotenv
_ = load_dotenv(find_dotenv())

import cohere
import numpy as np
import pandas as pd

# The API key is taken from the environment; a missing key raises KeyError here.
co = cohere.Client(os.environ['COHERE_API_KEY'])

from utils import umap_plot, umap_plot_big

## Word Embeddings

Consider a very small dataset of three words.

In [None]:
# A tiny three-word dataset held in a single 'text' column.
word_list = ['joy', 'happiness', 'potato']
three_words = pd.DataFrame({'text': word_list})

three_words

Let's create the embeddings for the three words:

In [None]:
# Send every entry of the 'text' column to the embed endpoint in one call
# and keep only the returned embeddings.
word_texts = list(three_words['text'])
embed_response = co.embed(texts=word_texts, model='embed-english-v2.0')
three_words_emb = embed_response.embeddings

In [None]:
# Give each of the first three embeddings its own name.
word_1, word_2, word_3 = (three_words_emb[i] for i in range(3))

In [None]:
# Peek at the first 10 values of the first word's embedding.
word_1[:10]

## Sentence Embeddings

Consider a very small dataset of eight sentences: four questions, each followed by its answer.

In [None]:
# Eight short sentences in a single 'text' column: four questions, each
# immediately followed by its answer.
sentences = pd.DataFrame({'text':
  [
   'Where is the world cup?',
   'The world cup is in Qatar',
   'What color is the sky?',
   'The sky is blue',
   'Where does the bear live?',
   'The bear lives in the woods',  # fixed duplicated word ("the the")
   'What is an apple?',
   'An apple is a fruit',
  ]})

sentences

Let's create the embeddings for these sentences:

In [None]:
# Embed every sentence in the 'text' column with one call and keep only the
# returned embeddings.
emb = co.embed(texts=list(sentences['text']),
               model='embed-english-v2.0').embeddings

# Print the first 3 entries of each sentence's embedding (one line per sentence):
for e in emb:
    print(e[:3])

In [None]:
# Length of the first embedding, i.e. how many values it holds.
len(emb[0])

In [None]:
#import umap
#import altair as alt

In [None]:
# Build a chart from the sentences and their embeddings using the umap_plot
# helper imported from utils.
chart = umap_plot(sentences, emb)

In [None]:
# Display the chart as the cell output. Presumably interactive() enables
# pan/zoom on the returned chart object — confirm against utils.umap_plot.
chart.interactive()

## Articles Embeddings

In [None]:
# Load the articles DataFrame from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code — only load
# 'wikipedia.pkl' if it comes from a trusted source.
wiki_articles = pd.read_pickle('wikipedia.pkl')
wiki_articles

In [None]:
# Collect the stored 'emb' values into one NumPy array and select the
# title and text columns that are passed to the plotting helper.
embeds = np.array(list(wiki_articles['emb']))
articles = wiki_articles[['title', 'text']]

chart = umap_plot_big(articles, embeds)
chart.interactive()