In [1]:
# Sample sentence that the following cells tokenize and vectorize.
sentence = "Thomas Jefferson began building Monticello at the age of 26."
# Naive whitespace tokenization; note the trailing period stays glued to '26.'.
str.split(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [9]:
import numpy as np

# Whitespace-tokenize the sentence and build the vocabulary: the unique
# tokens in ASCII sort order (digits, then uppercase, then lowercase).
token_sequence = str.split(sentence)
vocab = sorted(set(token_sequence))

# The one-hot matrix has one row per token *occurrence* in the sentence and
# one column per vocabulary entry. Both sizes are derived explicitly here so
# the cell runs on a fresh kernel and stays correct when a token repeats.
num_tokens = len(token_sequence)
vocab_size = len(vocab)
onehot_vectors = np.zeros((num_tokens, vocab_size), int)

# Look up each word's column once instead of scanning `vocab` per token.
column_of = {word: col for col, word in enumerate(vocab)}
for i, word in enumerate(token_sequence):
    onehot_vectors[i, column_of[word]] = 1

onehot_vectors

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [10]:
import pandas as pd

# Label each one-hot column with its vocabulary word for readability.
pd.DataFrame(data=onehot_vectors, columns=vocab)

Unnamed: 0,26.,Jefferson,Monticello,Thomas,age,at,began,building,of,the
0,0,0,0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0
6,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,1,0
9,1,0,0,0,0,0,0,0,0,0


In [12]:
# Binary bag of words: every distinct whitespace token maps to 1.
sentence_bow = dict.fromkeys(sentence.split(), 1)

# Show the (token, 1) pairs in sorted token order.
sorted(sentence_bow.items())

[('26.', 1),
 ('Jefferson', 1),
 ('Monticello', 1),
 ('Thomas', 1),
 ('age', 1),
 ('at', 1),
 ('began', 1),
 ('building', 1),
 ('of', 1),
 ('the', 1)]

In [16]:
# Same binary bag of words as a pandas Series, then as a one-row frame
# labelled 'sent' (tokens become columns after the transpose).
# Note: `df` is rebound to the multi-sentence frame further down this cell.
series = pd.Series({token: 1 for token in sentence.split()})
df = pd.DataFrame({'sent': series}).transpose()

# Four-sentence toy corpus, one sentence per line.
sentences = "Thomas Jefferson began building Monticello at the age of 26.\n"
sentences += "Construction was done mostly by local masons and carpenters.\n"
# Year corrected from 1970 to 1770 (Jefferson's lifetime; the original was a typo).
sentences += "He moved into the South Pavilion in 1770.\n"
sentences += "Turning Monticello into a neoclassical masterpiece was Jefferson's obsession."

# Map 'sent<i>' -> binary bag of words ({token: 1}) for each line.
corpus = {}
for i, sent in enumerate(sentences.split("\n")):
    corpus['sent{}'.format(i)] = dict((tok, 1) for tok in sent.split())

# One row per sentence, one column per token. Building from a list of
# records with an explicit index is the documented constructor form and
# yields the sentence-per-row layout directly (no transpose needed).
# Tokens absent from a sentence come out as NaN, hence fillna(0) -> int.
df = (
    pd.DataFrame(list(corpus.values()), index=list(corpus))
    .fillna(0)
    .astype(int)
)
# Preview only the first ten token columns; the full frame is wide.
df[df.columns[:10]]

{'sent0': {'Thomas': 1, 'Jefferson': 1, 'began': 1, 'building': 1, 'Monticello': 1, 'at': 1, 'the': 1, 'age': 1, 'of': 1, '26.': 1}, 'sent1': {'Construction': 1, 'was': 1, 'done': 1, 'mostly': 1, 'by': 1, 'local': 1, 'masons': 1, 'and': 1, 'carpenters.': 1}, 'sent2': {'He': 1, 'moved': 1, 'into': 1, 'the': 1, 'South': 1, 'Pavilion': 1, 'in': 1, '1970.': 1}, 'sent3': {'Turning': 1, 'Monticello': 1, 'into': 1, 'a': 1, 'neoclassical': 1, 'masterpiece': 1, 'was': 1, "Jefferson's": 1, 'obsession.': 1}}


Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,26.
sent0,1,1,1,1,1,1,1,1,1,1
sent1,0,0,0,0,0,0,0,0,0,0
sent2,0,0,0,0,0,0,1,0,0,0
sent3,0,0,0,0,1,0,0,0,0,0


In [53]:
v1 = np.array([1, 2, 3])
v2 = np.array([2, 3, 4])

# Several equivalent spellings of the dot product; each evaluates to 20
# (1*2 + 2*3 + 3*4).
np.dot(v1, v2)
np.sum(v1 * v2)
sum(a * b for a, b in zip(v1, v2))

# Matrix-multiplication forms; the last expression is what the cell displays.
v1 @ v2
np.matmul(v1, v2)

20

In [52]:
# `df` has one row per sentence (index 'sent0'..'sent3') and one column per
# token, so each sentence vector is selected by row label with .loc.
# Attribute access such as `df.sent0` only looks up columns and would fail
# on this layout. The dot product of two binary bag-of-words vectors counts
# the tokens the two sentences share.
df.loc['sent0'].dot(df.loc['sent1']) # 0
df.loc['sent0'].dot(df.loc['sent2']) # 1
df.loc['sent0'].dot(df.loc['sent3']) # 1

1

In [55]:
from nltk.tokenize import TreebankWordTokenizer

# Unlike str.split(), the Treebank tokenizer separates the contraction
# ("was" + "n't") and the sentence-final period into their own tokens.
tokenizer = TreebankWordTokenizer()
sentence = "Monticello wasn't designated as UNESCO World Heritage Site until 1987."
tokenizer.tokenize(sentence)

['Monticello',
 'was',
 "n't",
 'designated',
 'as',
 'UNESCO',
 'World',
 'Heritage',
 'Site',
 'until',
 '1987',
 '.']

In [57]:
from nltk.util import ngrams
# Slide a window of width 2 over the whitespace tokens to list the bigrams.
sentence = "Thomas Jefferson began building Monticello at the age of 26."
tokens = str.split(sentence)
list(ngrams(tokens, n=2))

[('Thomas', 'Jefferson'),
 ('Jefferson', 'began'),
 ('began', 'building'),
 ('building', 'Monticello'),
 ('Monticello', 'at'),
 ('at', 'the'),
 ('the', 'age'),
 ('age', 'of'),
 ('of', '26.')]

In [59]:
import nltk
# Fetch the NLTK stop-word corpus; this must run before the corpus is read
# below (the downloader reports when the package is already up to date).
nltk.download("stopwords")
# Load the English stop-word list.
stop_words = nltk.corpus.stopwords.words("english")

# Peek at the first five entries only.
print(stop_words[:5])

['i', 'me', 'my', 'myself', 'we']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hojae\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
