In [0]:
corpus = [
     'My cat is white',
     'I am the major of this city',
     'I love eating toasted cheese',
     'The lazy cat is sleeping',
]


In [3]:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
corpus_tokens = []
for c in corpus:
    doc = nlp(c)
    tokens = []
    for t in doc:
        tokens.append(t.text)
    corpus_tokens.append(tokens)
corpus_tokens


[['My', 'cat', 'is', 'white'],
 ['I', 'am', 'the', 'major', 'of', 'this', 'city'],
 ['I', 'love', 'eating', 'toasted', 'cheese'],
 ['The', 'lazy', 'cat', 'is', 'sleeping']]

In [4]:
processed_corpus = [t for sentence in corpus_tokens for t in sentence]
processed_corpus = set(processed_corpus)
processed_corpus


{'I',
 'My',
 'The',
 'am',
 'cat',
 'cheese',
 'city',
 'eating',
 'is',
 'lazy',
 'love',
 'major',
 'of',
 'sleeping',
 'the',
 'this',
 'toasted',
 'white'}

In [6]:
import numpy as np
sentence = 'My cat is lazy'
tokenized_sentence = sentence.split()
encoded_sentence = np.zeros([len(tokenized_sentence),len(processed_corpus)])
encoded_sentence



array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]])

In [9]:
word2int = dict([(tok, pos) for pos, tok in enumerate(processed_corpus)])
for i,c in enumerate(sentence.split()):
    encoded_sentence[i][ word2int[c] ] = 1
encoded_sentence

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]])

In [10]:
print("Shape of the encoded sentence:", encoded_sentence.shape)

Shape of the encoded sentence: (4, 18)


In [13]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

le = LabelEncoder()

labeled_corpus = le.fit_transform(list(processed_corpus))
labeled_corpus


array([10,  9,  5, 13,  0, 14,  6,  7, 16, 15, 11,  2, 12,  1,  8,  3, 17,
        4])

In [14]:
sentence = 'My cat is lazy'
tokenized_sentence = sentence.split()
integer_encoded = le.transform(tokenized_sentence)
integer_encoded


array([1, 4, 8, 9])

In [15]:
le.inverse_transform(integer_encoded)

array(['My', 'cat', 'is', 'lazy'], dtype='<U8')

In [0]:
onehot_encoder = OneHotEncoder(sparse=False)

In [17]:
labeled_corpus = labeled_corpus.reshape(len(labeled_corpus), 1)
onehot_encoded = onehot_encoder.fit(labeled_corpus)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [18]:
sentence_encoded = onehot_encoded.transform(integer_encoded.reshape(len(integer_encoded), 1))
print(sentence_encoded)


[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]
