In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')
x = [["cat", 1],["bat", 1], ["dog", 1]]
enc_df = enc.fit_transform(x).toarray()
print(enc_df)

In [20]:
from sklearn.preprocessing import OneHotEncoder
import itertools

# two example documents
docs = ["cat","dog","bat","ate"]

# split documents to tokens
tokens_docs = [doc.split(" ") for doc in docs]

# convert list of of token-lists to one flat list of tokens
# and then create a dictionary that maps word to id of word,
all_tokens = itertools.chain.from_iterable(tokens_docs)
word_to_id = {token: idx for idx, token in enumerate(set(all_tokens))}

# convert token lists to token-id lists, e.g. [[1, 2], [2, 2]] here
token_ids = [[word_to_id[token] for token in tokens_doc] for tokens_doc in tokens_docs]

# convert list of token-id lists to one-hot representation
vec = OneHotEncoder(categories="auto")
X = vec.fit_transform(token_ids)

print(X.toarray())

[[0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]]


In [21]:
from sklearn.feature_extraction.text import CountVectorizer

text = ["i love nlp. nlp is so cool"]

vectorizer = CountVectorizer()

# tokenize and build vocab
vectorizer.fit(text)
print(vectorizer.vocabulary_)

# encode document
vector = vectorizer.transform(text)

# summarize encoded vector
print(vector.shape)
print(vector.toarray())

{'love': 2, 'nlp': 3, 'is': 1, 'so': 4, 'cool': 0}
(1, 5)
[[1 1 1 2 1]]


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
text1 = ['i love nlp', 'nlp is so cool', 'nlp is all about helping machines process language', "this tutorial is on baisc nlp technique"]
tf = TfidfVectorizer()
txt_fitted = tf.fit(text1)
txt_transformed = txt_fitted.transform(text1)
print ("The text: ", text1)

The text:  ['i love nlp', 'nlp is so cool', 'nlp is all about helping machines process language', 'this tutorial is on baisc nlp technique']


In [6]:
idf = tf.idf_
print(dict(zip(txt_fitted.get_feature_names(), idf)))

{'about': 1.916290731874155, 'all': 1.916290731874155, 'baisc': 1.916290731874155, 'cool': 1.916290731874155, 'helping': 1.916290731874155, 'is': 1.2231435513142097, 'language': 1.916290731874155, 'love': 1.916290731874155, 'machines': 1.916290731874155, 'nlp': 1.0, 'on': 1.916290731874155, 'process': 1.916290731874155, 'so': 1.916290731874155, 'technique': 1.916290731874155, 'this': 1.916290731874155, 'tutorial': 1.916290731874155}
