# 1 One-Hot Encoding

In [3]:
import numpy as np
import pandas as pd

In [49]:
# Toy corpus: four short sentences, each paired with a 1-based id
sentences = [
    "I love machine learning",
    "Machine learning is amazing",
    "Natural language processing is part of machine learning",
    "I love learning new things",
]
data = {'id': list(range(1, len(sentences) + 1)), 'text': sentences}

# Wrap the corpus in a DataFrame and show it
df = pd.DataFrame(data)
print(df)


   id                                               text
0   1                            I love machine learning
1   2                        Machine learning is amazing
2   3  Natural language processing is part of machine...
3   4                         I love learning new things


In [51]:
from sklearn.feature_extraction.text import CountVectorizer

# One-hot (presence/absence) encoding: binary=True sets every non-zero count to 1
vectorizer = CountVectorizer(binary=True)
X_onehot = vectorizer.fit_transform(df['text'])

# Label the columns with the learned vocabulary and print the dense matrix
onehot_columns = vectorizer.get_feature_names_out()
onehot_df = pd.DataFrame(data=X_onehot.toarray(), columns=onehot_columns)
print(onehot_df)


   amazing  is  language  learning  love  machine  natural  new  of  part  \
0        0   0         0         1     1        1        0    0   0     0   
1        1   1         0         1     0        1        0    0   0     0   
2        0   1         1         1     0        1        1    0   1     1   
3        0   0         0         1     1        0        0    1   0     0   

   processing  things  
0           0       0  
1           0       0  
2           1       0  
3           0       1  


# 2 Bag of Words

In [54]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of Words: a default CountVectorizer keeps the raw per-document term counts
vectorizer = CountVectorizer()

# Learn the vocabulary from the corpus and encode the corpus in one call
X = vectorizer.fit_transform(df['text'])

# Dense view with one column per vocabulary term
bow_columns = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(data=X.toarray(), columns=bow_columns)
print(bow_df)

   amazing  is  language  learning  love  machine  natural  new  of  part  \
0        0   0         0         1     1        1        0    0   0     0   
1        1   1         0         1     0        1        0    0   0     0   
2        0   1         1         1     0        1        1    0   1     1   
3        0   0         0         1     1        0        0    1   0     0   

   processing  things  
0           0       0  
1           0       0  
2           1       0  
3           0       1  


In [9]:
# Encode an unseen sentence with the already-fitted bag-of-words vectorizer
new_sentence = "I love machine learning deep learning"
vectorizer.transform([new_sentence]).toarray()

array([[0, 0, 0, 2, 1, 1, 0, 0, 0, 0, 0, 0]], dtype=int64)

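Note: Tokens that are not in the fitted vocabulary are ignored at transform time. "deep" does not occur in the corpus the vectorizer was fitted on, so it gets no column and contributes no count. "I" is missing as well, because CountVectorizer's default tokenizer only keeps tokens of two or more characters.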

In [14]:
# Binary variant fitted on the original DataFrame text (fit returns the vectorizer itself)
vectorizer1 = CountVectorizer(binary=True).fit(df['text'])

# Encode the unseen sentence; a repeated word is reported as 1 rather than its count
new_sentence = "I love machine learning deep learning"
vectorizer1.transform([new_sentence]).toarray()


array([[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0]], dtype=int64)

# N-grams

### bi-grams

In [26]:
# Bigrams only: ngram_range=(2, 2) keeps pairs of adjacent tokens.
# Counts are raw here (binary is left at its default of False).
vectorizer = CountVectorizer(ngram_range=(2, 2))

# Fit on original text data
vectorizer.fit(df['text'])

# Transform a new sentence
X_test = vectorizer.transform(["I love machine learning deep learning"]).toarray()

# vocabulary_ maps each learned bigram to its column index in X_test.
# The default tokenizer drops single-character tokens, so "I" is discarded
# and no 'i love' bigram is learned.
print(vectorizer.vocabulary_)

# Only 'love machine' and 'machine learning' are counted; bigrams that contain
# the unseen word "deep" are not in the vocabulary and are ignored.
print(X_test)

{'love machine': 6, 'machine learning': 7, 'learning is': 3, 'is amazing': 0, 'natural language': 8, 'language processing': 2, 'processing is': 12, 'is part': 1, 'part of': 11, 'of machine': 10, 'love learning': 5, 'learning new': 4, 'new things': 9}
[[0 0 0 0 0 0 1 1 0 0 0 0 0]]


In [28]:
# Unigrams and bigrams together: ngram_range=(1, 2); fit returns the vectorizer itself
vectorizer = CountVectorizer(ngram_range=(1, 2)).fit(df['text'])

# Encode the unseen sentence against the combined vocabulary
new_sentence = "I love machine learning deep learning"
X_test = vectorizer.transform([new_sentence]).toarray()

print(vectorizer.vocabulary_)
print(X_test)

{'love': 9, 'machine': 12, 'learning': 6, 'love machine': 11, 'machine learning': 13, 'is': 1, 'amazing': 0, 'learning is': 7, 'is amazing': 2, 'natural': 14, 'language': 4, 'processing': 22, 'part': 20, 'of': 18, 'natural language': 15, 'language processing': 5, 'processing is': 23, 'is part': 3, 'part of': 21, 'of machine': 19, 'new': 16, 'things': 24, 'love learning': 10, 'learning new': 8, 'new things': 17}
[[0 0 0 0 0 0 2 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0]]


In [30]:
# Trigrams only (3-grams): ngram_range=(3, 3) keeps runs of three adjacent tokens
vectorizer = CountVectorizer(ngram_range=(3,3))

# Fit on original text data
vectorizer.fit(df['text'])

# Transform a new sentence
X_test = vectorizer.transform(["I love machine learning deep learning"]).toarray()

# vocabulary_ maps each learned trigram to its column index in X_test
print(vectorizer.vocabulary_)
print(X_test)

{'love machine learning': 5, 'machine learning is': 6, 'learning is amazing': 2, 'natural language processing': 7, 'language processing is': 1, 'processing is part': 10, 'is part of': 0, 'part of machine': 9, 'of machine learning': 8, 'love learning new': 4, 'learning new things': 3}
[[0 0 0 0 0 1 0 0 0 0 0]]


In [32]:
# ngram_range=(1, 3) covers every n from 1 to 3: unigrams, bigrams and trigrams
vectorizer = CountVectorizer(ngram_range=(1, 3))

# Fit on original text data
vectorizer.fit(df['text'])

# Transform a new sentence
X_test = vectorizer.transform(["I love machine learning deep learning"]).toarray()

# vocabulary_ maps each learned n-gram to its column index in X_test
print(vectorizer.vocabulary_)
print(X_test)

{'love': 13, 'machine': 18, 'learning': 8, 'love machine': 16, 'machine learning': 19, 'love machine learning': 17, 'is': 1, 'amazing': 0, 'learning is': 9, 'is amazing': 2, 'machine learning is': 20, 'learning is amazing': 10, 'natural': 21, 'language': 5, 'processing': 32, 'part': 29, 'of': 26, 'natural language': 22, 'language processing': 6, 'processing is': 33, 'is part': 3, 'part of': 30, 'of machine': 27, 'natural language processing': 23, 'language processing is': 7, 'processing is part': 34, 'is part of': 4, 'part of machine': 31, 'of machine learning': 28, 'new': 24, 'things': 35, 'love learning': 14, 'learning new': 11, 'new things': 25, 'love learning new': 15, 'learning new things': 12}
[[0 0 0 0 0 0 0 0 2 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


# TF-IDF

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF vectorizer with default settings
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the corpus, then encode it
X_tfidf = vectorizer.fit_transform(df['text'])

# Print the learned IDF weights, followed by the vocabulary terms
idf_weights = vectorizer.idf_
terms = vectorizer.get_feature_names_out()
print(idf_weights)
print(terms)

[1.91629073 1.51082562 1.91629073 1.         1.51082562 1.22314355
 1.91629073 1.91629073 1.91629073 1.91629073 1.91629073 1.91629073]
['amazing' 'is' 'language' 'learning' 'love' 'machine' 'natural' 'new'
 'of' 'part' 'processing' 'things']


In [45]:
# Tabular view of the TF-IDF matrix, with columns labelled by vocabulary term
tfidf_columns = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=X_tfidf.toarray(), columns=tfidf_columns)

print(tfidf_df)

    amazing        is  language  learning      love   machine   natural  \
0  0.000000  0.000000  0.000000  0.457453  0.691131  0.559530  0.000000   
1  0.659191  0.519714  0.000000  0.343993  0.000000  0.420753  0.000000   
2  0.000000  0.314078  0.398368  0.207885  0.000000  0.254273  0.398368   
3  0.000000  0.000000  0.000000  0.306758  0.463458  0.000000  0.000000   

        new        of      part  processing    things  
0  0.000000  0.000000  0.000000    0.000000  0.000000  
1  0.000000  0.000000  0.000000    0.000000  0.000000  
2  0.000000  0.398368  0.398368    0.398368  0.000000  
3  0.587838  0.000000  0.000000    0.000000  0.587838  
