## 1. N-grams

#### 1.1. Bag of words (BOW) -> Uni-gram

In [None]:
import pandas as pd
#importing the pandas

In [3]:
df1=pd.DataFrame({'text':['people watch campusx','campusx watch campusx','people write comment','campusx write comment'],'output':[1,1,0,0]})

In [9]:
df1

Unnamed: 0,text,output
0,people watch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,0


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()

In [20]:
bow=cv.fit_transform(df1['text'])

In [None]:
#vocab in alphabatetical order
print(cv.vocabulary_)

{'people': 2, 'watch': 3, 'campusx': 0, 'write': 4, 'comment': 1}


In [22]:
print(bow[0].toarray())
print(bow[1].toarray())

[[1 0 1 1 0]]
[[2 0 0 1 0]]


In [25]:
cv.transform(['campusx watch and write comment of campusx']).toarray()
# oov(out of vocabulary) words are handled

array([[2, 1, 0, 1, 1]])

#### 1.2. Bag of n-grams

In [None]:
# Bi-gram
cv2=CountVectorizer(ngram_range=(2,2))
bow2=cv2.fit_transform(df1['text'])

In [28]:
print(cv2.vocabulary_)

{'people watch': 2, 'watch campusx': 4, 'campusx watch': 0, 'people write': 3, 'write comment': 5, 'campusx write': 1}


In [30]:
print(bow2[0].toarray())
print(bow2[1].toarray())
cv2.transform(['campusx watch and write comment of campusx']).toarray()

[[0 0 1 0 1 0]]
[[1 0 0 0 1 0]]


array([[1, 0, 0, 0, 0, 1]])

In [31]:
# Bi-gram + tri-gram +uni-gram
cv3=CountVectorizer(ngram_range=(1,3))
bow3=cv3.fit_transform(df1['text'])

In [33]:
print(cv3.vocabulary_)
print(len(cv3.vocabulary_))

{'people': 6, 'watch': 11, 'campusx': 0, 'people watch': 7, 'watch campusx': 12, 'people watch campusx': 8, 'campusx watch': 1, 'campusx watch campusx': 2, 'write': 13, 'comment': 5, 'people write': 9, 'write comment': 14, 'people write comment': 10, 'campusx write': 3, 'campusx write comment': 4}
15


In [34]:
print(bow3[0].toarray())
print(bow3[1].toarray())

[[1 0 0 0 0 0 1 1 1 0 0 1 1 0 0]]
[[2 1 1 0 0 0 0 0 0 0 0 1 1 0 0]]


## 2. Tf-Idf (Term frequency - Inverse document frequency)

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
tfidf.fit_transform(df1['text']).toarray()

array([[0.49681612, 0.        , 0.61366674, 0.61366674, 0.        ],
       [0.8508161 , 0.        , 0.        , 0.52546357, 0.        ],
       [0.        , 0.57735027, 0.57735027, 0.        , 0.57735027],
       [0.49681612, 0.61366674, 0.        , 0.        , 0.61366674]])

In [37]:
print(tfidf.idf_)
print(tfidf.get_feature_names_out())

[1.22314355 1.51082562 1.51082562 1.51082562 1.51082562]
['campusx' 'comment' 'people' 'watch' 'write']


## 3. Ohe (One hot encoding)

#### Without using sklearn

In [None]:
vocab=[]
for i in range(len(df1['text'])):
    for word in df1['text'][i].split():
        if word in vocab:
            continue
        else:
            vocab.append(word)
print (vocab)

['people', 'watch', 'campusx', 'write', 'comment']


In [44]:
df_encoded = pd.get_dummies(vocab, dtype=int)
print(df_encoded)

   campusx  comment  people  watch  write
0        0        0       1      0      0
1        0        0       0      1      0
2        1        0       0      0      0
3        0        0       0      0      1
4        0        1       0      0      0


In [50]:
print(df_encoded['campusx'].to_numpy())

[0 0 1 0 0]


In [51]:
arr=[]
def ohe (text):
    for word in text.split():
       arr.append(df_encoded[word].to_numpy()) 
    return arr

In [54]:
df1['text'].apply(ohe)

0    [[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 0...
1    [[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 0...
2    [[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 0...
3    [[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 0...
Name: text, dtype: object

#### Using sklearn

In [61]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Sample sentences
sentences = ["This is the first sentence.", "This is the second sentence."]

# Tokenize and create vocabulary
words = " ".join(sentences).split()
unique_words = list(set(words))

# Create word-to-index mapping
word_to_index = {word: index for index, word in enumerate(unique_words)}

# Convert sentences to numerical representations
numerical_sentences = [[word_to_index[word] for word in sentence.split()] for sentence in sentences]

# One-hot encode the sentences
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_sentences = ohe.fit_transform(np.array(numerical_sentences).reshape(-1, 1))

# Print the encoded sentences
print(encoded_sentences)

# Get feature names
feature_names = ohe.get_feature_names_out(['words'])
print(feature_names)

[[0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]
['words_0' 'words_1' 'words_2' 'words_3' 'words_4' 'words_5']
