Assignment:

Write a function for each of the following text encodings:

1. Index-based encoding

2. One-hot encoding

3. Bag-of-words encoding

4. TF-IDF encoding

5. Reduced Co-occurrence matrix (with window-size and reduced k-dimension parameters)

Example: input_text = ["all that glitters is not gold",
                       "one for all and all for one"]

In [None]:
def simple_tokenize(text):
    """Split ``text`` into tokens on runs of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the whitespace-delimited substrings of ``text``, in their
        original order. An empty or all-whitespace string yields ``[]``.
    """
    return text.split()
# Demo: tokenize the two sample sentences from the assignment.
text1, text2 = "all that glitters is not gold", "one for all and all for one"
tokens1, tokens2 = (simple_tokenize(sample) for sample in (text1, text2))
# Show each token list next to its label.
print("Tokens for text 1:", tokens1)
print("Tokens for text 2:", tokens2)

Tokens for text 1: ['all', 'that', 'glitters', 'is', 'not', 'gold']
Tokens for text 2: ['one', 'for', 'all', 'and', 'all', 'for', 'one']


### 1. Index-based encoding

In [None]:
def simple_tokenize(text):
    """Split ``text`` into tokens on runs of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the whitespace-delimited substrings of ``text``, in their
        original order. An empty or all-whitespace string yields ``[]``.
    """
    return text.split()
def index_encoding(input_text):
    """Encode tokenized texts as lists of integer vocabulary indices.

    Each distinct word receives the next free index the first time it is seen
    while scanning ``input_text`` from start to end. Unlike iterating a ``set``
    of strings, this assignment is identical on every run.

    Args:
        input_text: An iterable of token sequences, one per document.

    Returns:
        A tuple ``(index_encodings, word_to_index)``:
        ``index_encodings`` is a list with one list of indices per document,
        and ``word_to_index`` is the dict mapping each word to its index.
    """
    # Materialize first so one-shot iterables (e.g. generators) survive the
    # two passes below.
    documents = [list(tokens) for tokens in input_text]

    # First pass: grow the vocabulary in order of first appearance.
    word_to_index = {}
    for tokens in documents:
        for word in tokens:
            word_to_index.setdefault(word, len(word_to_index))

    # Second pass: replace every token by its index.
    index_encodings = [
        [word_to_index[word] for word in tokens] for tokens in documents
    ]
    return index_encodings, word_to_index

In [None]:
# Demo: index-encode the two sample sentences.
text1, text2 = "all that glitters is not gold", "one for all and all for one"
tokens1, tokens2 = (simple_tokenize(sample) for sample in (text1, text2))
index_encodings, vocabulary = index_encoding([tokens1, tokens2])
# Report the encoding of each sentence, numbered from 1.
for i, encoding in enumerate(index_encodings):
    print(f"Index-based encoding for string {i+1}: {encoding}")
# Leave the vocabulary as the cell's final expression so the notebook shows it.
("Vocabulary:", vocabulary)

Index-based encoding for string 1: [3, 4, 6, 2, 8, 7]
Index-based encoding for string 2: [1, 5, 3, 0, 3, 5, 1]


('Vocabulary:',
 {'and': 0,
  'one': 1,
  'is': 2,
  'all': 3,
  'that': 4,
  'for': 5,
  'glitters': 6,
  'gold': 7,
  'not': 8})

### 2. One-hot encoding

In [None]:
def onehot_enc(input_text):
    """Build a one-hot vector for every distinct word in ``input_text``.

    Args:
        input_text: An iterable of strings (sentences). Words are the
            whitespace-delimited tokens of all sentences taken together.

    Returns:
        A dict mapping each distinct word to a list of ``0``/``1`` ints whose
        length equals the vocabulary size. The single ``1`` sits at the word's
        index, and indices follow the order in which words first appear.
        Returns ``{}`` when there are no words.
    """
    # Join with a space, then split, so every sentence contributes its tokens
    # to one flat word list.
    all_words = ' '.join(input_text).split()

    # dict.fromkeys de-duplicates while preserving first-appearance order,
    # which keeps the index assignment reproducible between runs.
    vocabulary = list(dict.fromkeys(all_words))
    size = len(vocabulary)

    return {
        word: [1 if position == index else 0 for position in range(size)]
        for index, word in enumerate(vocabulary)
    }

# Sample sentences
text1 = "all that glitters is not gold"
text2 = "one for all and all for one"

# Collect every word from both sentences
words = (text1 + " " + text2).split()

# Assign each distinct word an index in order of first appearance.
# dict.fromkeys de-duplicates while keeping insertion order; set() would give
# an order (and therefore indices) that changes between interpreter runs.
word_to_index = {word: idx for idx, word in enumerate(dict.fromkeys(words))}

# Build the one-hot vector of each word: all zeros except a 1 at its index
vocab_size = len(word_to_index)
one_hot_vectors = {
    word: [1 if i == idx else 0 for i in range(vocab_size)]
    for word, idx in word_to_index.items()
}

# Show the result
for word, vector in one_hot_vectors.items():
    print(f"{word}: {vector}")

and: [1, 0, 0, 0, 0, 0, 0, 0, 0]
one: [0, 1, 0, 0, 0, 0, 0, 0, 0]
is: [0, 0, 1, 0, 0, 0, 0, 0, 0]
all: [0, 0, 0, 1, 0, 0, 0, 0, 0]
that: [0, 0, 0, 0, 1, 0, 0, 0, 0]
for: [0, 0, 0, 0, 0, 1, 0, 0, 0]
glitters: [0, 0, 0, 0, 0, 0, 1, 0, 0]
gold: [0, 0, 0, 0, 0, 0, 0, 1, 0]
not: [0, 0, 0, 0, 0, 0, 0, 0, 1]


### 3. Bag-of-words encoding

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

def bow_encoding(input_text):
    """Return the bag-of-words counts of ``input_text`` as a DataFrame.

    A ``CountVectorizer`` with default settings is fitted on ``input_text``.

    Args:
        input_text: The documents passed to ``CountVectorizer.fit_transform``.

    Returns:
        A ``pandas.DataFrame`` with one row per document. The first column,
        ``'Sentences'``, holds the original inputs; the remaining columns are
        the vectorizer's feature names with the corresponding counts.
    """
    counter = CountVectorizer()
    counts = counter.fit_transform(input_text)

    # Densify the sparse count matrix and label its columns with the
    # vocabulary learned by the vectorizer.
    table = pd.DataFrame(
        counts.toarray(),
        columns=counter.get_feature_names_out(),
    )

    # Put the source sentences in front so each row is self-describing.
    table.insert(0, 'Sentences', input_text)
    return table

# Demo: bag-of-words counts for the two sample sentences.
input_text = [
    "all that glitters is not gold",
    "one for all and all for one",
]
result = bow_encoding(input_text)

# Render the table without the DataFrame's row index.
print(result.to_string(index=False))


                    Sentences  all  and  for  glitters  gold  is  not  one  that
all that glitters is not gold    1    0    0         1     1   1    1    0     1
  one for all and all for one    2    1    2         0     0   0    0    2     0


### 4. TF-IDF encoding

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_encoding(input_text):
    """Return the TF-IDF weights of ``input_text`` as a DataFrame.

    A ``TfidfVectorizer`` with default settings is fitted on ``input_text``.

    Args:
        input_text: The documents passed to ``TfidfVectorizer.fit_transform``.

    Returns:
        A ``pandas.DataFrame`` with one row per document. The first column,
        ``'Sentences'``, holds the original inputs; the remaining columns are
        the vectorizer's feature names with the corresponding weights.
    """
    weigher = TfidfVectorizer()
    weights = weigher.fit_transform(input_text)

    # Densify the sparse weight matrix and label its columns with the
    # vocabulary learned by the vectorizer.
    table = pd.DataFrame(
        weights.toarray(),
        columns=weigher.get_feature_names_out(),
    )

    # Put the source sentences in front so each row is self-describing.
    table.insert(0, 'Sentences', input_text)
    return table

In [None]:
# Demo: TF-IDF weights for the two sample sentences.
input_text = [
    "all that glitters is not gold",
    "one for all and all for one",
]
result = tfidf_encoding(input_text)

# Render the table without the DataFrame's row index.
print(result.to_string(index=False))

                    Sentences      all     and      for  glitters    gold      is     not      one    that
all that glitters is not gold 0.303216 0.00000 0.000000   0.42616 0.42616 0.42616 0.42616 0.000000 0.42616
  one for all and all for one 0.428569 0.30117 0.602339   0.00000 0.00000 0.00000 0.00000 0.602339 0.00000


### 5. Co-occurrence matrix

Reduced Co-occurrence Matrix

ฟังก์ชัน co_occurrence_matrix จะทำการสร้าง Co-occurrence Matrix ที่ลดรูป (Reduced) โดยให้พารามิเตอร์ window_size กำหนดขนาดของหน้าต่าง (window) และ k_dimension กำหนดมิติที่จะลดลงมา.

In [None]:
import numpy as np
import pandas as pd

def co_occurrence_matrix(input_text, window_size=1, k_dimension=2):
    """Build a word co-occurrence matrix and reduce it to ``k_dimension`` columns.

    Two tokens co-occur when they stand in the same sentence at most
    ``window_size`` positions apart. A token is never paired with its own
    position, but a repeated word inside the window is counted. The resulting
    symmetric count matrix is compressed with a truncated SVD, so every word
    is represented by a ``k_dimension``-dimensional row vector.

    Args:
        input_text: Iterable of sentences (strings); tokens are the
            whitespace-delimited substrings of each sentence.
        window_size: Number of neighbouring positions considered on each side
            of a token.
        k_dimension: Number of SVD components to keep. Values larger than the
            vocabulary size are clipped to the vocabulary size.

    Returns:
        A ``pandas.DataFrame`` indexed by word (in order of first appearance)
        with columns ``dim_1`` .. ``dim_k``. An empty DataFrame is returned
        when ``input_text`` contains no tokens.

    Raises:
        ValueError: If ``k_dimension`` is less than 1.
    """
    if k_dimension < 1:
        raise ValueError("k_dimension must be at least 1")

    # Tokenize input_text
    tokens = [sentence.split() for sentence in input_text]

    # Vocabulary in first-appearance order; a set would make the row order
    # differ between interpreter runs.
    vocabulary = list(
        dict.fromkeys(word for sentence in tokens for word in sentence)
    )
    if not vocabulary:
        return pd.DataFrame()

    word_to_index = {word: idx for idx, word in enumerate(vocabulary)}
    counts = np.zeros((len(vocabulary), len(vocabulary)))

    # Count neighbours using absolute sentence positions, so that exactly the
    # centre position is excluded from its own window.
    for sentence in tokens:
        for i, word1 in enumerate(sentence):
            row = word_to_index[word1]
            start = max(0, i - window_size)
            stop = min(len(sentence), i + window_size + 1)
            for j in range(start, stop):
                if j != i:
                    counts[row, word_to_index[sentence[j]]] += 1

    # Truncated SVD: keep the k strongest components, scaling the left
    # singular vectors by their singular values. The sign of each component
    # is whatever the SVD routine returns.
    left_vectors, singular_values, _ = np.linalg.svd(counts, full_matrices=False)
    k = min(k_dimension, len(vocabulary))
    reduced = left_vectors[:, :k] * singular_values[:k]

    columns = [f"dim_{n}" for n in range(1, k + 1)]
    return pd.DataFrame(reduced, index=vocabulary, columns=columns)

# Demo: co-occurrence encoding of the two sample sentences.
input_text = [
    "all that glitters is not gold",
    "one for all and all for one",
]
result = co_occurrence_matrix(input_text, k_dimension=3, window_size=2)

# Show the resulting table.
print(result)


          and  one  is  all  that  for  glitters  gold  not
and         1    0   0    1     0    2         0     0    0
one         0    1   0    2     0    2         0     0    0
is          0    0   1    0     1    0         1     1    0
all         2    1   0    3     1    2         1     0    0
that        0    0   1    1     0    0         1     0    0
for         2    2   0    2     0    1         0     0    0
glitters    0    0   1    1     1    0         0     0    1
gold        0    0   1    0     0    0         0     1    1
not         0    0   1    0     0    0         1     1    1
