In [None]:
# Without Library
# One-Hot Encoding
def one_hot_encoding(data):
    """Encode every item of *data* as a one-hot vector.

    Columns correspond to the distinct values of *data* in order of first
    appearance. This makes the layout reproducible between runs; iterating
    a ``set`` of strings is not, because string hashing is randomized per
    interpreter process.

    Args:
        data: Iterable of hashable values (consumed once, so generators
            are fine).

    Returns:
        A list with one row per input item. Each row is a list of ints
        containing a single 1 at the column of that item's value and 0
        everywhere else. An empty input yields an empty list.
    """
    items = list(data)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    column_of = {
        value: col for col, value in enumerate(dict.fromkeys(items))
    }
    width = len(column_of)

    encoding = []
    for value in items:
        row = [0] * width
        row[column_of[value]] = 1
        encoding.append(row)
    return encoding

# Count Vectorizer
def count_vectorizer(data):
    """Build bag-of-words count vectors for a list of sentences.

    Sentences are tokenized with ``str.split()`` (whitespace separated,
    case-sensitive, punctuation kept). The vocabulary is every distinct
    token across all sentences, ordered by first appearance, so column
    positions are deterministic.

    Args:
        data: Iterable of strings, one per document.

    Returns:
        A list with one row per sentence; ``row[j]`` is the number of times
        vocabulary word ``j`` occurs in that sentence. An empty input
        yields an empty list.
    """
    tokenized = [sentence.split() for sentence in data]

    # Map each word to its column. The vocabulary must come from the
    # individual tokens, not from the whole sentences.
    column_of = {}
    for words in tokenized:
        for word in words:
            column_of.setdefault(word, len(column_of))

    width = len(column_of)
    vectors = []
    for words in tokenized:
        vector = [0] * width
        for word in words:
            vector[column_of[word]] += 1
        vectors.append(vector)
    return vectors

# TF-IDF (Term Frequency-Inverse Document Frequency)
def tf_idf(data):
    """Compute a TF-IDF matrix for a list of sentences.

    Sentences are tokenized with ``str.split()``. The vocabulary is every
    distinct token, ordered by first appearance, so column positions are
    deterministic.

    * ``tf(word, doc)`` is ``count(word in doc) / number of tokens in doc``.
    * ``idf(word)`` is ``log(N / (1 + df(word)))``, where ``N`` is the
      number of documents and ``df`` is the number of documents containing
      the word as a whole token. With this smoothing, a word present in
      ``N - 1`` or more documents gets an idf of zero or below.

    Args:
        data: Iterable of strings, one per document.

    Returns:
        A list with one row per sentence; ``row[j]`` is the TF-IDF weight
        (a float) of vocabulary word ``j`` in that sentence. A sentence
        with no tokens yields a row of ``0.0``. An empty input yields an
        empty list.
    """
    # Imported locally so the function does not rely on a module-level
    # ``np`` alias being defined elsewhere in the file.
    import math

    tokenized = [sentence.split() for sentence in data]
    vocabulary = list(
        dict.fromkeys(word for words in tokenized for word in words)
    )
    n_docs = len(tokenized)

    # Document frequency counts whole tokens. A substring test on the raw
    # sentence would count "a" as present in "again".
    doc_freq = dict.fromkeys(vocabulary, 0)
    for words in tokenized:
        for word in set(words):
            doc_freq[word] += 1
    idf = {
        word: math.log(n_docs / (1 + freq)) for word, freq in doc_freq.items()
    }

    tfidf_matrix = []
    for words in tokenized:
        total = len(words)
        if total == 0:
            # No tokens: every term frequency is zero; avoid dividing by 0.
            tfidf_matrix.append([0.0] * len(vocabulary))
            continue
        counts = {}
        for word in words:
            counts[word] = counts.get(word, 0) + 1
        tfidf_matrix.append(
            [counts.get(word, 0) / total * idf[word] for word in vocabulary]
        )
    return tfidf_matrix

# Example usage: run the three hand-written encoders defined above on a
# small list of strings and print each result row by row.
data = ["hello world", "hello", "world", "hello again"]
print("Original Data:", data)

# One-Hot Encoding
# Each whole string in `data` is treated as one categorical value, so the
# result has one row per entry of `data`.
one_hot_encoded = one_hot_encoding(data)
print("\nOne-Hot Encoded:")
for sentence in one_hot_encoded:  # `sentence` is the encoded row (a list)
    print(sentence)

# Count Vectorizer
# One count vector per entry of `data`.
count_vectors = count_vectorizer(data)
print("\nCount Vectors:")
for vector in count_vectors:
    print(vector)

# TF-IDF
# One row of TF-IDF weights per entry of `data`.
tfidf_matrix = tf_idf(data)
print("\nTF-IDF Matrix:")
for vector in tfidf_matrix:
    print(vector)


In [None]:
# With Library
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Example data: four short documents, vectorized three ways with
# scikit-learn. Each section fits a vectorizer on `documents`, prints the
# dense feature matrix, then prints the learned vocabulary (column names).
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# One-hot encoding
# binary=True makes CountVectorizer record presence (1) or absence (0) of
# each vocabulary term per document instead of its count, so a row can hold
# several 1s (a binary bag-of-words rather than a single-category one-hot).
# NOTE(review): `one_hot_encoded` rebinds the name used for the hand-written
# result earlier in this file.
vectorizer = CountVectorizer(binary=True)
one_hot_encoded = vectorizer.fit_transform(documents)
print("One-Hot Encoded Features:")
print(one_hot_encoded.toarray())  # fit_transform returns a sparse matrix
print("Vocabulary:")
print(vectorizer.get_feature_names_out())

# Count Vectorizer
# NOTE(review): this assignment rebinds `count_vectorizer`, which is also the
# name of the hand-written function defined earlier in this file; after this
# line the name refers to the scikit-learn object. Consider a distinct name
# once it is confirmed nothing later depends on it.
count_vectorizer = CountVectorizer()
count_vectorized = count_vectorizer.fit_transform(documents)
print("\nCount Vectorized Features:")
print(count_vectorized.toarray())
print("Vocabulary:")
print(count_vectorizer.get_feature_names_out())

# TF-IDF Vectorizer
# Uses the library's default settings; presumably the weights will not match
# the hand-written tf_idf() above numerically, since the formulas may differ
# (verify against the scikit-learn docs).
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorized = tfidf_vectorizer.fit_transform(documents)
print("\nTF-IDF Vectorized Features:")
print(tfidf_vectorized.toarray())
print("Vocabulary:")
print(tfidf_vectorizer.get_feature_names_out())
