<a href="https://colab.research.google.com/github/OTN-Rajapaksha/One-Hot-Encoding/blob/main/One_Hot_Encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **One-Hot-Encoding**


🧠 What is One-Hot Encoding?

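One-Hot Encoding is a method for converting categorical variables into a numeric format that machine learning algorithms can use. Each category is represented by a binary vector that contains a single 1 at the position assigned to that category and 0 everywhere else.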

In [3]:
def get_onehot_vector(somestring, vocabulary=None):
  """One-hot encode every whitespace-separated word of a string.

  Args:
    somestring: Text to encode; it is split on whitespace.
    vocabulary: Optional mapping from word to its 1-based id. When omitted,
      the notebook-level ``vocab`` mapping is used, as before.

  Returns:
    A list with one row per word. Each row holds ``len(vocabulary)`` integers
    with a 1 at index ``id - 1`` for a known word; a word that is not in the
    vocabulary produces an all-zero row.
  """
  if vocabulary is None:
    # Backward-compatible fallback to the mapping defined at notebook level.
    vocabulary = vocab
  width = len(vocabulary)  # loop-invariant, so computed once
  onehot_encoded = []
  for word in somestring.split():
    row = [0] * width
    if word in vocabulary:
      # Ids start at 1, list indices at 0.
      row[vocabulary[word] - 1] = 1
    onehot_encoded.append(row)
  return onehot_encoded

# Toy documents for the demonstration
processed_docs = [
    "This is a test sentence",
    "Another sentence for testing",
]

# Vocabulary: every distinct word mapped to its 1-based id, in order of first appearance
vocab = {
    word: position
    for position, word in enumerate(
        ["This", "is", "a", "test", "sentence", "Another", "for", "testing"],
        start=1,
    )
}

# Encode the second document; the bare expression is what the cell displays
get_onehot_vector(processed_docs[1])

[[0, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 0, 0, 1]]

In [9]:
def get_onehot_vector(somestring, vocabulary=None):
  """One-hot encode every whitespace-separated word of a string.

  Args:
    somestring: Text to encode; it is split on whitespace.
    vocabulary: Optional mapping from word to its 1-based id. When omitted,
      the notebook-level ``vocab`` mapping is used, as before.

  Returns:
    A list with one row per word. Each row holds ``len(vocabulary)`` integers
    with a 1 at index ``id - 1`` for a known word; a word that is not in the
    vocabulary produces an all-zero row.
  """
  if vocabulary is None:
    # Backward-compatible fallback to the mapping defined at notebook level.
    vocabulary = vocab
  width = len(vocabulary)  # loop-invariant, so computed once
  onehot_encoded = []
  for word in somestring.split():
    row = [0] * width
    if word in vocabulary:
      # Ids start at 1, list indices at 0.
      row[vocabulary[word] - 1] = 1
    onehot_encoded.append(row)
  return onehot_encoded

# Toy documents for the demonstration
processed_docs = [
    "Oshan Thiyanga Nawod Rajapaksha sentence",
    "Another sentence for testing",
]

# Vocabulary: every distinct word mapped to its 1-based id, in order of first appearance
vocab = {
    word: position
    for position, word in enumerate(
        ["Oshan", "Thiyanga", "Nawod", "Rajapaksha", "sentence", "Another", "for", "testing"],
        start=1,
    )
}

# Encode the second document; the bare expression is what the cell displays
get_onehot_vector(processed_docs[1])

[[0, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 0, 0, 1]]

In [12]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: four short sentences whose words make up the vocabulary
corpus = [
    'dog bites dog and dog dog runs.',
    'man bites dog.',
    'And this is the third one.',
    'Is this the first document?',
]

# Step 1: fit a bag-of-words model on the corpus and read back the terms it learned
# (get_feature_names_out() is the current accessor for the fitted vocabulary)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
V = vectorizer.get_feature_names_out()
print("Vocabulary List:\n", V)

# Step 2: map every vocabulary term to an integer label
V_labels = LabelEncoder().fit_transform(V)
print("Label Encoded:\n", V_labels)

# Step 3: turn the 1D label array into a single column, the 2D layout
# that OneHotEncoder is given below
V_labels = V_labels.reshape(-1, 1)
print("Reshaped Labels:\n", V_labels)

# Step 4: one-hot encode the label column
encoder = OneHotEncoder()
y = encoder.fit_transform(V_labels)
print("One-Hot Encoded:\n", y)

Vocabulary List:
 ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
Label Encoded:
 [0 1 2 3 4 5 6 7 8]
Reshaped Labels:
 [[0]
 [1]
 [2]
 [3]
 [4]
 [5]
 [6]
 [7]
 [8]]
One-Hot Encoded:
 <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9 stored elements and shape (9, 9)>
  Coords	Values
  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0
  (4, 4)	1.0
  (5, 5)	1.0
  (6, 6)	1.0
  (7, 7)	1.0
  (8, 8)	1.0


In [13]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: two short sentences whose words make up the vocabulary
corpus = [
    'dog bites dog and dog dog runs.',
    'man bites dog.',
]

# Step 1: fit a bag-of-words model on the corpus and read back the terms it learned
# (get_feature_names_out() is the current accessor for the fitted vocabulary)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
V = vectorizer.get_feature_names_out()
print("Vocabulary List:\n", V)

# Step 2: map every vocabulary term to an integer label
V_labels = LabelEncoder().fit_transform(V)
print("Label Encoded:\n", V_labels)

# Step 3: turn the 1D label array into a single column, the 2D layout
# that OneHotEncoder is given below
V_labels = V_labels.reshape(-1, 1)
print("Reshaped Labels:\n", V_labels)

# Step 4: one-hot encode the label column
encoder = OneHotEncoder()
y = encoder.fit_transform(V_labels)
print("One-Hot Encoded:\n", y)

Vocabulary List:
 ['and' 'bites' 'dog' 'man' 'runs']
Label Encoded:
 [0 1 2 3 4]
Reshaped Labels:
 [[0]
 [1]
 [2]
 [3]
 [4]]
One-Hot Encoded:
 <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5 stored elements and shape (5, 5)>
  Coords	Values
  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0
  (4, 4)	1.0
