In [122]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [123]:
# Load the IMDB reviews CSV; the preview below shows 'review' and 'sentiment' columns.
df = pd.read_csv("IMDB_Dataset.csv")

In [124]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# OneHotEncoder (OHE)

In [125]:
# Default settings: the encoded result is a SciPy sparse matrix, so print()
# lists only the coordinates and values of the non-zero entries.
cities = [["Delhi"], ["Mumbai"], ["Delhi"]]
encoder = OneHotEncoder()
X = encoder.fit(cities).transform(cities)

print(X)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3 stored elements and shape (3, 2)>
  Coords	Values
  (0, 0)	1.0
  (1, 1)	1.0
  (2, 0)	1.0


In [126]:
# Dense variant: sparse_output=False makes the encoder return a plain NumPy array.
cities = [["Delhi"], ["Mumbai"], ["Delhi"]]
encoder = OneHotEncoder(sparse_output=False)
X = encoder.fit(cities).transform(cities)

print(X)

[[1. 0.]
 [0. 1.]
 [1. 0.]]


In [127]:
# OneHotEncoder by default returns a sparse matrix (sparse_output=True)
# Sparse matrix stores only non-zero values, so it saves memory for large datasets

# sparse_output=False forces the encoder to return a normal NumPy array (dense output)
# Dense output stores all values including zeros, so it is easier to inspect and debug
# but uses more memory for large datasets

# Use sparse_output=True  -> large data, many categories, memory efficient
# Use sparse_output=False -> small data, learning/debugging, easy to view

In [128]:
# Dense-output encoder that keeps every category column (drop=None) and
# tolerates categories that were not seen during fit (handle_unknown="ignore").
ohe = OneHotEncoder(sparse_output = False, drop = None, handle_unknown = "ignore")  
# drop=None keeps one column per category. In practice drop="first" is often used
# instead (e.g. for linear models) to avoid the dummy variable trap.
# handle_unknown="ignore" tells OneHotEncoder how to deal with new/unseen categories
# that appear in test data but were not present during training.

# By default: handle_unknown="error"
# This means the encoder will throw an error if it sees a new category.

# With handle_unknown="ignore":
# - New categories are ignored
# - Their encoded vector becomes all zeros
# - The model continues working without crashing

# Example:
# Train data has cities: ["Delhi", "Mumbai"]
# Test data has city: ["Bangalore"]

# Without handle_unknown="ignore" -> ERROR
# With handle_unknown="ignore" -> Encoded as [0, 0] for city features

# Use handle_unknown="ignore" in production pipelines to avoid runtime failures

In [129]:
# Demonstration: one-hot encoding the raw 'review' column with dense output
# asks NumPy for a huge float64 array and fails with an out-of-memory error.
# Only MemoryError is caught (NumPy's allocation failure is a MemoryError
# subclass) so that unrelated problems, such as a missing 'review' column,
# still surface instead of being printed as if they were the expected failure.
try:
    ohe.fit_transform(df[['review']])
except MemoryError as e:
    print("Error:", e)

Error: Unable to allocate 18.5 GiB for an array with shape (50000, 49582) and data type float64


In [130]:
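# This error occurs because OneHotEncoder is being applied to raw text data.
# OneHotEncoder treats each whole review string as a single category, so every
# unique review (49,582 of them among the 50,000 rows) becomes its own column.
# With sparse_output=False the result is a dense float64 matrix of shape
# (50000 x 49582): 50000 * 49582 * 8 bytes ~= 18.5 GiB of memory.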

# One-Hot Encoding is meant for categorical features (like city, gender),
# not for text sentences.

# Bag-of-Words (CountVectorizer) is better for text because:
# - It works at word level instead of sentence level
# - It keeps the matrix sparse (stores only non-zero values)
# - It is memory efficient and scalable for large text datasets

# Therefore, use CountVectorizer or TF-IDF for text, not OneHotEncoder.

In [131]:
# Toy corpus: four three-word documents, each with a binary label.
# NOTE: this rebinds `df`, so the IMDB frame loaded earlier is no longer reachable.
data = dict(
    text=[
        "people watch campusx",
        "campusx watch campusx",
        "people write comment",
        "campusx write comment",
    ],
    output=[1, 1, 0, 0],
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,text,output
0,people watch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,0


In [132]:
# One-hot encode the whole 'text' column: each distinct sentence is one category,
# so the four different sentences yield four columns.
ohe = OneHotEncoder(drop=None, handle_unknown="ignore", sparse_output=False)
text_column = df[["text"]]
encoded = ohe.fit(text_column).transform(text_column)
encoded

array([[0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.]])

In [133]:
# Show the one-hot matrix as a table whose columns are named after the categories.
ohe_columns = ohe.get_feature_names_out(["text"])
pd.DataFrame(encoded, columns=ohe_columns)

Unnamed: 0,text_campusx watch campusx,text_campusx write comment,text_people watch campusx,text_people write comment
0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0


# BoW

In [134]:
from sklearn.feature_extraction.text import CountVectorizer

In [135]:
df

Unnamed: 0,text,output
0,people watch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,0


In [136]:
# Bag-of-words vectorizer with default settings.
cv = CountVectorizer()

In [137]:
# Learn the vocabulary from the toy corpus and build the document-term count matrix.
bow = cv.fit_transform(df["text"])

In [138]:
bow

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 11 stored elements and shape (4, 5)>

In [139]:
print(cv.vocabulary_)
# The column indices are assigned in lexicographic (alphabetical) order of the tokens.
# So each sentence is represented as:
# [count of 'campusx', count of 'comment', count of 'people', count of 'watch', count of 'write']

{'people': 2, 'watch': 3, 'campusx': 0, 'write': 4, 'comment': 1}


In [140]:
print(bow[0].toarray())

[[1 0 1 1 0]]


In [141]:
cv.transform(["hello campusx people"]).toarray()
# The out-of-vocabulary (OOV) word 'hello' is silently ignored:
# only 'campusx' and 'people' are counted.

array([[1, 0, 1, 0, 0]], dtype=int64)

In [142]:
bow.toarray()

array([[1, 0, 1, 1, 0],
       [2, 0, 0, 1, 0],
       [0, 1, 1, 0, 1],
       [1, 1, 0, 0, 1]], dtype=int64)

In [143]:
# Show the count matrix as a table with one column per vocabulary token.
token_names = cv.get_feature_names_out()
pd.DataFrame(bow.toarray(), columns=token_names)

Unnamed: 0,campusx,comment,people,watch,write
0,1,0,1,1,0
1,2,0,0,1,0
2,0,1,1,0,1
3,1,1,0,0,1


# N-grams

In [144]:
df

Unnamed: 0,text,output
0,people watch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,0


In [145]:
# bigram: ngram_range=(2, 2) keeps only sequences of exactly two consecutive tokens
cv = CountVectorizer(ngram_range=(2, 2))

In [146]:
# Refit on the toy corpus so that the vocabulary holds the bigrams.
bow = cv.fit_transform(df['text'])

In [147]:
cv.vocabulary_

{'people watch': 2,
 'watch campusx': 4,
 'campusx watch': 0,
 'people write': 3,
 'write comment': 5,
 'campusx write': 1}

In [148]:
# trigram: ngram_range=(3, 3) keeps only sequences of exactly three consecutive tokens.
# Every document here has three words, so each document yields exactly one trigram.
cv = CountVectorizer(ngram_range=(3, 3))
bow = cv.fit_transform(df['text'])
cv.vocabulary_

{'people watch campusx': 2,
 'campusx watch campusx': 0,
 'people write comment': 3,
 'campusx write comment': 1}

In [149]:
# 4-gram: every document in the toy corpus has only 3 tokens, so no 4-gram can
# be formed; the vocabulary ends up empty and fit_transform raises ValueError.
try:
    cv = CountVectorizer(ngram_range=(4, 4))
    bow = cv.fit_transform(df['text'])
except ValueError as e:
    print("ValueError:", e)
else:
    # A bare expression inside a try block is not auto-displayed by Jupyter,
    # so print the vocabulary explicitly if the fit ever succeeds.
    print(cv.vocabulary_)

ValueError: empty vocabulary; perhaps the documents only contain stop words


In [150]:
# ngram_range=(1, 3): unigrams, bigrams and trigrams share one vocabulary.
cv = CountVectorizer(ngram_range=(1, 3))
bow = cv.fit_transform(df['text'])
cv.vocabulary_

{'people': 6,
 'watch': 11,
 'campusx': 0,
 'people watch': 7,
 'watch campusx': 12,
 'people watch campusx': 8,
 'campusx watch': 1,
 'campusx watch campusx': 2,
 'write': 13,
 'comment': 5,
 'people write': 9,
 'write comment': 14,
 'people write comment': 10,
 'campusx write': 3,
 'campusx write comment': 4}

# Tf-Idf

In [151]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [152]:
df

Unnamed: 0,text,output
0,people watch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,0


In [153]:
# TF-IDF vectorizer with default settings, fitted on the toy corpus.
# NOTE: this rebinds `encoded`, which earlier held the one-hot encoded array.
tfidf = TfidfVectorizer()
encoded = tfidf.fit_transform(df["text"])

In [154]:
encoded

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 11 stored elements and shape (4, 5)>

In [155]:
encoded.toarray()

array([[0.49681612, 0.        , 0.61366674, 0.61366674, 0.        ],
       [0.8508161 , 0.        , 0.        , 0.52546357, 0.        ],
       [0.        , 0.57735027, 0.57735027, 0.        , 0.57735027],
       [0.49681612, 0.61366674, 0.        , 0.        , 0.61366674]])

In [156]:
# Show the TF-IDF matrix as a table with one column per vocabulary token.
tfidf_columns = tfidf.get_feature_names_out()
pd.DataFrame(encoded.toarray(), columns=tfidf_columns)

Unnamed: 0,campusx,comment,people,watch,write
0,0.496816,0.0,0.613667,0.613667,0.0
1,0.850816,0.0,0.0,0.525464,0.0
2,0.0,0.57735,0.57735,0.0,0.57735
3,0.496816,0.613667,0.0,0.0,0.613667


In [157]:
tfidf.vocabulary_

{'people': 2, 'watch': 3, 'campusx': 0, 'write': 4, 'comment': 1}

In [158]:
# idf_ holds one inverse-document-frequency weight per feature, printed here next to
# the feature names. 'campusx' occurs in the most documents (3 of 4), so its weight
# is the lowest.
print(tfidf.idf_)
print(tfidf.get_feature_names_out())

[1.22314355 1.51082562 1.51082562 1.51082562 1.51082562]
['campusx' 'comment' 'people' 'watch' 'write']
