### Apply Preprocessing to data and clean the data that we need for getting it ready for One-hot encoding, the following clean processing will be done on the data

-  Tokenize the data
-  Remove Stopwords
-  Stemming
-  Remove Punctuation and Special characters
-  Removing Numbers



In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
#from nltk.tokenize import sent_tokenize

# Download necessary NLTK packages
nltk.download('punkt')  #parts-of-speech tagger library
nltk.download('stopwords')

# Load the corpus
with open('files/LabE6.txt','r') as file:
    corpus = file.read()

# Tokenize the corpus
tokens = word_tokenize(corpus)

# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

# Apply Stemming (we can also use Lemmatization)
stemmer = PorterStemmer()  #using PorterStemmer (other alternative SnowballStemmer, LancesterStemmer, RegexpStemmer)
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

# Removing Punctuations and Special Characters
filtered_tokens = [token for token in stemmed_tokens if token.isalnum()]

# Removing Numbers (we cannot do much with numbers in one-hot encoding, so removing token with only numbers)
filtered_tokens = [token for token in filtered_tokens if not token.isnumeric()]

# Output the preprocessed corpus
#print(filtered_tokens)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ranjit09\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ranjit09\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Create One-hot encoding vectors for the pre-processed Token generated above

In [2]:
import numpy as np

word_vectors = []
i = 0
for w in filtered_tokens:  
    t = np.zeros(len(filtered_tokens),dtype=np.int16)
    t[i] = 1
    word_vectors.append(t)
    i = i+1
i=0
dict = {}
for e in word_vectors:
    print(filtered_tokens[i], ' ----> ', e)
    i=i+1

one  ---->  [1 0 0 ... 0 0 0]
review  ---->  [0 1 0 ... 0 0 0]
mention  ---->  [0 0 1 ... 0 0 0]
watch  ---->  [0 0 0 ... 0 0 0]
oz  ---->  [0 0 0 ... 0 0 0]
episod  ---->  [0 0 0 ... 0 0 0]
hook  ---->  [0 0 0 ... 0 0 0]
right  ---->  [0 0 0 ... 0 0 0]
exactli  ---->  [0 0 0 ... 0 0 0]
happen  ---->  [0 0 0 ... 0 0 0]
br  ---->  [0 0 0 ... 0 0 0]
br  ---->  [0 0 0 ... 0 0 0]
first  ---->  [0 0 0 ... 0 0 0]
thing  ---->  [0 0 0 ... 0 0 0]
struck  ---->  [0 0 0 ... 0 0 0]
oz  ---->  [0 0 0 ... 0 0 0]
brutal  ---->  [0 0 0 ... 0 0 0]
unflinch  ---->  [0 0 0 ... 0 0 0]
scene  ---->  [0 0 0 ... 0 0 0]
violenc  ---->  [0 0 0 ... 0 0 0]
set  ---->  [0 0 0 ... 0 0 0]
right  ---->  [0 0 0 ... 0 0 0]
word  ---->  [0 0 0 ... 0 0 0]
go  ---->  [0 0 0 ... 0 0 0]
trust  ---->  [0 0 0 ... 0 0 0]
show  ---->  [0 0 0 ... 0 0 0]
faint  ---->  [0 0 0 ... 0 0 0]
heart  ---->  [0 0 0 ... 0 0 0]
timid  ---->  [0 0 0 ... 0 0 0]
show  ---->  [0 0 0 ... 0 0 0]
pull  ---->  [0 0 0 ... 0 0 0]
punch  ---->  [0 0

allow  ---->  [0 0 0 ... 0 0 0]
rest  ---->  [0 0 0 ... 0 0 0]
act  ---->  [0 0 0 ... 0 0 0]
hard  ---->  [0 0 0 ... 0 0 0]
judg  ---->  [0 0 0 ... 0 0 0]
movi  ---->  [0 0 0 ... 0 0 0]
ridicul  ---->  [0 0 0 ... 0 0 0]
predict  ---->  [0 0 0 ... 0 0 0]
main  ---->  [0 0 0 ... 0 0 0]
charact  ---->  [0 0 0 ... 0 0 0]
total  ---->  [0 0 0 ... 0 0 0]
unsympathet  ---->  [0 0 0 ... 0 0 0]
therefor  ---->  [0 0 0 ... 0 0 0]
bore  ---->  [0 0 0 ... 0 0 0]
watch  ---->  [0 0 0 ... 0 0 0]
real  ---->  [0 0 0 ... 0 0 0]
emot  ---->  [0 0 0 ... 0 0 0]
depth  ---->  [0 0 0 ... 0 0 0]
stori  ---->  [0 0 0 ... 0 0 0]
movi  ---->  [0 0 0 ... 0 0 0]
revolv  ---->  [0 0 0 ... 0 0 0]
actor  ---->  [0 0 0 ... 0 0 0]
ca  ---->  [0 0 0 ... 0 0 0]
get  ---->  [0 0 0 ... 0 0 0]
work  ---->  [0 0 0 ... 0 0 0]
feel  ---->  [0 0 0 ... 0 0 0]
origin  ---->  [0 0 0 ... 0 0 0]
develop  ---->  [0 0 0 ... 0 0 0]
cop  ---->  [0 0 0 ... 0 0 0]
feel  ---->  [0 0 0 ... 0 0 0]
like  ---->  [0 0 0 ... 0 0 0]
one  ----> 

thing  ---->  [0 0 0 ... 0 0 0]
left  ---->  [0 0 0 ... 0 0 0]
could  ---->  [0 0 0 ... 0 0 0]
offend  ---->  [0 0 0 ... 0 0 0]
peopl  ---->  [0 0 0 ... 0 0 0]
idea  ---->  [0 0 0 ... 0 0 0]
suicid  ---->  [0 0 0 ... 0 0 0]
begin  ---->  [0 0 0 ... 0 0 0]
anybodi  ---->  [0 0 0 ... 0 0 0]
need  ---->  [0 0 0 ... 0 0 0]
see  ---->  [0 0 0 ... 0 0 0]
movi  ---->  [0 0 0 ... 0 0 0]
honestli  ---->  [0 0 0 ... 0 0 0]
portray  ---->  [0 0 0 ... 0 0 0]
suicid  ---->  [0 0 0 ... 0 0 0]
one  ---->  [0 0 0 ... 0 0 0]
better  ---->  [0 0 0 ... 0 0 0]
one  ---->  [0 0 0 ... 0 0 0]
like  ---->  [0 0 0 ... 0 0 0]
virgin  ---->  [0 0 0 ... 0 0 0]
suicid  ---->  [0 0 0 ... 0 0 0]
teenag  ---->  [0 0 0 ... 0 0 0]
movi  ---->  [0 0 0 ... 0 0 0]
rate  ---->  [0 0 0 ... 0 0 0]
r  ---->  [0 0 0 ... 0 0 0]
pure  ---->  [0 0 0 ... 0 0 0]
suicid  ---->  [0 0 0 ... 0 0 0]
aspect  ---->  [0 0 0 ... 0 0 0]
littl  ---->  [0 0 0 ... 0 0 0]
chanc  ---->  [0 0 0 ... 0 0 0]
turn  ---->  [0 0 0 ... 0 0 0]
stori  ----

bride  ---->  [0 0 0 ... 0 0 0]
kill  ---->  [0 0 0 ... 0 0 0]
extract  ---->  [0 0 0 ... 0 0 0]
fluid  ---->  [0 0 0 ... 0 0 0]
bodi  ---->  [0 0 0 ... 0 0 0]
keep  ---->  [0 0 0 ... 0 0 0]
age  ---->  [0 0 0 ... 0 0 0]
wife  ---->  [0 0 0 ... 0 0 0]
look  ---->  [0 0 0 ... 0 0 0]
young  ---->  [0 0 0 ... 0 0 0]
report  ---->  [0 0 0 ... 0 0 0]
doctor  ---->  [0 0 0 ... 0 0 0]
stay  ---->  [0 0 0 ... 0 0 0]
night  ---->  [0 0 0 ... 0 0 0]
home  ---->  [0 0 0 ... 0 0 0]
discov  ---->  [0 0 0 ... 0 0 0]
respons  ---->  [0 0 0 ... 0 0 0]
bride  ---->  [0 0 0 ... 0 0 0]
death  ---->  [0 0 0 ... 0 0 0]
follow  ---->  [0 0 0 ... 0 0 0]
morn  ---->  [0 0 0 ... 0 0 0]
report  ---->  [0 0 0 ... 0 0 0]
murder  ---->  [0 0 0 ... 0 0 0]
polic  ---->  [0 0 0 ... 0 0 0]
mad  ---->  [0 0 0 ... 0 0 0]
scientist  ---->  [0 0 0 ... 0 0 0]
shot  ---->  [0 0 0 ... 0 0 0]
drop  ---->  [0 0 0 ... 0 0 0]
dead  ---->  [0 0 0 ... 0 0 0]
shortli  ---->  [0 0 0 ... 0 0 0]
br  ---->  [0 0 0 ... 0 0 0]
br  ----> 

lot  ---->  [0 0 0 ... 0 0 0]
better  ---->  [0 0 0 ... 0 0 0]
next  ---->  [0 0 0 ... 0 0 0]
movi  ---->  [0 0 0 ... 0 0 0]
thank  ---->  [0 0 0 ... 0 0 0]
sure  ---->  [0 0 0 ... 0 0 0]
produc  ---->  [0 0 0 ... 0 0 0]
need  ---->  [0 0 0 ... 0 0 0]
trade  ---->  [0 0 0 ... 0 0 0]
name  ---->  [0 0 0 ... 0 0 0]
somewhat  ---->  [0 0 0 ... 0 0 0]
success  ---->  [0 0 0 ... 0 0 0]
movi  ---->  [0 0 0 ... 0 0 0]
franchis  ---->  [0 0 0 ... 0 0 0]
titl  ---->  [0 0 0 ... 0 0 0]
suggest  ---->  [0 0 0 ... 0 0 0]
sequel  ---->  [0 0 0 ... 0 0 0]
first  ---->  [0 0 0 ... 0 0 0]
three  ---->  [0 0 0 ... 0 0 0]
movi  ---->  [0 0 0 ... 0 0 0]
even  ---->  [0 0 0 ... 0 0 0]
though  ---->  [0 0 0 ... 0 0 0]
marqu  ---->  [0 0 0 ... 0 0 0]
houston  ---->  [0 0 0 ... 0 0 0]
appear  ---->  [0 0 0 ... 0 0 0]
hp3  ---->  [0 0 0 ... 0 0 0]
play  ---->  [0 0 0 ... 0 0 0]
total  ---->  [0 0 0 ... 0 0 0]
differ  ---->  [0 0 0 ... 0 0 0]
charact  ---->  [0 0 0 ... 0 0 0]
eight  ---->  [0 0 0 ... 0 0 0]
ye

good  ---->  [0 0 0 ... 0 0 0]
focu  ---->  [0 0 0 ... 0 0 0]
move  ---->  [0 0 0 ... 0 0 0]
film  ---->  [0 0 0 ... 0 0 0]
clunki  ---->  [0 0 0 ... 0 0 0]
slow  ---->  [0 0 0 ... 0 0 0]
pace  ---->  [0 0 0 ... 0 0 0]
switch  ---->  [0 0 0 ... 0 0 0]
realiti  ---->  [0 0 0 ... 0 0 0]
actual  ---->  [0 0 0 ... 0 0 0]
happen  ---->  [0 0 0 ... 0 0 0]
book  ---->  [0 0 0 ... 0 0 0]
quickli  ---->  [0 0 0 ... 0 0 0]
got  ---->  [0 0 0 ... 0 0 0]
annoy  ---->  [0 0 0 ... 0 0 0]
actual  ---->  [0 0 0 ... 0 0 0]
book  ---->  [0 0 0 ... 0 0 0]
film  ---->  [0 0 0 ... 0 0 0]
titl  ---->  [0 0 0 ... 0 0 0]
number  ---->  [0 0 0 ... 0 0 0]
aw  ---->  [0 0 0 ... 0 0 0]
detect  ---->  [0 0 0 ... 0 0 0]
stori  ---->  [0 0 0 ... 0 0 0]
audienc  ---->  [0 0 0 ... 0 0 0]
get  ---->  [0 0 0 ... 0 0 0]
stuck  ---->  [0 0 0 ... 0 0 0]
listen  ---->  [0 0 0 ... 0 0 0]
carrey  ---->  [0 0 0 ... 0 0 0]
narrat  ---->  [0 0 0 ... 0 0 0]
bore  ---->  [0 0 0 ... 0 0 0]
tear  ---->  [0 0 0 ... 0 0 0]
carrey  ---

### Create a Bag of words by splitting the content first into 