### Complete code implementation of Vector representation
Topics will be covered in this notebook: <br>
<ul>
<li>Tokenization</li>
<li>Bag of word</li>
<li>TF/IDF</li>
<li>Embedding layer</li>
<li>Stemming/lemmatization</li>
<li>Stopwords handling</li>
</ul>

In [4]:
import numpy as np
import pandas as pd
import nltk
import tensorflow
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.utils import plot_model
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# VECTOR REPRESENTATION USING SKLEARN
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import string
from textblob import TextBlob


In [8]:
## READING DATA FROM DIRECTORY

print("No. of lines in train data : ",len(open('Data/train.txt','r').readlines()))
print("No. of lines in test data : ",len(open('Data/test.txt','r').readlines()))
print("No. of lines in validation data : ",len(open('Data/val.txt','r').readlines()))


No. of lines in train data :  16000
No. of lines in test data :  2000
No. of lines in validation data :  2000


In [9]:
## READING DATA FROM DIRECTORY

train = open('data/train.txt','r').readlines()
test = open('data/test.txt','r').readlines()
val = open('data/val.txt','r').readlines()

In [10]:
## CONCATENATING ALL THE DATASETS

full_data = train + test + val
len(full_data)

20000

In [11]:
## DEFINING X AND Y VARIABLES, (INDEPENDENT AND DEPENDENT)

x = []
y = []
for item in full_data:
    text,label = item.split(';')
    label = label.replace('\n','')
    x.append(text)
    y.append(label)

In [12]:
print("length of X and Y var :",len(x),len(y))

length of X and Y var : 20000 20000


In [16]:
## RAW TEXT 
x[0:10]

['i didnt feel humiliated',
 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
 'im grabbing a minute to post i feel greedy wrong',
 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property',
 'i am feeling grouchy',
 'ive been feeling a little burdened lately wasnt sure why that was',
 'ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny',
 'i feel as confused about life as a teenager or as jaded as a year old man',
 'i have been with petronas for years i feel that petronas has performed well and made a huge profit',
 'i feel romantic too']

## TEXT-CLEANING

In [17]:
# first we will do lowerise all the text
# second we will perform word tokenization
# third we will remove stopwords
# fourth we will perform stemming or lemmataization on each word
# then make a clean text 
stem = PorterStemmer()

def text_cleaning(sentences):
    clean_text = []
    for sent in sentences:
        lower_sent = sent.lower()   # first lowerising the sentence
        word_tokenize = nltk.word_tokenize(sent)   #performing word_tokenization
        removed_stop_words = [word for word in word_tokenize if word not in stopwords.words('english')]  # remove stopwords
        stemmed_sent = [stem.stem(word) for word in removed_stop_words]  # apply stemming
        cleaned = " ".join(stemmed_sent)  # joining our final words
        clean_text.append(cleaned)      # appending cleaned text in a separate list
    return clean_text


CLEAN_TEXT = text_cleaning(x)
CLEAN_TEXT[0:10]


['didnt feel humili',
 'go feel hopeless damn hope around someon care awak',
 'im grab minut post feel greedi wrong',
 'ever feel nostalg fireplac know still properti',
 'feel grouchi',
 'ive feel littl burden late wasnt sure',
 'ive take milligram time recommend amount ive fallen asleep lot faster also feel like funni',
 'feel confus life teenag jade year old man',
 'petrona year feel petrona perform well made huge profit',
 'feel romant']

##   TOKENIZATION   


In [21]:

## creating the object Tokenizer
tokenizer = Tokenizer(oov_token='<nothing>')
# we can also pass num_word parameter

#we are fitting tokenizer on our cleaned dataset
tokenizer.fit_on_texts(CLEAN_TEXT)


In [73]:
## OCCURENCES OF WORD IN CORPUS
# tokenizer.word_counts


In [24]:
## TOTAL NO. OF DOCUMENTS,
tokenizer.document_count


20000

In [26]:
## TOKENIZATION, CONVERT THE TEXT INTO VECTOR 
sequences = tokenizer.texts_to_sequences(CLEAN_TEXT)
sequences[0:5]

# AS YOU CAN SEE, ALL ARE WORDS GOT IT UNIQUE NUMBER,
# BUT THIS INPUT HAS NO EQUAL LENGTH,

[[61, 2, 522],
 [10, 2, 419, 682, 67, 50, 60, 96, 1229],
 [4, 1230, 431, 107, 2, 432, 192],
 [92, 2, 592, 3696, 7, 21, 2844],
 [2, 918]]

In [34]:
maximum_sentence_length = max(list(map(len,CLEAN_TEXT)))
maximum_sentence_length

208

In [27]:
## TO EQUALIZE THE VECTOR LENGTH, USING PAD SEQUENCES
## SO WE ARE ADDING 0,0.. IN ENDING OF THE VECTOR TO EQUALIZE THE LENGTH
## we are using maxlen=35, for an example because actually, its to high -> 208

sequences = pad_sequences(sequences,padding='post',maxlen=35)
sequences[0:5]


array([[  61,    2,  522,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [  10,    2,  419,  682,   67,   50,   60,   96, 1229,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [   4, 1230,  431,  107,    2,  432,  192,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [  92,    2,  592, 3696,    7,   21, 2844,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [   2,  918,    0,   

## VECTOR REPRESENTATION USING SKLEARN
CountVectorizer

In [37]:
CLEAN_TEXT[0:5]

['didnt feel humili',
 'go feel hopeless damn hope around someon care awak',
 'im grab minut post feel greedi wrong',
 'ever feel nostalg fireplac know still properti',
 'feel grouchi']

In [43]:
# Create an instance of CountVectorizer
vectorizer = CountVectorizer()
# HYPERPARAMETERS FOR CountVectorizer
# stop_words='english'
# max_features=1000
# ngram_range=(1, 2)

# Fit and transform the corpus to create a document-term matrix
BOW = vectorizer.fit_transform(CLEAN_TEXT)
BOW 
## we got a sparse matrix

<20000x11578 sparse matrix of type '<class 'numpy.int64'>'
	with 179160 stored elements in Compressed Sparse Row format>

In [52]:
# Get the feature names (words)

feature_names = vectorizer.get_feature_names_out()

# # Convert the document-term matrix to a dense array for better visualization
dense_array = BOW.toarray()
## To GET DENSE ARRAY
dense_array.shape


(20000, 11578)

In [50]:
# AS YOU CAN SEE, CONVERTED RAW TEXT INTO NUMERICAL REPRESENTATION
dense_array[0:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

Dense Matrix:
In a dense matrix, most of the elements are nonzero.
All elements, including zero values, are stored in memory.

Sparse Matrix:
In a sparse matrix, the majority of elements are zero.
Only nonzero elements are stored along with their indices.

## TF\IDF

In [59]:
## TFIDF VECTOR REPRESENTATION USING SKLEARN
tfidf = TfidfVectorizer()
tfidf_vector = tfidf.fit_transform(CLEAN_TEXT).toarray()
tfidf_vector

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## EMBADDING LAYER - (word embedding)
* word2vec
* glove

In [61]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential

In [69]:
## THIS IS OUR PREVIOUS CLEAN TEXT 
CLEAN_TEXT[0:5]

['didnt feel humili',
 'go feel hopeless damn hope around someon care awak',
 'im grab minut post feel greedi wrong',
 'ever feel nostalg fireplac know still properti',
 'feel grouchi']

In [70]:
from tensorflow.keras.preprocessing.text import one_hot

In [72]:
## one hot representation with vocab_size=10000, i defined manually
## but vocab_size = len(tokenizer.word_index) <- means unique words

vocab_size = 10000
one_hot_repre = [one_hot(sent,vocab_size) for sent in CLEAN_TEXT]
one_hot_repre[0:5]


[[1689, 150, 9845],
 [6801, 150, 5832, 4965, 1011, 933, 5519, 633, 5800],
 [6074, 47, 1381, 8051, 150, 5654, 9577],
 [537, 150, 7461, 7831, 727, 9389, 6445],
 [150, 4621]]

In [75]:
## PAD_SEQUENCE

sequences = pad_sequences(one_hot_repre,padding='pre',maxlen=35)
# maxlen  --> maximum sequence input length
sequences[0:2]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 1689,
         150, 9845],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0, 6801,  150, 5832, 4965, 1011,  933, 5519,
         633, 5800]])

In [76]:
sequences.shape

(20000, 35)

In [78]:
# Define the maximum number of unique words in your vocabulary

# Create a Sequential model with an Embedding layer
model = Sequential()
# Add the Embedding layer
model.add(Embedding(input_dim=vocab_size, output_dim=20, input_length=35))
# Display the model summary

model.compile(loss='mse',optimizer='adam')
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 35, 20)            200000    
                                                                 
Total params: 200000 (781.25 KB)
Trainable params: 200000 (781.25 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [81]:
embeddings = model.predict(sequences)
embeddings.shape



(20000, 35, 20)

In [83]:
print("shape of our previous data : ",sequences.shape)
print("shape of Now : ",embeddings.shape)

shape of our previous data :  (20000, 35)
shape of Now :  (20000, 35, 20)


In [87]:
embeddings[0].shape


(35, 20)

In [88]:
embeddings[1]
## AS YOU CAN SEE OUR 1ST SENTENCE GOT REPRESENTED IN NUMBER

array([[-0.04378361,  0.03708205,  0.02849576,  0.00590458,  0.04794595,
        -0.00033931, -0.04129771, -0.02708526,  0.04577501,  0.0384303 ,
        -0.0343819 , -0.02140981,  0.0269854 ,  0.0224081 ,  0.01456524,
        -0.00935118,  0.02094496, -0.01622526,  0.0063672 , -0.03069818],
       [-0.04378361,  0.03708205,  0.02849576,  0.00590458,  0.04794595,
        -0.00033931, -0.04129771, -0.02708526,  0.04577501,  0.0384303 ,
        -0.0343819 , -0.02140981,  0.0269854 ,  0.0224081 ,  0.01456524,
        -0.00935118,  0.02094496, -0.01622526,  0.0063672 , -0.03069818],
       [-0.04378361,  0.03708205,  0.02849576,  0.00590458,  0.04794595,
        -0.00033931, -0.04129771, -0.02708526,  0.04577501,  0.0384303 ,
        -0.0343819 , -0.02140981,  0.0269854 ,  0.0224081 ,  0.01456524,
        -0.00935118,  0.02094496, -0.01622526,  0.0063672 , -0.03069818],
       [-0.04378361,  0.03708205,  0.02849576,  0.00590458,  0.04794595,
        -0.00033931, -0.04129771, -0.02708526,  