## ***Getting started with tokenizing words***

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Three short example sentences; the "@" and "!" characters are part of the raw text.
corpus = [
    "I love my @dog",
    "I love my cat",
    "You love my dog!",
]

In [None]:
# Create a Keras Tokenizer, passing num_words=100.
tokenizer = Tokenizer(num_words=100)

In [None]:
# Fit the tokenizer on the example corpus so it builds its vocabulary.
tokenizer.fit_on_texts(corpus)

In [None]:
# Word -> integer index mapping produced by fitting.
word_index = tokenizer.word_index

In [None]:
# Show the mapping; in the output the words appear lower-cased and without "@" or "!".
print(word_index)

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}


## ***Exploring how sentences are transformed into sequences of tokens.***

In [None]:
# Encode every sentence of the corpus as a list of word indices.
sentences = tokenizer.texts_to_sequences(corpus)

In [None]:
# One list of integers per input sentence.
print(sentences)

[[3, 1, 2, 4], [3, 1, 2, 5], [6, 1, 2, 4]]


## ***Now let's see what we get when we tokenize unseen data.***

In [None]:
# Sentences containing words ("is", "really", "good", "more", "than") that are
# absent from the corpus the tokenizer was fitted on.
test_corpus = [
    "my dog is really good",
    "i love my cat more than my dog",
]
test_sentences = tokenizer.texts_to_sequences(test_corpus)

In [None]:
# Unseen words are dropped from the result: the first sentence keeps only "my" and "dog".
print(test_sentences)

[[2, 4], [3, 1, 2, 5, 2, 4]]


## ***Let us see how we can use the OOV token to stand in for words that the tokenizer hasn't seen before.***

In [None]:
# Same setup as before, plus an explicit out-of-vocabulary placeholder token.
tokenizer_with_oov = Tokenizer(num_words=100, oov_token="<OOV>")

In [None]:
# Fit the OOV-aware tokenizer on the same example corpus.
tokenizer_with_oov.fit_on_texts(corpus)

In [None]:
# In the output "<OOV>" takes index 1 and every real word moves up by one.
print(tokenizer_with_oov.word_index)

{'<OOV>': 1, 'love': 2, 'my': 3, 'i': 4, 'dog': 5, 'cat': 6, 'you': 7}


In [None]:
# Encode the unseen-word sentences with the OOV-aware tokenizer.
test_sentences_with_oov = tokenizer_with_oov.texts_to_sequences(test_corpus)

In [None]:
# Unknown words now show up as index 1 ("<OOV>") instead of disappearing.
print(test_sentences_with_oov)

[[3, 5, 1, 1, 1], [4, 2, 3, 6, 1, 1, 3, 5]]


## ***Now let us address the problem of sentences having different lengths, which results in token sequences of unequal length.***

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Pad the sequences to a common length; padding='post' puts the padding at the end.
padded_sentences = pad_sequences(test_sentences_with_oov, padding='post')

In [None]:
# The shorter sequence is filled with trailing zeros up to the longer one's length.
print(padded_sentences)

[[3 5 1 1 1 0 0 0]
 [4 2 3 6 1 1 3 5]]


## ***Let's try to apply the functions we have just learnt to a sample dataset.***

### ***Data Gathering***

In [None]:
!wget https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json

--2023-04-15 16:40:51--  https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.16.128, 142.251.163.128, 142.251.167.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.16.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘sarcasm.json.1’


2023-04-15 16:40:51 (205 MB/s) - ‘sarcasm.json.1’ saved [5643545/5643545]



In [None]:
import json

# Parse the downloaded JSON file into a Python object.
with open("./sarcasm.json", 'r') as sarcasm_file:
    datastore = json.load(sarcasm_file)

In [None]:
# Inspect the first record: it has 'article_link', 'headline' and 'is_sarcastic' keys.
datastore[0]

{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
 'is_sarcastic': 0}

In [None]:
# Fresh, independent accumulators for headlines, article links and sarcasm labels.
sentences, article, label = [], [], []

In [None]:
# Split every record into three parallel lists (same order as the datastore).
for record in datastore:
    sentences.append(record['headline'])
    label.append(record['is_sarcastic'])
    article.append(record['article_link'])

### ***Text Preprocessing***

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# New tokenizer for the headlines; no num_words is passed here, only the OOV token.
tokenizer = Tokenizer(oov_token="<OOV>")

In [None]:
# Build the vocabulary from all collected headlines.
tokenizer.fit_on_texts(sentences)

In [None]:
# Encode every headline as a list of word indices.
sequences = tokenizer.texts_to_sequences(sentences)

In [None]:
# Bring every headline to exactly 40 tokens, padding with zeros at the end.
# NOTE(review): headlines longer than 40 tokens get truncated; the truncation side is
# left at the library default here — confirm that is the intended side.
padded_sequences = pad_sequences(sequences, padding='post', maxlen=40)

In [None]:
# First padded headline: its word indices followed by zeros up to length 40.
padded_sequences[0]

array([  308, 15115,   679,  3337,  2298,    48,   382,  2576, 15116,
           6,  2577,  8434,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0], dtype=int32)

## ***Exploring the BBC Dataset.***

In [None]:
import csv

In [None]:
 with open("/content/BBC News Train.csv" , 'r') as csvfile:
     print(csvfile.readline())
     print(csvfile.readline())

ArticleId,Text,Category




In [None]:
# Fresh, independent accumulators for the article texts and their categories.
sentences, labels = [], []

In [None]:
# Read the whole CSV: column 1 holds the article text, column 2 its category
# (header: ArticleId,Text,Category).
with open("/content/BBC News Train.csv", 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)  # skip the header row
    for row in reader:
        sentences.append(row[1])
        labels.append(row[2])

### ***Removing the stopwords.***

In [None]:
# Lower-case stop words to strip from every article before tokenizing.
stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

# A set gives O(1) membership checks; `stopwords` itself stays a list.
stopword_set = set(stopwords)

# str.lower() returns a NEW string (strings are immutable), so its result has to be
# used. Previously the lowered text was discarded and capitalised stop words such as
# "The" were never removed. Splitting and joining on a single space is kept as before.
sentences = [
    " ".join(
        word for word in sentence.lower().split(" ") if word not in stopword_set
    )
    for sentence in sentences
]

### ***Sentence and Label Tokenization***

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Tokenizer for the article texts, with an explicit out-of-vocabulary token,
# fitted on the stop-word-filtered sentences.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

In [None]:
# Replace the text with integer sequences. `sentences` is rebound here, so re-running
# this cell on its own would pass sequences (not text) back into the tokenizer.
sentences = tokenizer.texts_to_sequences(sentences)

In [None]:
# Pad the article sequences at the end; no maxlen is given in this call.
sentences = pad_sequences(sentences, padding='post')

In [None]:
# Separate tokenizer that turns the category names into integer ids.
# NOTE(review): with oov_token set, "<OOV>" presumably takes index 1 (as in the
# word_index printed earlier in this notebook), so the real categories would start
# at 2 — confirm this offset is intended for the labels.
label_tokenizer = Tokenizer(oov_token="<OOV>")
label_tokenizer.fit_on_texts(labels)

In [None]:
# Convert each category name to its integer sequence. `labels` is rebound here, so
# re-running this cell on its own would pass sequences (not text) back in.
labels = label_tokenizer.texts_to_sequences(labels)