In [1]:
from keras.preprocessing.text import Tokenizer

# Three toy sentences; the stray comma and the exclamation mark are there to
# show that punctuation does not end up in the vocabulary (see printed index).
sentences = ['I love my dog', 'I, love my cat', 'You love my dog!']

# Build the word -> integer index from the toy corpus.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
print(word_index)

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}


In [2]:
from keras.preprocessing.sequence import pad_sequences

# Sentences of different lengths so that padding and truncation are both visible.
sentences = [
    'I love my dog', 'I love my cat',
    'You love my dog!', 'Do you think my dog is amazing?',
]

# Reserve an explicit out-of-vocabulary token in the word index.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode each sentence as a list of word ids, then force every row to length 5.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=5)

print(word_index, sequences, padded, sep='\n')

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]
[[ 0  5  3  2  4]
 [ 0  5  3  2  7]
 [ 0  6  3  2  4]
 [ 9  2  4 10 11]]


In [3]:
# Sentences containing words the tokenizer was not fitted on.
test_data = ['i really love my dog', 'my dog loves my manatee']

# Reuse the tokenizer fitted in the previous cell, then pad every row to length 10.
test_sequence = tokenizer.texts_to_sequences(test_data)
padded_test = pad_sequences(test_sequence, maxlen=10)

print(test_sequence, padded_test, sep='\n')

[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]
[[0 0 0 0 0 5 1 3 2 4]
 [0 0 0 0 0 2 4 1 2 1]]


In [4]:
!gdown --id 1dRzdJS7-cZS4S5CuUc32MZYSLJGkkxnp

import json 

# Load the sarcasm dataset. JSON text is UTF-8, so decode it explicitly rather
# than relying on the platform default encoding (e.g. cp1252 on Windows).
with open('sarcasm.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Split each record into three parallel lists, one entry per record.
sentences = [row['headline'] for row in data]
labels = [row['is_sarcastic'] for row in data]
urls = [row['article_link'] for row in data]

# Fit the vocabulary on every headline and encode them as integer sequences.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)

# padding='post' appends the zeros after the tokens instead of before them.
padded = pad_sequences(sequences, padding='post')

print(len(word_index))
print(sequences[1])
print(padded[1])
print(padded.shape)

Downloading...
From: https://drive.google.com/uc?id=1dRzdJS7-cZS4S5CuUc32MZYSLJGkkxnp
To: D:\projects\Tensorflow Certification\sarcasm.json

0.00B [00:00, ?B/s]
524kB [00:01, 364kB/s]
1.05MB [00:01, 799kB/s]
1.57MB [00:02, 863kB/s]
2.10MB [00:02, 1.14MB/s]
2.62MB [00:02, 1.24MB/s]
3.15MB [00:02, 1.54MB/s]
3.67MB [00:03, 1.51MB/s]
4.19MB [00:03, 1.95MB/s]
4.72MB [00:03, 2.10MB/s]
5.24MB [00:03, 2.09MB/s]
5.64MB [00:03, 2.12MB/s]
5.64MB [00:03, 1.42MB/s]


29657
[4, 8435, 3338, 2746, 22, 2, 166, 8436, 416, 3112, 6, 258, 9, 1002]
[   4 8435 3338 2746   22    2  166 8436  416 3112    6  258    9 1002
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
(26709, 40)


In [9]:
!gdown --id 1rX10xeI3eUJmOLsc4pOPY6AnCLO8DxNj

import csv

Downloading...
From: https://drive.google.com/uc?id=1rX10xeI3eUJmOLsc4pOPY6AnCLO8DxNj
To: D:\projects\Tensorflow Certification\bbc-text.csv

0.00B [00:00, ?B/s]
524kB [00:00, 1.29MB/s]
1.05MB [00:00, 1.40MB/s]
1.57MB [00:01, 1.59MB/s]
2.10MB [00:01, 1.87MB/s]
2.62MB [00:01, 2.12MB/s]
3.15MB [00:01, 2.36MB/s]
3.67MB [00:01, 2.48MB/s]
4.19MB [00:01, 2.79MB/s]
4.72MB [00:02, 2.89MB/s]
5.06MB [00:02, 2.75MB/s]
5.06MB [00:02, 2.26MB/s]


In [10]:
# Lower-case English stopwords, kept in alphabetical order (one row per initial
# letter). The order is preserved because consumers iterate over the list.
stopwords = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
    "could",
    "did", "do", "does", "doing", "down", "during",
    "each",
    "few", "for", "from", "further",
    "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself",
    "him", "himself", "his", "how", "how's",
    "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself",
    "let's",
    "me", "more", "most", "my", "myself",
    "nor",
    "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own",
    "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such",
    "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's",
    "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too",
    "under", "until", "up",
    "very",
    "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where",
    "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would",
    "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves",
]

In [27]:
sentences = []
labels = []

# newline='' lets the csv module do its own line-ending handling (as the csv
# documentation requires), and the explicit encoding avoids depending on the
# platform default (e.g. cp1252 on Windows).
with open('bbc-text.csv', 'r', newline='', encoding='utf-8') as f:
    data = csv.reader(f, delimiter=',')
    next(data, None)  # skip the first row (presumably the column header)
    for row in data:
        if not row:
            # csv.reader yields [] for blank lines; indexing them would raise.
            continue
        labels.append(row[0])
        sentence = row[1]
        # Remove every stopword that is surrounded by single spaces.
        # NOTE: a stopword at the very start or end of the text has no space on
        # one side and is therefore left in place by this approach.
        for word in stopwords:
            token = " " + word + " "
            sentence = sentence.replace(token, ' ')
            # Collapse double spaces so the next stopword can still match.
            sentence = sentence.replace('  ', ' ')
        sentences.append(sentence)

print(len(sentences))


2225


In [28]:
# Fit a fresh vocabulary on the stopword-filtered texts.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode the texts and append zeros after the tokens (padding='post').
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')

# Vocabulary size, first encoded row, and the shape of the padded matrix.
print(len(word_index), padded[0], padded.shape, sep='\n')

29714
[  96  176 1158 ...    0    0    0]
(2225, 2442)


In [30]:
# A separate tokenizer gives every distinct label string its own integer id.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

# Encode the labels first, then expose the label -> id mapping for inspection.
label_sequences = label_tokenizer.texts_to_sequences(labels)
label_word_index = label_tokenizer.word_index

print(label_word_index)

{'sport': 1, 'business': 2, 'politics': 3, 'tech': 4, 'entertainment': 5}
