<a href="https://colab.research.google.com/github/Pihhot/SpeechType_Advanced/blob/main/speech_type.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
from pathlib import Path
import json
import re

Dowload data from github

In [48]:
!wget https://raw.githubusercontent.com/Pihhot/SpeechType_Advanced/main/data.json

data_path = Path.cwd() / 'data.json'

with open(data_path) as f:
    data = json.load(f)

--2022-11-24 18:04:27--  https://raw.githubusercontent.com/Pihhot/SpeechType_Advanced/main/data.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9211573 (8.8M) [text/plain]
Saving to: ‘data.json.1’


2022-11-24 18:04:28 (100 MB/s) - ‘data.json.1’ saved [9211573/9211573]



Cleaning Data

In [49]:
def clear_sentence(text: str) -> str:
    # Remove punctuations and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Single character removal
    text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text)
    # Removing multiple spaces
    text = re.sub(r'\s+', ' ', text)
    return text.lower()

data_copy = data.copy()
train_sentences, train_classes = [], []
test_sentences, test_classes = [], []
skipped_s = 0
index = 0
for k in data_copy.keys():
  for sentence in data_copy[k]:
    index += 1
    # Clean sentence
    sentence = clear_sentence(sentence)
    if len(sentence):
      # Add every fifth element to test
      if index % 7 == 0:
        test_sentences.append(sentence)
        test_classes.append(k)
      else:
        train_sentences.append(sentence)
        train_classes.append(k)
    else:
      skipped_s += 1

print(f'Count of train_sentences: {len(train_sentences):<10} Count of test_sentences: {len(test_sentences)}')
print(f'Count of train_classes:   {len(train_classes):<10} Count of test_classes:   {len(test_classes):<10}')
print(f'Count of classes in train: {len(set(train_classes)):<9} Count of classes in test: {len(set(test_classes))}')
print(f'Skipped: {skipped_s}')    
print(test_sentences[:10])


Count of train_sentences: 119950     Count of test_sentences: 19993
Count of train_classes:   119950     Count of test_classes:   19993     
Count of classes in train: 43        Count of classes in test: 43
Skipped: 39
['cafeteria ', 'you know something ', 'but anyway ', 'piece for uh ', 'whatever ', 'no ', ' yeah ', 'what was the spoof on top gun ', 'all right ellen ', 'but oh boy ']


Inputs for NN

In [50]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import to_categorical, pad_sequences

from sklearn.preprocessing import LabelEncoder
import numpy as np

from keras.layers import Dense, LSTM, Input, Dropout, Embedding, Concatenate, Input
from keras.models import Sequential, Model
from keras.optimizers import Adam

Tokenizer

In [51]:
maxWordsCount = 5000
tokenizer = Tokenizer(num_words=maxWordsCount, filters='!–"—#$%&amp;()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r«»', lower=True, split=' ', char_level=False)
tokenizer.fit_on_texts(train_sentences.copy()+test_sentences.copy())
print(list(tokenizer.word_counts.items())[:10])

[('ok', 531), ('y', 17800), ('uh', 29156), ('d', 12465), ('defending', 11), ('your', 2760), ('life', 573), ('but', 11834), ('no', 1776), ('gee', 92)]


Preparing Data

In [52]:
x_train, y_train = train_sentences.copy(), train_classes.copy()
x_test, y_test = test_sentences.copy(), test_classes.copy()

# Prepare text information
max_text_len = 10

x_train, x_test = tokenizer.texts_to_sequences(x_train), tokenizer.texts_to_sequences(x_test)
x_train_pad, x_test_pad = pad_sequences(x_train, maxlen=max_text_len), pad_sequences(x_test, maxlen=max_text_len)

# Prepare classes
y_train, y_test = np.array(y_train), np.array(y_test)

l_encoder = LabelEncoder()
y_train, y_test = l_encoder.fit_transform(y_train), l_encoder.fit_transform(y_test)
y_train, y_test = to_categorical(y_train), to_categorical(y_test)

#print(x_train_pad[:5])
#print(y_train)



Create Model

In [59]:
"""input = Input()
emb = Embedding(maxWordsCount, 128, input_length = max_text_len)(input)
lstm1 = LSTM(64)(emb)

inp_2 = Embedding(maxWordsCount, 128, input_length = max_text_len)
lstm2 = LSTM(64)(inp_2)

concat_layer = Concatenate()([lstm1, lstm2])

dense = Dense(43, activation='softmax')(concat_layer)

model = Model(inputs=[inp_1, inp_2], outputs=dense)"""

#  embedding_16 (Embedding)    (None, 10, 128)           640000   

"""model = Sequential()
model.add(Embedding(maxWordsCount, 128, input_length = max_text_len))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64))
model.add(Dense(43, activation='softmax'))
model.summary()"""


input_1 = Input(shape=(max_text_len,))
embedding_1 = Embedding(maxWordsCount, 100, input_length=max_text_len)(input_1)

input_2 = Input(shape=(max_text_len,))
embedding_2 = Embedding(maxWordsCount, 100, input_length=max_text_len)(input_1)

concat = Concatenate()([embedding_1, embedding_2])


lstm = LSTM(128)(concat)
dense = Dense(43, activation='softmax')(lstm)

model = Model(inputs=[input_1, input_2], outputs=dense)

model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=Adam(0.0001))

Fit model

In [62]:
history = model.fit([x_train_pad, x_train_pad], y_train, batch_size=50, epochs=5, shuffle=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

KeyboardInterrupt: ignored