<a href="https://colab.research.google.com/github/Pihhot/SpeechType_Advanced/blob/main/speech_type.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports for NN

In [None]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import to_categorical, pad_sequences

from sklearn.preprocessing import LabelEncoder
import numpy as np

from keras.layers import Dense, LSTM, Input, Dropout, Embedding, Concatenate, Input
from keras.models import Sequential, Model
from keras.optimizers import Adam

Tokenizer

In [None]:
# Vocabulary size handed to the tokenizer; the embedding layers of the model
# use the same value as their input dimension.
maxWordsCount = 5000
# Characters that are stripped from the text before it is split on spaces.
# The ampersand must be a bare '&': the HTML entity '&amp;' would also put the
# letters 'a', 'm' and 'p' into the filter set and cut words apart
# (e.g. 'okay' -> 'ok', 'y').
tokenizer = Tokenizer(num_words=maxWordsCount, filters='!–"—#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r«»', lower=True, split=' ', char_level=False)
# `+` already builds a new object, so explicit copies of the operands are not needed.
# NOTE(review): the vocabulary is fitted on train and test sentences together.
tokenizer.fit_on_texts(train_sentences + test_sentences)
print(list(tokenizer.word_counts.items())[:10])

[('ok', 531), ('y', 17912), ('uh', 29156), ('d', 13702), ('defending', 11), ('your', 2760), ('life', 573), ('but', 11834), ('no', 1776), ('gee', 92)]


Preparing Data

In [None]:
# Work on copies so the source sentence/label collections stay untouched.
x_train, y_train = train_sentences.copy(), train_classes.copy()
x_test, y_test = test_sentences.copy(), test_classes.copy()

# Prepare text information
max_text_len = 10

# Texts -> integer sequences -> fixed-length arrays of max_text_len ids
x_train, x_test = tokenizer.texts_to_sequences(x_train), tokenizer.texts_to_sequences(x_test)
x_train_pad, x_test_pad = pad_sequences(x_train, maxlen=max_text_len), pad_sequences(x_test, maxlen=max_text_len)

# Prepare classes
y_train, y_test = np.array(y_train), np.array(y_test)

# Fit the encoder ONCE, on every label that occurs in either split, so that a
# class name maps to the same integer id in train and test. Calling
# fit_transform separately on each split can silently assign different ids.
l_encoder = LabelEncoder()
l_encoder.fit(np.concatenate([y_train, y_test]))
num_classes = len(l_encoder.classes_)
y_train, y_test = l_encoder.transform(y_train), l_encoder.transform(y_test)

# Pass num_classes explicitly so both one-hot matrices have the same width even
# when a split does not contain the highest class id.
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)



Create Model

In [None]:
# Branch 1: token ids -> embedding -> LSTM summary vector
input_1 = Input(shape=(max_text_len,))
embedding_1 = Embedding(maxWordsCount, 100, input_length=max_text_len)(input_1)
lstm_1 = LSTM(64)(embedding_1)

# Branch 2: same layout, but it must read from its OWN input. Wiring it to
# input_1 leaves input_2 disconnected from the graph.
input_2 = Input(shape=(max_text_len,))
embedding_2 = Embedding(maxWordsCount, 100, input_length=max_text_len)(input_2)
lstm_2 = LSTM(64)(embedding_2)

concat = Concatenate()([lstm_1, lstm_2])

# One softmax unit per class. The width is taken from the one-hot encoded
# training labels instead of a hard-coded class count.
num_classes = y_train.shape[1]
dense = Dense(num_classes, activation='softmax')(concat)

model = Model(inputs=[input_1, input_2], outputs=dense)

model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=Adam(0.0001))

Fit model

In [None]:
# Both model inputs are fed the same padded token sequences.
train_inputs = [x_train_pad, x_train_pad]
history = model.fit(
    train_inputs,
    y_train,
    batch_size=50,
    epochs=5,
    shuffle=True,
)

Epoch 1/5
Epoch 2/5

KeyboardInterrupt: ignored


*   Download RAW data from git
*   Process RAW data
*   Split into train & test





In [None]:
import pandas as pd
import spacy
!pip install contractions
import contractions
from pathlib import Path
import json
import re
#####################################################################################
!wget https://raw.githubusercontent.com/Pihhot/SpeechType_Advanced/main/data.json

data_path = Path.cwd() / 'data.json'

with open(data_path) as f:
    data = json.load(f)
######################################################################################    

def sent_lemmas(doc) -> str:
    """Return the `lemma_` of every token in `doc`, joined by single spaces.

    `doc` is any iterable of tokens exposing a `lemma_` attribute
    (the notebook passes the result of `nlp(sentence)`).
    """
    lemmas = [token.lemma_ for token in doc]
    return ' '.join(lemmas)

def sent_deps(doc) -> str:
    """Return the `dep_` of every token in `doc`, joined by single spaces.

    `doc` is any iterable of tokens exposing a `dep_` attribute
    (the notebook passes the result of `nlp(sentence)`).
    """
    dep_labels = [token.dep_ for token in doc]
    return ' '.join(dep_labels)

def sent_poss(doc) -> str:
    """Return the `pos_` of every token in `doc`, joined by single spaces.

    `doc` is any iterable of tokens exposing a `pos_` attribute
    (the notebook passes the result of `nlp(sentence)`).
    """
    pos_tags = [token.pos_ for token in doc]
    return ' '.join(pos_tags)

def clear_sentence(text: str) -> str:
    """Normalize a sentence to lower-case ASCII words separated by single spaces.

    Steps:
      1. every character that is not an ASCII letter becomes a space;
      2. single-letter words are dropped, wherever they stand (including the
         start/end of the string and runs such as "a b c");
      3. whitespace runs are collapsed and the result is trimmed.

    Returns an empty string when nothing but punctuation, digits, whitespace
    or single letters was present, so callers can test the result for
    emptiness directly.
    """
    # Remove punctuations and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Single character removal. Only letters and spaces are left at this point,
    # so word boundaries catch isolated letters at any position; requiring
    # whitespace on both sides would miss string edges and adjacent letters.
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)
    # Removing multiple spaces, plus the leading/trailing one
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

def expand_contractions(text: str) -> str:
    """Run `contractions.fix` on each space-separated piece of `text`.

    The text is split on single spaces, every piece is passed through
    `contractions.fix` on its own, and the results are joined back with
    single spaces.
    """
    pieces = text.split(' ')
    return ' '.join(contractions.fix(piece) for piece in pieces)

# Process the raw {class: [sentences]} mapping: clean every sentence, derive its
# lemma / dependency / part-of-speech strings with spaCy, split the rows into
# train and test sets and write both sets to disk.
data_c = data.copy()

nlp = spacy.load("en_core_web_sm")

# Every TEST_EVERY_N-th sentence goes to the test set, all others to train.
TEST_EVERY_N = 7

train_sentences_or, train_sentences_m, train_deps, train_pos, train_class = [], [], [], [], []
test_sentences_or, test_sentences_m, test_deps, test_pos, test_class = [], [], [], [], []


index = 0
for class_ in data.keys():
  for sentence in data[class_]:
    # Counted before the emptiness check, so skipped sentences still advance it.
    index += 1
    # Save original sentences
    sentence_or = sentence
    # Beautify sentence; strip so a whitespace-only result counts as empty
    sentence = expand_contractions(sentence)
    sentence = clear_sentence(sentence).strip()
    if len(sentence):
      # Sentence representations
      doc = nlp(sentence)
      deps_str = sent_deps(doc)
      poss_str = sent_poss(doc)
      lemmas_str = sent_lemmas(doc)
      # Add every TEST_EVERY_N-th element to test, the rest to train
      if index % TEST_EVERY_N == 0:
        test_sentences_or.append(sentence_or)
        test_sentences_m.append(lemmas_str)
        test_deps.append(deps_str)
        test_pos.append(poss_str)
        test_class.append(class_)
      else:
        train_sentences_or.append(sentence_or)
        train_sentences_m.append(lemmas_str)
        train_deps.append(deps_str)
        train_pos.append(poss_str)
        train_class.append(class_)



train_df_data = {'ORIGIN':train_sentences_or,
                 'MODIFIED':train_sentences_m,
                 'DEPS':train_deps,
                 'POSS':train_pos,
                 'TYPE':train_class}

test_df_data = {'ORIGIN':test_sentences_or,
                 'MODIFIED':test_sentences_m,
                 'DEPS':test_deps,
                 'POSS':test_pos,
                 'TYPE':test_class}

train_df = pd.DataFrame(train_df_data)
test_df = pd.DataFrame(test_df_data)

# Written as ':'-separated text files without the index column.
test_df.to_csv('test_data', index=False, sep=':')
train_df.to_csv('train_data', index=False, sep=':')

{'ORIGIN': ['okay , uh ,',
  'd- , defending your life .',
  'but , no',
  'gee ,',
  'women ?',
  'well give me a break , you know .',
  'is , uh , kind of fun .',
  'but , no ,',
  "i know that it 's a real problem .",
  "or what 's the specialty that you 're looking for ?",
  'and you pay the tax based on how much you polluted ?',
  'you know , how that goes .',
  'oh well ,',
  'the first thing ,',
  '-kay , dave .',
  'ok .',
  'let alone other countries ,',
  'colorado springs .',
  'okay , lee .',
  'set .',
  'there i go .',
  'but , no .',
  '-- um , especially ,',
  'well , you know the old sayings about fords , what they stand for .',
  'you know what i mean ?',
  "i 'm sorry ,",
  'real excited when i heard this .',
  "well , that 's the thing , you know ,",
  'okay , ron .',
  "i mean , i 'm not kidding you .",
  'all right , um ,',
  'well , okay .',
  "that 's life ,",
  'before they started mumblex ,',
  'okay jerry .',
  'i tell you what .',
  'way .',
  'but , uh , oh