In [5]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import nltk
#import gensim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split

## Loading Intent DataSet

In [6]:
intent_dataset = pd.read_csv('snipsdataset.csv')
print(intent_dataset['intents'].unique())
print(len(intent_dataset['intents'].unique()))
lm_dataset = pd.DataFrame()
lm_dataset[['text']] = intent_dataset[['text']]

['AddToPlaylist' 'BookRestaurant' 'GetWeather' 'PlayMusic' 'RateBook'
 'SearchCreativeWork' 'SearchScreeningEvent']
7


In [7]:
intent_dataset.head()

Unnamed: 0,text,intents
0,Add another song to the Cita RomГЎntica playli...,AddToPlaylist
1,add clem burke in my playlist Pre-Party R&B Jams,AddToPlaylist
2,Add Live from Aragon Ballroom to Trapeo,AddToPlaylist
3,add Unite and Win to my night out,AddToPlaylist
4,Add track to my Digster Future Hits,AddToPlaylist


## Split DataSet in Training Set / Testing Set in 80% / 20% (can be changed)

Machine Learning is all about generalizing so that your inference is right on data which is unseen. 

If you don't split it, you would be training on the entire data available to you. 

Though you may now have inferences that are right on this data. 

But you don't know for sure, how good would your inference be on unseen data. 

In [8]:
intent_train_dataset, intent_test_dataset = train_test_split(intent_dataset, test_size=0.2, random_state = 1)


In [9]:
print("Test Dataset : " ,len(intent_test_dataset))
print("Train Dataset : ", len(intent_train_dataset))
print("Labels : ", len(intent_train_dataset.intents.unique()))

Test Dataset :  3177
Train Dataset :  12707
Labels :  7


# Custom DNN for Text Classification (with TFIDF embeddings)

Load Keras Library

In [10]:
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, LSTM, Embedding
from sklearn.preprocessing import LabelBinarizer

2022-10-19 15:19:20.381225: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-19 15:19:25.563026: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-19 15:19:25.563148: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-10-19 15:19:26.002546: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-19 15:19:37.867575: W tensorflow/stream_executor/platform/de

Check if Keras uses GPU

In [11]:
from keras import backend as K

Load Sentences from pandas

In [12]:
train_sentences = intent_train_dataset['text'].values
test_sentences = intent_test_dataset['text'].values
train_labels = intent_train_dataset['intents'].values
test_labels = intent_test_dataset['intents'].values

Parameters for the model

In [13]:
num_labels = 7
vocab_size = 15000 
batch_size = 1000
top_words = 15000
embedding_vector_length = 300

Tokenize using TFIDF

In [14]:
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)
x_train = tokenizer.texts_to_matrix(train_sentences, mode='tfidf')
x_test = tokenizer.texts_to_matrix(test_sentences, mode='tfidf')

In [15]:
x_train[9].shape

(15000,)

Vector example

In [16]:
print(train_sentences[99])

find information about the album Flipper City


In [17]:
print(x_train[99][:100])

[0.         1.15686933 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         2.42236021 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         3.34796974 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.        ]


Encode the Labels

In [18]:
encoder = LabelBinarizer()
encoder.fit(train_labels)
y_train = encoder.transform(train_labels)
y_test = encoder.transform(test_labels)

Build the Model

The neural network is created by stacking layers — this requires two main architectural decisions:

How many layers to use in the model?

How many hidden units to use for each layer?


If a model has more hidden units (a higher-dimensional representation space), and/or more layers, then the network can learn more complex representations. 

However, it makes the network more computationally expensive and may lead to learning unwanted patterns — patterns that improve performance on training data but not on the test data. 

In [20]:
model = Sequential()
model.add(Dense(50, input_shape=(vocab_size,)))
model.add(Activation('relu'))
model.add(Dense(num_labels))
model.add(Activation('softmax'))
model.summary()
 
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
 
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 50)                750050    
                                                                 
 activation_2 (Activation)   (None, 50)                0         
                                                                 
 dense_3 (Dense)             (None, 7)                 357       
                                                                 
 activation_3 (Activation)   (None, 7)                 0         
                                                                 
Total params: 750,407
Trainable params: 750,407
Non-trainable params: 0
_________________________________________________________________


In [None]:
print(history.history.keys())
#  "Accuracy"
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()
# "Loss"
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

dict_keys(['val_loss', 'val_acc', 'loss', 'acc'])


KeyError: 'accuracy'

Test the model

In [18]:
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
 
print('Test accuracy:', score[1])
 
text_labels = encoder.classes_

Result = []
for i in range(len(intent_test_dataset)):
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    Result.append(predicted_label)
    
# Append the list of results to the test dataframe
pd.options.mode.chained_assignment = None  # default='warn'
intent_test_dataset['result_CustomRNN'] = Result


Test accuracy: 0.9723009103155752


# Custom CNN for Text Classification (with Learned embeddings)

Load Keras Library

In [20]:
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, Model
from keras.layers import Activation, Concatenate, Dense, Conv2D, Reshape, MaxPool2D, Dropout, LSTM, Embedding, Bidirectional, GlobalAveragePooling1D, CuDNNLSTM, Input, Multiply, TimeDistributed, multiply, Flatten, RepeatVector, Permute, Lambda
from keras.optimizers import Adamax, Adam
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split

Check if Keras uses GPU

In [21]:
from keras import backend as K
#K.tensorflow_backend._get_available_gpus()

Parameters for the model

In [22]:
num_labels = 7
vocab_size = 15000
batch_size = 1000
top_words = 15000
embedding_vector_length = 300
maxlen = 30

## for CNN
filter_sizes = [2,4,6]
num_filters = 20
drop = 0.5


Tokenize

In [23]:
#import snowballstemmer
#SnowballStemmer
import nltk
pd.options.mode.chained_assignment = None  # default='warn'

stemmer = SnowballStemmer
stop = nltk.corpus.stopwords.words('english')
#stop.extend(lowfreq)
toktok = nltk.tokenize.toktok.ToktokTokenizer()

intent_train_dataset['cleaned'] = intent_train_dataset['text']
intent_train_dataset['cleaned'].replace('[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~’”“′‘\\\]',' ',inplace=True,regex=True)
intent_train_dataset['cleaned'] = intent_train_dataset['cleaned'].str.lower()
intent_train_dataset['cleaned'] = intent_train_dataset['cleaned'].apply(toktok.tokenize)
intent_train_dataset['cleaned'] = intent_train_dataset['cleaned'].apply(lambda x: [word for word in x if word not in stop])
intent_train_dataset['cleaned'] = intent_train_dataset['cleaned'].apply(lambda x: " ".join(x))

intent_test_dataset['cleaned'] = intent_test_dataset['text']
intent_test_dataset['cleaned'].replace('[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~’”“′‘\\\]',' ',inplace=True,regex=True)
intent_test_dataset['cleaned'] = intent_test_dataset['cleaned'].str.lower()
intent_test_dataset['cleaned'] = intent_test_dataset['cleaned'].apply(toktok.tokenize)
intent_test_dataset['cleaned'] = intent_test_dataset['cleaned'].apply(lambda x: [word for word in x if word not in stop])
intent_test_dataset['cleaned'] = intent_test_dataset['cleaned'].apply(lambda x: " ".join(x))

Load Sentences from pandas

In [24]:
train_sentences = intent_train_dataset['cleaned'].values
test_sentences = intent_test_dataset['cleaned'].values
train_labels = intent_train_dataset['intents'].values
test_labels = intent_test_dataset['intents'].values

In [25]:
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)
x_train = tokenizer.texts_to_sequences(train_sentences)
x_test = tokenizer.texts_to_sequences(test_sentences)
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
x_train.shape

(12707, 30)

Encode the Labels

In [26]:
encoder = LabelBinarizer()
encoder.fit(train_labels)
y_train = encoder.transform(train_labels)
y_test = encoder.transform(test_labels)

Build the Model

In [27]:
Inputs_W = Input(shape=(maxlen,))
Inputs_E = Embedding(vocab_size, embedding_vector_length, input_length=maxlen)(Inputs_W)
reshape = Reshape((maxlen,embedding_vector_length,1))(Inputs_E)

conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_vector_length), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_vector_length), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_vector_length), padding='valid', kernel_initializer='normal', activation='relu')(reshape)

maxpool_0 = MaxPool2D(pool_size=(maxlen - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(maxlen - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(maxlen - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
Dense_Label = Dense(num_labels)(dropout)
Classifier = Activation('softmax')(Dense_Label)

model = Model(input=Inputs_W, output=Classifier)
model.summary()

optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
    
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
 
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=25,
                    verbose=1,
                    validation_split=0.2)



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 30, 300)      4500000     input_1[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 30, 300, 1)   0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 29, 1, 20)    12020       reshape_1[0][0]                  
__________________________________________________________________________________________________
conv2d_2 (

Test the model

In [28]:
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
 
print('Test accuracy:', score[1])
 
text_labels = encoder.classes_

Result = []
for i in range(len(intent_test_dataset)):
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    Result.append(predicted_label)
    
# Append the list of results to the test dataframe
pd.options.mode.chained_assignment = None  # default='warn'
intent_test_dataset['result_CustomRNN'] = Result


Test accuracy: 0.9748190112897425


In [30]:
print ( intent_test_dataset["text"][:5])

print (Result[:5])

12710    Are any animated movies playing at Magic Johns...
1242     I would like you to add now the hits of winter...
2811     book a highly rated restaurant in Central Afri...
14723     I would like to hear something from Groove Shark
1826        Add Grey Cloudy Lies to the hip hop playlist. 
Name: text, dtype: object
['SearchScreeningEvent', 'AddToPlaylist', 'BookRestaurant', 'PlayMusic', 'AddToPlaylist']


Confusion Matrix

# Custom LSTM for Text Classification (with Learned Vector embeddings)

Load Keras Library

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, LSTM, Embedding, Bidirectional, GlobalAveragePooling1D, RNN
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelBinarizer

Check if Keras uses GPU

In [None]:
from keras import backend as K
#K.tensorflow_backend._get_available_gpus()

Load Sentences from pandas

In [None]:
train_sentences = intent_train_dataset['text'].values
test_sentences = intent_test_dataset['text'].values
train_labels = intent_train_dataset['intents'].values
test_labels = intent_test_dataset['intents'].values

Parameters for the model

In [None]:
num_labels = 7
vocab_size = 20000
batch_size = 100
lstm_size = 256
maxlen = 50
embedding_vector_length = 256

Tokenize

In [None]:
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)
x_train = tokenizer.texts_to_sequences(train_sentences)
x_test = tokenizer.texts_to_sequences(test_sentences)
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

Encode the Labels

In [None]:
encoder = LabelBinarizer()
encoder.fit(train_labels)
y_train = encoder.transform(train_labels)
y_test = encoder.transform(test_labels)

Build the Model

In [None]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length, input_length=maxlen))
model.add(Bidirectional(LSTM(lstm_size, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(num_labels))
model.add(Activation('softmax'))
model.summary()
 
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
 
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1)

Test the model

In [None]:
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
 
print('Test accuracy:', score[1])
 
text_labels = encoder.classes_

Result = []
for i in range(len(intent_test_dataset)):
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    Result.append(predicted_label)
    
# Append the list of results to the test dataframe
pd.options.mode.chained_assignment = None  # default='warn'
intent_test_dataset['result_CustomLSTM'] = Result
