In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from nltk.corpus import stopwords

%matplotlib inline

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [3]:
col_names = ['questions', 'a', 'b']
data_df = pd.read_csv("https://raw.githubusercontent.com/VIthulan/travel-text-classification/master/data/5000TravelQuestionsDataset.csv", error_bad_lines=False,header=None, names=col_names, encoding='latin-1')

In [4]:
data_df.head()

Unnamed: 0,questions,a,b
0,What are the special things we (husband and me...,TTD,TTDSIG
1,What are the companies which organize shark fe...,TTD,TTDOTH
2,Is it safe for female traveller to go alone to...,TGU,TGUHEA
3,What are the best places around Cape Town for ...,TTD,TTDSIG
4,What are the best places to stay for a family ...,ACM,ACMOTH


# Text Preprocessing

In [5]:
stop_words = set(stopwords.words('english')) 

def remove_stopwords(text):
  word_tokens = nltk.word_tokenize(text) 
  filtered_sentence = [w for w in word_tokens if not w in stop_words] 
  return " ".join(filtered_sentence)

In [6]:
# Remove all the special characters
data_df['processed_questions'] = data_df['questions'].str.replace(r'\W', ' ')
    # remove all single characters
data_df['processed_questions'] = data_df['processed_questions'].str.replace(r'\s+[a-zA-Z]\s+', ' ')
    # Remove single characters from the start
data_df['processed_questions'] = data_df['questions'].str.replace(r'\^[a-zA-Z]\s+', ' ')
    # Substituting multiple spaces with single space
data_df['processed_questions'] = data_df['questions'].str.replace(r'\s+', ' ')
    # Removing prefixed 'b'
data_df['processed_questions'] = data_df['questions'].str.replace(r'^b\s+', '')
    # Remove leading, trailing spaces
data_df['processed_questions'] = data_df['questions'].str.strip()
# Stop word removal
data_df['sw_removed_questions'] = data_df.processed_questions.apply(remove_stopwords)

In [7]:
# Remove all the special characters
data_df['processed_a'] = data_df['a'].str.replace(r'\W', ' ')
    # remove all single characters
data_df['processed_a'] = data_df['a'].str.replace(r'\s+[a-zA-Z]\s+', ' ')
    # Remove single characters from the start
data_df['processed_a'] = data_df['a'].str.replace(r'\^[a-zA-Z]\s+', ' ')
    # Substituting multiple spaces with single space
data_df['processed_a'] = data_df['a'].str.replace(r'\s+', ' ')
    # Removing prefixed 'b'
data_df['processed_a'] = data_df['a'].str.replace(r'^b\s+', '')
    # Remove leading, trailing spaces
data_df['processed_a'] = data_df['a'].str.strip()

## Lemmatizing


In [8]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
  lem = [lemmatizer.lemmatize(w, pos="v") for w in nltk.word_tokenize(text)]
  return " ".join(lem)

In [9]:
data_df["question_lemmatized_sw"] = data_df.sw_removed_questions.apply(lemmatize_text)

In [10]:
data_df.head()

Unnamed: 0,questions,a,b,processed_questions,sw_removed_questions,processed_a,question_lemmatized_sw
0,What are the special things we (husband and me...,TTD,TTDSIG,What are the special things we (husband and me...,What special things ( husband ) 5 day stay Cap...,TTD,What special things ( husband ) 5 day stay Cap...
1,What are the companies which organize shark fe...,TTD,TTDOTH,What are the companies which organize shark fe...,What companies organize shark feeding events s...,TTD,What company organize shark feed events scuba ...
2,Is it safe for female traveller to go alone to...,TGU,TGUHEA,Is it safe for female traveller to go alone to...,Is safe female traveller go alone Cape Town ?,TGU,Is safe female traveller go alone Cape Town ?
3,What are the best places around Cape Town for ...,TTD,TTDSIG,What are the best places around Cape Town for ...,What best places around Cape Town safari ?,TTD,What best place around Cape Town safari ?
4,What are the best places to stay for a family ...,ACM,ACMOTH,What are the best places to stay for a family ...,What best places stay family stay away nightli...,ACM,What best place stay family stay away nightlife ?


In [75]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


def print_report(y_test, y_pred):
    # Classification Report
    print(classification_report(y_test,y_pred))

    acc = accuracy_score(y_test, y_pred)*100
    print('Accuracy score: %.3f' % acc)

    f1_sc =  f1_score(y_test, y_pred, average='weighted')
    print('F1 Score: %.3f' % f1_sc)
    
    cm = confusion_matrix(y_test, y_pred)
    print("Confustion matrix: \n{}".format(cm))
    return f1_sc


# Data encoding

In [52]:
Y = data_df.processed_a.values
le = LabelEncoder()
Y = le.fit_transform(Y)
Y

array([5, 5, 3, ..., 0, 5, 4])

In [26]:
max_words = 5000
max_len = 25
tok = Tokenizer(num_words=max_words, split=' ')
tok.fit_on_texts(data_df.processed_questions.values)
sequences = tok.texts_to_sequences(data_df.processed_questions.values)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [27]:
sequences_matrix.shape

(5000, 25)

In [31]:
print(sequences_matrix[0])
print(Y[0])

[   0    0    0    0    0    0    4    7    2  321  105   31 1837   17
   68    9   20   71    6  194   48   32   22  376  111]
5


# Model

In [65]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import SpatialDropout1D
def LSTM_MODEL():
    model = Sequential()
    model.add(Embedding(5000, 160, input_length=sequences_matrix.shape[1]))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(196, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(7, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [64]:
y_nn = pd.get_dummies(data_df['processed_a']).values
y_nn.shape

(5000, 7)

In [76]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


cv = KFold(n_splits=10, random_state=1, shuffle=True)
fold = 0
accuracies = []
for train_index, test_index in cv.split(sequences):
    print('***********************************************************')
    fold += 1
    X_train, X_test = sequences_matrix[train_index], sequences_matrix[test_index]
    y_train, y_test = y_nn[train_index], y_nn[test_index]
    print("Beginning fold: ", fold)

    model = LSTM_MODEL()
    model.fit(X_train, y_train, epochs=10, batch_size=64,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
    
    predictions = model.predict(X_test)

    fine_pred = [np.argmax(p) for p in predictions]
    fine_gt = [np.argmax(p) for p in y_test]
    f1 = print_report(fine_pred, fine_gt)
    accuracies.append(f1)

print("Mean {:.2f} Std {:.2f}".format(np.mean(accuracies), np.std(accuracies)))

***********************************************************
Beginning fold:  1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
              precision    recall  f1-score   support

           0       0.84      0.84      0.84        81
           1       0.68      0.53      0.60        32
           2       0.75      0.91      0.82        46
           3       0.86      0.72      0.78       127
           4       0.92      0.86      0.89       102
           5       0.70      0.84      0.76        99
           6       0.72      1.00      0.84        13

    accuracy                           0.80       500
   macro avg       0.78      0.81      0.79       500
weighted avg       0.81      0.80      0.80       500

Accuracy score: 80.400
F1 Score: 0.803
Confustion matrix: 
[[68  0  3  4  0  5  1]
 [ 1 17  3  1  0  9  1]
 [ 2  1 42  1  0  0  0]
 [ 8  1  4 91  6 16  1]
 [ 1  2  1  5 88  5  0]
 [ 1  4  3  4  2 83  2]
 [ 0  0  0  0  0  0 13]]
******************************

In [78]:
# max_words = 5000
# max_len = 25
# tok = Tokenizer(num_words=max_words, split=' ')
# tok.fit_on_texts(data_df.processed_questions.values)
# sequences = tok.texts_to_sequences(data_df.processed_questions.values)
# sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

tok1 = Tokenizer(num_words=max_words, split=' ')
tok1.fit_on_texts(data_df.question_lemmatized_sw.values)
sequences_lem_sw = tok1.texts_to_sequences(data_df.question_lemmatized_sw.values)
sequences_matrix_lem_sw = sequence.pad_sequences(sequences_lem_sw,maxlen=max_len)

In [79]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


cv = KFold(n_splits=10, random_state=1, shuffle=True)
fold = 0
accuracies = []
for train_index, test_index in cv.split(sequences_lem_sw):
    print('***********************************************************')
    fold += 1
    X_train, X_test = sequences_matrix_lem_sw[train_index], sequences_matrix_lem_sw[test_index]
    y_train, y_test = y_nn[train_index], y_nn[test_index]
    print("Beginning fold: ", fold)

    model = LSTM_MODEL()
    model.fit(X_train, y_train, epochs=10, batch_size=64,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
    predictions = model.predict(X_test)
    fine_pred = [np.argmax(p) for p in predictions]
    fine_gt = [np.argmax(p) for p in y_test]
    f1 = print_report(fine_pred, fine_gt)
    accuracies.append(f1)

print("Mean {:.2f} Std {:.2f}".format(np.mean(accuracies), np.std(accuracies)))

***********************************************************
Beginning fold:  1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
              precision    recall  f1-score   support

           0       0.89      0.87      0.88        83
           1       0.72      0.67      0.69        27
           2       0.84      0.94      0.89        50
           3       0.84      0.81      0.82       110
           4       0.94      0.81      0.87       111
           5       0.76      0.87      0.81       103
           6       0.78      0.88      0.82        16

    accuracy                           0.84       500
   macro avg       0.82      0.83      0.83       500
weighted avg       0.85      0.84      0.84       500

Accuracy score: 84.000
F1 Score: 0.840
Confustion matrix: 
[[72  0  2  5  0  4  0]
 [ 0 18  2  0  0  7  0]
 [ 1  2 47  0  0  0  0]
 [ 2  1  3 89  3 11  1]
 [ 3  3  1  8 90  5  1]
 [ 3  1  1  3  3 90  2]
 [ 0  0  0  1  0  1 14]]
********

# Summary

I was able to get average of 0.80 F1 score with unprocessed questions. The F1 score is increased by 1% after training the LSTM model with processed and lemmatized questions.  

