In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.models import Sequential
from keras import layers
from keras.wrappers.scikit_learn import KerasClassifier

In [2]:
import contractions
import spacy
from nltk.corpus import stopwords

In [3]:
train_data = pd.read_csv("banking_data/train.csv")
train_data['dataset'] = "train"
test_data = pd.read_csv("banking_data/test.csv")
test_data['dataset'] = "test"

In [4]:
data_df = pd.concat([train_data, test_data])
data_df.shape

(13083, 3)

In [5]:
data_df.columns

Index(['text', 'category', 'dataset'], dtype='object')

In [6]:
## Cleaning the Text data
#transform into the lower case
data_df['text'] =  data_df['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))

#expanding contractions using contractions library
data_df['text']=  data_df['text'].apply(lambda x: " ".join(contractions.fix(x) for x in x.split()))

#removing puncuation symbols
data_df['text'] = data_df['text'].str.replace('[^\w\s]','',regex=True)

#removing stop words
stop = stopwords.words('english')
data_df['text'] = data_df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
data_df.head()

In [7]:
data_df['text_old'] = data_df['text']

In [8]:
nlp = spacy.load("en_core_web_sm")

def pos_tag(comment):
    doc = nlp(comment)
    return " ".join([token.text for token in doc if token.pos_ in  ("VERB","NOUN","PRONOUN","ADJ","ADVERB")])

#Lemmitization function
def space(comment):
    doc = nlp(comment)
    return " ".join([token.lemma_ for token in doc])

#Text data

## pos tag
data_df['text'] = data_df['text'].apply(pos_tag)

## lemmitization
data_df['text']= data_df['text'].apply(space)
data_df.head()

In [9]:
train_df = data_df[data_df.dataset=="train"].copy()
test_df = data_df[data_df.dataset=="test"].copy()

x_train = train_df.text.values
y_train = train_df.category.values

x_test = test_df.text.values
y_test = test_df.category.values

In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

X_train = tokenizer.texts_to_sequences(x_train)
X_test = tokenizer.texts_to_sequences(x_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

In [11]:
lens_train = [len(i) for i in X_train]
lens_test = [len(i) for i in X_test]
lens = lens_train + lens_test

maxlen = np.max(lens)

print('Max len:', maxlen)

Max len: 79


In [12]:
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [13]:
X_train.shape

(10003, 79)

In [14]:
X_test.shape

(3080, 79)

In [15]:
encoder = LabelEncoder()
encoder.fit(data_df.category)
encoded_Y_test = encoder.transform(y_test)
encoded_Y_train = encoder.transform(y_train)

# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_test = np_utils.to_categorical(encoded_Y_test)
dummy_y_train = np_utils.to_categorical(encoded_Y_train)

In [16]:
encoded_Y_test

array([12, 12, 12, ..., 25, 25, 25])

In [17]:
## check

print(list(encoder.classes_)[12])
print(list(dummy_y_test[0]).index(1))
print(encoded_Y_test[0])
print(y_test[0])

card_arrival
12
12
card_arrival


In [18]:
l = len(list(encoder.classes_))
l

77

In [19]:
## to regenerate results, a must must
np.random.seed(3)
from tensorflow.python.framework.random_seed import set_random_seed
set_random_seed(3)

In [None]:
# I -- 34
# am -- 345
# sleep -- 456
# ing -- 4567

In [20]:
embedding_dim = 100  # vector representation of words/neuron amount after the input https://datascience.stackexchange.com/questions/53995/what-does-embedding-mean-in-machine-learning#:~:text=In%20the%20context%20of%20machine,with%20other%20models%20as%20well.

model = Sequential()  # initiates the model
model.add(layers.Embedding(input_dim=vocab_size,  # adds the first [input] layer which will be our tokenized tweets
                          output_dim=embedding_dim,  # the embedding of that tweet, essentially inputs output
                          input_length=maxlen))  # size of the input layer determined by maxlen calculated before
model.add(layers.Dropout(0.2))  # dorpouts are added to help with overtraining, essentially "turns off" said amount of neurons before giving information to the next layer
model.add(layers.GlobalMaxPool1D())  # https://computersciencewiki.org/index.php/Max-pooling_/_Pooling
model.add(layers.Dropout(0.2))
model.add(layers.Dense(50, activation='relu'))  # additional hidden layer
model.add(layers.Dropout(0.2))
model.add(layers.Dense(50, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(l, activation='softmax'))  # prediction layer, 2 is the number of classes we have
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 79, 100)           239600    
_________________________________________________________________
dropout (Dropout)            (None, 79, 100)           0         
_________________________________________________________________
global_max_pooling1d (Global (None, 100)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 50)                5050      
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                2

In [21]:
history = model.fit(X_train, dummy_y_train,
                    epochs=3,  # times model will run through the data
                    verbose=True,
                    validation_data=(X_test, dummy_y_test),
                    batch_size=128,
                   )  # data is set to batches we are sent to the model to predict, imagine each batc as a step in which model tries to predict the class and then checks the right answer and corrects it's weights with backpropogation
loss, accuracy = model.evaluate(X_train, dummy_y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, dummy_y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Epoch 1/3
Epoch 2/3
Epoch 3/3
Training Accuracy: 0.4140
Testing Accuracy:  0.3429


In [22]:
# y_pred = model.predict(X_test)
# y_pred

In [23]:
# y_predDecoded = [encoder.classes_[np.argmax(i)] for i in y_pred]  # here we get the max probability from those arrays and then based on that select which class is it.
# cm = confusion_matrix(y_test, y_predDecoded, labels=data_df.category.unique())  # same confusion matrix code as in Logistic Regression
# df_cm = pd.DataFrame(cm, index=data_df.category.unique(), columns=data_df.category.unique())
# df_cm

In [24]:
# df_cm_percentage = df_cm.copy()
# for i in df_cm_percentage:
#     df_cm_percentage[i]/=df_cm_percentage[i].sum()

# df_cm_percentage

In [25]:
without text preprocessing:
accuracy train: 41%
accuracy test: 34%

with text processing except lammitization
accuracy train: 49%
accuracy test: 43%

with text processing with lammitization --- BEST
accuracy train: 54%
accuracy test: 49%

with text processing with lammitization and POS tagging
accuracy train: 52%
accuracy test: 47%