In [1]:
import re
import string
from os import listdir

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from numpy import array
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import Dense,LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import MaxPooling1D,SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
#from tensorflow.keras.utils.vis_utils import plot_model
from tensorflow.keras.utils import plot_model

In [2]:
# Read the training CSV (path is relative to the working directory) and preview the first rows.
train = pd.read_csv('hm_train.csv')
train.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category
0,27673,24h,I went on a successful date with someone I fel...,1,affection
1,27674,24h,I was happy when my son got 90% marks in his e...,1,affection
2,27675,24h,I went to the gym this morning and did yoga.,1,exercise
3,27676,24h,We had a serious talk with some friends of our...,2,bonding
4,27677,24h,I went with grandchildren to butterfly display...,1,affection


In [3]:
# load doc into memory
def load_doc(filename, encoding=None):
    """Read a whole text file and return its contents as a single string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding forwarded to ``open``. ``None`` (the default)
            keeps the platform default, matching the previous behaviour.

    Returns:
        The complete file contents as a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # the context manager closes the handle even when read() raises
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

In [4]:
# turn a doc into clean tokens
def clean_doc(doc):
    """Lower-case each document, strip non-letters and drop English stop words.

    Args:
        doc: pandas Series of text strings.

    Returns:
        pandas Series in which every entry is the remaining words of the
        corresponding input, joined by single spaces.
    """
    # convert to lower case and replace every character other than a-z with a space.
    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, which would strip nothing.
    doc = doc.str.lower().str.replace("[^a-z]", " ", regex=True)

    # filter out stop words; a set makes each membership test O(1)
    stop_w = set(stopwords.words('english'))
    tokens = doc.apply(lambda x: " ".join(w for w in x.split() if w not in stop_w))
    return tokens

In [5]:
# Store the lower-cased, letters-only, stop-word-filtered version of 'cleaned_hm' in a new 'clean' column.
train['clean'] = clean_doc(train['cleaned_hm'])

In [6]:
# Display the cleaned text column.
train['clean']

0        went successful date someone felt sympathy con...
1                          happy son got marks examination
2                                    went gym morning yoga
3        serious talk friends flaky lately understood g...
4        went grandchildren butterfly display crohn con...
                               ...                        
60316    got together best friend baked cupcakes cookie...
60317                              went restaurant friends
60318               day mechanical turk made fifty dollars
60319    finished semester today aced majority tests aw...
60320    event made happy past months went meet man sel...
Name: clean, Length: 60321, dtype: object

In [7]:
# NOTE(review): displays the same column as the previous cell — looks like a leftover duplicate.
train['clean']

0        went successful date someone felt sympathy con...
1                          happy son got marks examination
2                                    went gym morning yoga
3        serious talk friends flaky lately understood g...
4        went grandchildren butterfly display crohn con...
                               ...                        
60316    got together best friend baked cupcakes cookie...
60317                              went restaurant friends
60318               day mechanical turk made fifty dollars
60319    finished semester today aced majority tests aw...
60320    event made happy past months went meet man sel...
Name: clean, Length: 60321, dtype: object

In [8]:
# Display the source text column that 'clean' was derived from, for comparison.
train['cleaned_hm']

0        I went on a successful date with someone I fel...
1        I was happy when my son got 90% marks in his e...
2             I went to the gym this morning and did yoga.
3        We had a serious talk with some friends of our...
4        I went with grandchildren to butterfly display...
                               ...                        
60316    I got together with my best friend and baked c...
60317                  I went to a restaurant with friends
60318    The other day on Mechanical Turk I made over f...
60319    Finished the semester today and aced majority ...
60320    An event that made me happy in the past 3 mont...
Name: cleaned_hm, Length: 60321, dtype: object

In [9]:
# fit a tokenizer
def create_tokenizer(lines, num_words=20000):
    """Create a Keras ``Tokenizer`` and fit it on the given texts.

    Args:
        lines: Iterable of text strings used to build the word index.
        num_words: Value forwarded as the tokenizer's ``num_words`` argument.
            Defaults to 20000, the value that used to be hard-coded here.

    Returns:
        The fitted ``Tokenizer``.
    """
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(lines)
    return tokenizer


In [10]:
# integer encode and pad documents
def encode_docs(tokenizer, max_length, docs):
    """Convert texts to integer sequences and pad them at the end to ``max_length``.

    Args:
        tokenizer: Object exposing ``texts_to_sequences`` (a fitted tokenizer).
        max_length: Target length handed to ``pad_sequences`` as ``maxlen``.
        docs: Iterable of text strings to encode.

    Returns:
        Whatever ``pad_sequences`` returns for the encoded documents.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(docs),  # words -> integer ids
        maxlen=max_length,
        padding='post',  # fill at the end of each document
    )


In [11]:
from sklearn.preprocessing import LabelEncoder
# defining label encoder
def label_encode(data):
    """Encode the labels in ``data`` as integers with a freshly fitted ``LabelEncoder``.

    Args:
        data: Sequence of class labels.

    Returns:
        The transformed labels as returned by ``LabelEncoder.transform``.
        The encoder itself is not returned.
    """
    fitted_encoder = LabelEncoder().fit(data)
    return fitted_encoder.transform(data)

In [12]:
# define the model
def define_model(vocab_size, max_length, num_classes=7, embedding_dim=150):
    """Build and compile the Embedding -> LSTM -> Dense text classifier.

    Args:
        vocab_size: First argument of the ``Embedding`` layer (size of the
            token vocabulary).
        max_length: Length of the padded input sequences, passed as
            ``input_length``.
        num_classes: Width of the final softmax layer. Defaults to 7, the
            value that used to be hard-coded.
        embedding_dim: Size of each embedding vector. Defaults to 150, the
            value that used to be hard-coded.

    Returns:
        The compiled ``Sequential`` model. ``model.summary()`` is called as a
        side effect.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    # compile network; the sparse loss takes integer class ids rather than one-hot targets
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize defined model
    model.summary()
    return model

In [13]:
# create the tokenizer
tokenizer = create_tokenizer(train['clean'])
# define vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)

# calculate the maximum sequence length
# NOTE(review): every row is padded to the length of the single longest document,
# which makes the LSTM slow to train — consider capping max_length.
max_length = max(len(s.split()) for s in train['clean'])
print('Maximum length: %d' % max_length)

# encode data
X = encode_docs(tokenizer, max_length, train['clean'])
y = label_encode(train['predicted_category'])
y = y.reshape(-1,1)  # column vector of integer class ids
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# define model
print("shape of the tensor {0}".format(X.shape))
model = define_model(vocab_size, max_length)

# fit network; the held-out 20% split is used as validation data
history = model.fit(X_train, Y_train, epochs=10, batch_size=500, validation_data=(X_test, Y_test))

# plotting loss and accuracy per epoch (both figures use the same legend labels)
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validate')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()

plt.title('Accuracy')
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validate')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()


Vocabulary size: 19857
Maximum length: 654
shape of the tensor (60321, 654)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 654, 150)          2978550   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 654, 150)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               100400    
_________________________________________________________________
dense (Dense)                (None, 50)                5050      
_________________________________________________________________
dense_1 (Dense)              (None, 20)                1020      
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 147       
Total params: 3,085,167
Trainable params: 3,08

KeyboardInterrupt: 

In [None]:
# save the model
#model.save('model_CNN_Text_classification.h5')

In [None]:
# Read the test CSV (path is relative to the working directory) and preview the first rows.
test = pd.read_csv("hm_test.csv")
test.head()

In [None]:
# Display the text column of the test frame loaded above.
# ('crtest' was never defined anywhere in the notebook and raised NameError.)
test['cleaned_hm']

In [None]:
# predict class probabilities for the held-out split and pick the most likely class per row
y_probas = model.predict(X_test)
y_pred = np.argmax(y_probas, axis=1)
# Y_test holds integer class ids in a single column (from y.reshape(-1,1)), not one-hot rows.
# argmax over axis 1 of a one-column array is always 0, so flatten it instead.
y_test = Y_test.ravel()