In [22]:
import pandas as pd
import csv
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

In [2]:
# Step 1: Read the CSV file as a single column of text.
df = pd.read_csv('train_spooky_author.csv', usecols=[0], header=None, names=['combined'])

In [3]:
# Define a function to extract id, text, and author using regular expression.
def extract_data(row):
    # This regular expression looks for a pattern with id followed by numbers,
    # then any text within quotes, and ends with an acronym within quotes
    match = re.match(r'(id\d+),"(.*?)","(\w+)"', row)
    if match:
        return match.groups()
    else:
        # Return None for rows that do not match the pattern
        return [None, None, None]


In [4]:
# Apply the function to each row
extracted_data = df['combined'].apply(extract_data)

In [5]:
# Create a new DataFrame from the extracted data
df_extracted = pd.DataFrame(extracted_data.tolist(), columns=['id', 'text', 'author'])

In [6]:
df_extracted.head()

Unnamed: 0,id,text,author
0,,,
1,,,
2,id17569,It never once occurred to me that the fumbling...,HPL
3,id11008,"In his left hand was a gold snuff box, from wh...",EAP
4,id27763,How lovely is spring As we looked from Windsor...,MWS


In [7]:

# Drop rows with None values which did not match the pattern
df_extracted.dropna(inplace=True)

In [8]:
df_extracted.head()

Unnamed: 0,id,text,author
2,id17569,It never once occurred to me that the fumbling...,HPL
3,id11008,"In his left hand was a gold snuff box, from wh...",EAP
4,id27763,How lovely is spring As we looked from Windsor...,MWS
6,id22965,"A youth passed in solitude, my best years spen...",MWS
8,id13515,The surcingle hung in ribands from my body.,EAP


In [9]:
# Reset the index of the DataFrame
df_extracted.reset_index(drop=True, inplace=True)

In [10]:
# Display the DataFrame to confirm the format
df_extracted.head()

Unnamed: 0,id,text,author
0,id17569,It never once occurred to me that the fumbling...,HPL
1,id11008,"In his left hand was a gold snuff box, from wh...",EAP
2,id27763,How lovely is spring As we looked from Windsor...,MWS
3,id22965,"A youth passed in solitude, my best years spen...",MWS
4,id13515,The surcingle hung in ribands from my body.,EAP


In [11]:
print(df_extracted.shape)

(15442, 3)


In [12]:
# One hot-encoding
category = pd.get_dummies(df_extracted.author)
df_baru = pd.concat([df_extracted, category], axis=1)
df_baru = df_baru.drop(columns=['id','author'])

# Convert bolean values to integers 0 or 1
df_baru[['EAP', 'HPL', 'MWS']] = df_baru[['EAP', 'HPL', 'MWS']].astype(int)

In [13]:
df_baru.head()

Unnamed: 0,text,EAP,HPL,MWS
0,It never once occurred to me that the fumbling...,0,1,0
1,"In his left hand was a gold snuff box, from wh...",1,0,0
2,How lovely is spring As we looked from Windsor...,0,0,1
3,"A youth passed in solitude, my best years spen...",0,0,1
4,The surcingle hung in ribands from my body.,1,0,0


In [14]:
# convert to Numpy Array
text = df_baru['text'].values
label = df_baru[['EAP', 'HPL', 'MWS']].values

In [15]:
text

array(['It never once occurred to me that the fumbling might be a mere mistake.',
       'In his left hand was a gold snuff box, from which, as he capered down the hill, cutting all manner of fantastic steps, he took snuff incessantly with an air of the greatest possible self satisfaction.',
       'How lovely is spring As we looked from Windsor Terrace on the sixteen fertile counties spread beneath, speckled by happy cottages and wealthier towns, all looked as in former years, heart cheering and fair.',
       ...,
       'Mais il faut agir that is to say, a Frenchman never faints outright.',
       'For an item of news like this, it strikes us it was very coolly received.""',
       'He laid a gnarled claw on my shoulder, and it seemed to me that its shaking was not altogether that of mirth.'],
      dtype=object)

In [16]:
label

array([[0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       ...,
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 0]])

In [17]:
# Divide training data set
text_train , text_test, label_train, label_test = train_test_split(text, label, test_size=0.2)

In [18]:
# Changing every words into number using Tokenizer
tokenizer = Tokenizer(num_words=5000, oov_token='x')
tokenizer.fit_on_texts(text_train)
tokenizer.fit_on_texts(text_test)

Sequence_train = tokenizer.texts_to_sequences(text_train)
Sequence_test = tokenizer.texts_to_sequences(text_test)

padded_train = pad_sequences(Sequence_train)
padded_test = pad_sequences(Sequence_test)



In [19]:
print("padded_train", padded_train)

padded_train [[   0    0    0 ...  398   11  695]
 [   0    0    0 ... 3788    3 4002]
 [   0    0    0 ...    3 3788  723]
 ...
 [   0    0    0 ...  230   19  135]
 [   0    0    0 ...    5   38  452]
 [   0    0    0 ...   37 4094  183]]


In [36]:
# Embedding and LSTM
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=16),
    tf.keras.layers.Dropout(0.5), # Added dropout
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5), # Added dropout
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5), # Added dropout
    tf.keras.layers.Dense(3, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5) # Early stopping



In [37]:
# Training dataset
num_epochs = 30
history = model.fit(padded_train, label_train, epochs=num_epochs,
                    validation_data=(padded_test, label_test), 
                    callbacks=[callback], # Early stopping callback
                    verbose=2)

Epoch 1/30
387/387 - 46s - loss: 0.9502 - accuracy: 0.5426 - val_loss: 0.7434 - val_accuracy: 0.6957 - 46s/epoch - 119ms/step
Epoch 2/30
387/387 - 45s - loss: 0.6344 - accuracy: 0.7479 - val_loss: 0.6186 - val_accuracy: 0.7504 - 45s/epoch - 116ms/step
Epoch 3/30
387/387 - 46s - loss: 0.4941 - accuracy: 0.8102 - val_loss: 0.5076 - val_accuracy: 0.7905 - 46s/epoch - 119ms/step
Epoch 4/30
387/387 - 47s - loss: 0.4320 - accuracy: 0.8353 - val_loss: 0.5172 - val_accuracy: 0.7857 - 47s/epoch - 122ms/step
Epoch 5/30
387/387 - 45s - loss: 0.3788 - accuracy: 0.8588 - val_loss: 0.5399 - val_accuracy: 0.7863 - 45s/epoch - 116ms/step
Epoch 6/30
387/387 - 45s - loss: 0.3732 - accuracy: 0.8625 - val_loss: 0.5617 - val_accuracy: 0.7951 - 45s/epoch - 116ms/step
Epoch 7/30
387/387 - 45s - loss: 0.3253 - accuracy: 0.8796 - val_loss: 0.5446 - val_accuracy: 0.7918 - 45s/epoch - 116ms/step
Epoch 8/30
387/387 - 45s - loss: 0.3036 - accuracy: 0.8855 - val_loss: 0.5388 - val_accuracy: 0.7912 - 45s/epoch - 117