In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# load the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# preprocess the data
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # remove stop words
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    # remove punctuation and other non-essential characters
    text = ''.join([c for c in text if c.isalpha() or c.isspace()])
    return text

train_df['text'] = train_df['text'].apply(preprocess_text)
test_df['text'] = test_df['text'].apply(preprocess_text)

# tokenize the text
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_df['text'])
X_train = tokenizer.texts_to_sequences(train_df['text'])
X_test = tokenizer.texts_to_sequences(test_df['text'])

# pad the sequences
max_length = 100
X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_length, padding='post')

# convert labels to categorical
y_train = to_categorical(train_df['label'])

# split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=100, input_length=max_length))
model.add(LSTM(units=128))
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=8, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model
epochs = 64
batch_size = 15
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val))

# evaluate the model
y_pred = model.predict(X_val)
y_pred = np.argmax(y_pred, axis=1)
y_val = np.argmax(y_val, axis=1)
f1 = f1_score(y_val, y_pred, average='weighted')
print('Validation F1 score:', f1)

# make predictions on test data
y_test_pred = model.predict(X_test)
y_test_pred = np.argmax(y_test_pred, axis=1)

# generate submission file
submission_df = pd.DataFrame({'id': test_df['id'], 'label': y_test_pred})
submission_df.to_csv('submission.csv', index=False)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kkksk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/64

KeyboardInterrupt: 

In [9]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# load the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# preprocess the data
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # remove stop words
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    # remove punctuation and other non-essential characters
    text = ''.join([c for c in text if c.isalpha() or c.isspace()])
    return text

train_df['text'] = train_df['text'].apply(preprocess_text)
test_df['text'] = test_df['text'].apply(preprocess_text)

# tokenize the text
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_df['text'])
X_train = tokenizer.texts_to_sequences(train_df['text'])
X_test = tokenizer.texts_to_sequences(test_df['text'])

# pad the sequences
max_length = 100
X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_length, padding='post')

# convert labels to categorical
y_train = to_categorical(train_df['label'])

# split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=100, input_length=max_length))
model.add(LSTM(units=128))
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(units=8, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model
epochs = 64
batch_size = 50
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stopping, model_checkpoint])

# load the best model weights
best_model = load_model('best_model.h5')

# evaluate the model
y_pred = best_model.predict(X_val)
y_pred = np.argmax(y_pred, axis=1)
y_val = np.argmax(y_val, axis=1)
f1 = f1_score(y_val, y_pred, average='weighted')
print('Validation F1 score:', f1)

# make predictions on test data
y_test_pred = best_model.predict(X_test)
y_test_pred = np.argmax(y_test_pred, axis=1)

# generate submission file
submission_df = pd.DataFrame({'id': test_df['id'], 'label': y_test_pred})
submission_df.to_csv('submission.csv', index=False)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kkksk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/64
Epoch 1: val_loss improved from inf to 0.94835, saving model to best_model.h5
Epoch 2/64
Epoch 2: val_loss improved from 0.94835 to 0.91247, saving model to best_model.h5
Epoch 3/64
Epoch 3: val_loss improved from 0.91247 to 0.74248, saving model to best_model.h5
Epoch 4/64
Epoch 4: val_loss improved from 0.74248 to 0.55965, saving model to best_model.h5
Epoch 5/64
Epoch 5: val_loss improved from 0.55965 to 0.53111, saving model to best_model.h5
Epoch 6/64
Epoch 6: val_loss did not improve from 0.53111
Epoch 7/64
Epoch 7: val_loss did not improve from 0.53111
Epoch 8/64
Epoch 8: val_loss did not improve from 0.53111
Epoch 8: early stopping
Validation F1 score: 0.8411818855677055


In [6]:
from tensorflow.keras.models import load_model

# generate submission file
submission_df = pd.DataFrame({'id': test_df['id'], 'label': y_test_pred})
submission_df.to_csv('submission.csv', index=False)

# split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# load the best model weights
best_model = load_model('best_model.h5')

# evaluate the model
y_pred = best_model.predict(X_val)
y_pred = np.argmax(y_pred, axis=1)
y_val = np.argmax(y_val, axis=1)
f1 = f1_score(y_val, y_pred, average='weighted')
print('Validation F1 score:', f1)

# make predictions on test data
y_test_pred = best_model.predict(X_test)
y_test_pred = np.argmax(y_test_pred, axis=1)

# generate submission file
submission_df = pd.DataFrame({'id': test_df['id'], 'label': y_test_pred})
submission_df.to_csv('submission.csv', index=False)



Validation F1 score: 0.9343340052063166
