In [11]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Load the train and test data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Define the mapping of labels to integers
label_to_int = {
    0: "Sci/Tech",
    1: "Sports",
    2: "Business",
    3: "World",
    4: "Politics",
    5: "ESG",
    6: "Health",
    7: "Entertainment"
}

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df["text"])

# Convert the text data to sequences
X = tokenizer.texts_to_sequences(train_df["text"])
X = pad_sequences(X, maxlen=500)

# Convert the labels to one-hot encoded vectors
Y = to_categorical(train_df["label"], num_classes=len(label_to_int))

# Split the data into training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=42)

# Define the model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=500))
model.add(Conv1D(filters=64, kernel_size=5, activation="relu"))
model.add(MaxPooling1D(pool_size=4))
model.add(Flatten())
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(len(label_to_int), activation="softmax"))

# Compile the model
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=10, batch_size=64)

# Tokenize the test data
X_test = tokenizer.texts_to_sequences(test_df["text"])
X_test = pad_sequences(X_test, maxlen=500)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Convert the predicted labels to integer format
y_pred_int = np.argmax(y_pred, axis=1)

# Convert the integer labels to their corresponding text labels
pred_labels = [label_to_int[i] for i in y_pred_int]

# Write the predicted labels to the submission file
submission_df = pd.DataFrame({"id": test_df["id"], "label": pred_labels})
submission_df.to_csv("submission.csv", index=False)

# Evaluate the model on the validation set using macro f1 score
y_val_pred = model.predict(X_val)
y_val_pred_int = np.argmax(y_val_pred, axis=1)
y_val_true_int = np.argmax(Y_val, axis=1)
val_f1_score = f1_score(y_val_true_int, y_val_pred_int, average="macro")
print("Validation F1 Score: ", val_f1_score)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Validation F1 Score:  0.7935550842742856
