In [1]:
pip install pandas numpy scikit-learn matplotlib tensorflow streamlit

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder




In [25]:
# Load the training CSV. header=0 tells pandas the first row is a header and
# `names` replaces it with the column labels used throughout this notebook.
df = pd.read_csv('train.csv', header=0, names=['class', 'title', 'description'])

# Build the model input by joining title and description with a space.
# Missing fields are replaced with '' first: str + NaN evaluates to NaN, and a
# NaN entry in `text` is a float rather than text, so it cannot be tokenized.
df['text'] = df['title'].fillna('') + ' ' + df['description'].fillna('')

In [26]:
# Sanity check: list the distinct raw label values present in the data.
distinct_classes = df['class'].unique()
print(distinct_classes)

[3 4 2 1]


In [27]:
# Cast the labels to integers, then subtract one so that the smallest raw
# label (1 in this dataset) maps to index 0.
class_labels = df['class'].astype(int)
df['class'] = class_labels
df['class_idx'] = class_labels.sub(1)

# Show the distinct shifted indices.
print(df['class_idx'].unique())

[2 3 1 0]


In [29]:
# LabelEncoder is already imported in the imports cell; the re-import is
# redundant but harmless and lets this cell run on its own.
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the raw class labels, then overwrite `class_idx` with the
# encoded values. `encoder` is kept so the label mapping stays available.
encoder = LabelEncoder().fit(df['class'])
df['class_idx'] = encoder.transform(df['class'])

In [30]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
texts = df['text']
targets = df['class_idx']
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    targets,
    test_size=0.2,
    random_state=42,
)

In [32]:
# max_words: vocabulary cap handed to the Tokenizer as `num_words`.
# max_len:   sequence length used when padding the encoded texts.
max_words, max_len = 10000, 100

# Build the vocabulary from the training texts only, so the held-out test
# texts do not influence it.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

In [33]:
import pickle

# Persist the fitted tokenizer to disk so the same vocabulary can be reused
# outside this notebook. Only unpickle this file from a trusted location:
# pickle.load can execute arbitrary code.
with open("tokenizer.pkl", "wb") as pkl_file:
    pkl_file.write(pickle.dumps(tokenizer))

In [34]:
# Convert each split's texts to integer sequences with the fitted tokenizer.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

# Pad (or truncate) every sequence to exactly max_len tokens.
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_len) for seqs in (X_train_seq, X_test_seq)
)

In [35]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [36]:
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and dropout are as repeatable as the backend allows,
# in line with the fixed random_state already used for the train/test split.
tf.keras.utils.set_random_seed(42)

# One output unit per class known to the label encoder. Counting the distinct
# labels in y_train alone would under-size the softmax layer if a class
# happened to be absent from the training split.
num_classes = len(encoder.classes_)

model = Sequential([
    # Map each token id (< max_words) to a 128-dimensional vector.
    Embedding(max_words, 128, input_length=max_len),
    # return_sequences=False: only the final LSTM output is passed on.
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    # Class probabilities over the num_classes labels.
    Dense(num_classes, activation='softmax'),
])

In [37]:
# The sparse categorical loss is used because the targets are integer class
# indices (class_idx), not one-hot vectors.
model.compile(
    optimizer='adam',
    metrics=['accuracy'],
    loss='sparse_categorical_crossentropy',
)

In [38]:
# Train for 5 epochs, setting aside 10% of the training data for validation;
# `history` keeps the fit results.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=128,
    epochs=5,
    validation_split=0.1,
)

# Write the trained model to disk in HDF5 format.
model.save("agnews_lstm_model.h5")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  saving_api.save_model(


In [40]:
# Score the model on the held-out test split; evaluate returns the loss
# followed by the metrics configured at compile time (accuracy).
test_scores = model.evaluate(X_test_pad, y_test)
loss, accuracy = test_scores
print(f'Test Accuracy: {accuracy:.4f}')

Test Accuracy: 0.9030
