In [55]:
import tensorflow
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.utils import to_categorical
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from keras.utils import pad_sequences


In [56]:
data = pd.read_csv('train.csv')
questions = data['problem'].tolist()
categories = data['category'].tolist()

In [57]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(questions)
sequences = tokenizer.texts_to_sequences(questions)

In [58]:
max_sequence_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

In [59]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, categories, test_size=0.2, random_state=42)



In [60]:
num_categories = len(set(categories))
y_train_encoded = to_categorical(y_train, num_classes=num_categories)

In [61]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_sequence_length))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_categories, activation='softmax'))

In [62]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train_encoded, epochs=100, batch_size=32)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1c0318d8b10>

In [64]:
test_data = pd.read_csv('test.csv')
test_questions = test_data['problem'].tolist()
test_sequences = tokenizer.texts_to_sequences(test_questions)
test_padded_sequences = pad_sequences(test_sequences, maxlen=max_sequence_length)
predictions = np.argmax(model.predict(test_padded_sequences), axis=-1)



In [66]:
num_test_questions = len(test_questions)
predictions_padded = np.pad(predictions, (0, num_test_questions - len(predictions)), mode='constant')


In [67]:
for question, category in zip(X_test, predictions):
    print(f"Question: {tokenizer.sequences_to_texts([question])[0]}\nCategory: {category}\n")


Question: determine the number of solutions for the following system of equations y 7x 4 y 1 3x 3 number of solutions no solution exactly one solution infinitely many solutions
Category: 17

Question: olivia went to a beauty store to buy p bottles of nail polish each bottle of nail polish costs 9 50 how much will she spend in total for the bottles of nail polish write an expression using p and 9 50
Category: 9

Question: solve the following system by substitution y 6 x y 3x
Category: 8

Question: write an equation to match the phrase 11 greater than n is 33
Category: 7

Question: solve the following system by elimination 6x 3y 15 4x 3y 13
Category: 2

Question: daisy bought 15 new plants for her garden she paid the full price of 4 75 for some of the plants some of the plants were on sale for 3 50 each all together daisy spent 60 write a system of equations to find the number of full priced plants daisy bought x and the number of sale priced plants daisy bought y
Category: 15

Question:

In [69]:
submission_df = pd.DataFrame({'Problem': test_questions, 'Category': predictions_padded})

In [70]:
submission_df.to_csv('submission.csv', index=False)