In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/math-problem-categorization/sample_submission.csv
/kaggle/input/math-problem-categorization/train.csv
/kaggle/input/math-problem-categorization/test.csv


In [2]:
dft = pd.read_csv('/kaggle/input/math-problem-categorization/train.csv')
dfte = pd.read_csv('/kaggle/input/math-problem-categorization/test.csv')

In [3]:
dft['category'].value_counts()

7     19
16    11
0     10
22    10
4     10
11    10
3     10
2     10
6     10
12    10
5     10
13    10
9     10
15    10
1     10
23    10
8     10
24    10
14    10
17     9
19     9
18     8
10     8
20     8
21     8
Name: category, dtype: int64

In [4]:
index_to_delete = dft[dft['category'] == 7].head(9).index
dft = dft.drop(index_to_delete)

In [5]:
dft.category.value_counts()

16    11
0     10
22    10
9     10
4     10
11    10
3     10
2     10
6     10
12    10
5     10
13    10
7     10
15    10
1     10
23    10
8     10
24    10
14    10
17     9
19     9
18     8
10     8
21     8
20     8
Name: category, dtype: int64

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in string.punctuation]
    filtered_tokens = [word for word in filtered_tokens if word.strip() != '']
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return stemmed_tokens

dft['stemmed_tokens'] = dft['problem'].apply(preprocess_text)
dfte['stemmed_tokens'] = dfte['problem'].apply(preprocess_text)



[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(split=' ')
tokenizer.fit_on_texts(dft['stemmed_tokens'])
X_sequence = tokenizer.texts_to_sequences(dft['stemmed_tokens'])
y_sequence = tokenizer.texts_to_sequences(dfte['stemmed_tokens'])
y = dft['category']

X_train, X_test, y_train, y_test = train_test_split(X_sequence, y, test_size=0.2, random_state=0)

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

maxlen_values = list(range(50, 151, 10))
best_accuracy = 0
best_maxlen = None

for maxlen in maxlen_values:
   
    X_train_padded = pad_sequences(X_train, maxlen=maxlen)
    X_test_padded = pad_sequences(X_test, maxlen=maxlen)
    
    model = RandomForestClassifier(random_state=0)
    model.fit(X_train_padded, y_train)
    
    y_pred = model.predict(X_test_padded)
    accuracy = accuracy_score(y_pred, y_test)
   
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_maxlen = maxlen

print("Best maxlen: ", best_maxlen)
print("Best accuracy: ", best_accuracy)

Best maxlen:  130
Best accuracy:  0.8163265306122449


In [9]:
X_padded = pad_sequences(X_sequence, maxlen=best_maxlen)
test_paded = pad_sequences(y_sequence, maxlen=best_maxlen)
model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(X_padded, y)

In [10]:
y_pred = model.predict(test_paded)
submission = pd.read_csv('/kaggle/input/math-problem-categorization/sample_submission.csv')
submission.category = y_pred
submission.to_csv('submission.csv', index=False)

In [11]:
len(submission.category.unique())

24

In [12]:
submission.category.value_counts()

3     21
14    11
23     9
5      8
2      7
16     7
17     6
12     6
10     6
1      6
11     5
8      5
21     5
24     4
22     4
7      3
19     3
18     2
9      2
20     1
6      1
15     1
4      1
0      1
Name: category, dtype: int64