In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the train and test splits of the competition data.
train = pd.read_csv('/kaggle/input/avito-category-prediction/train.csv')
test = pd.read_csv('/kaggle/input/avito-category-prediction/test.csv')


def _with_info(frame):
    """Return ``frame`` with ``title`` and ``description`` merged into one ``info`` column.

    Missing values in *either* text column are replaced by an empty string
    before concatenation, so ``info`` is always a string. (Concatenating a
    NaN title would otherwise make the whole ``info`` value NaN.)

    Parameters
    ----------
    frame : pd.DataFrame
        Frame containing ``title`` and ``description`` columns.

    Returns
    -------
    pd.DataFrame
        New frame with ``info`` appended as the last column and the two
        source columns removed. The input frame is not modified.
    """
    title = frame["title"].fillna("")
    description = frame["description"].fillna("")
    return (
        frame
        .assign(info=title + " " + description)
        .drop(["title", "description"], axis=1)
    )


train = _with_info(train)
test = _with_info(test)

In [None]:
import re
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import SGDClassifier

In [None]:
# Cache for the resources ``my_tokenizer`` needs. It is filled lazily on the
# first call so that the regex, stop-word set and stemmer are built once
# instead of once per tokenized document.
_TOKENIZER_RESOURCES = {}


def _tokenizer_resources():
    """Return ``(trash_pattern, stop_words, stemmer)``, creating them on first use."""
    if not _TOKENIZER_RESOURCES:
        # Build everything before touching the cache so that a failure
        # (e.g. while loading the stop words) never leaves it half-filled.
        _TOKENIZER_RESOURCES.update({
            "trash": re.compile(r'[\W\d_]'),
            "stop_words": frozenset(stopwords.words('russian')),
            "stemmer": SnowballStemmer("russian"),
        })
    return (
        _TOKENIZER_RESOURCES["trash"],
        _TOKENIZER_RESOURCES["stop_words"],
        _TOKENIZER_RESOURCES["stemmer"],
    )


def my_tokenizer(string: str) -> list:
    """Tokenize, filter and stem a text.

    The text is lower-cased and split with ``word_tokenize``. A token is
    discarded when it contains any non-word character, digit or underscore,
    or when it is in NLTK's Russian stop-word list. The remaining tokens are
    stemmed with the Russian Snowball stemmer.

    Parameters
    ----------
    string : str
        Raw text to tokenize.

    Returns
    -------
    list
        Stemmed tokens in their original order.
    """
    trash, stop_words, stemmer = _tokenizer_resources()
    return [
        stemmer.stem(token)
        for token in word_tokenize(string.lower())
        if not (trash.search(token) or token in stop_words)
    ]

In [None]:
# Seed NumPy's global random state so the random subset drawn below is
# repeatable when the notebook is run from the top.
np.random.seed(12345)

# Work on a random subset of at most 500000 rows. The size is capped at the
# number of available rows so a smaller train file does not make
# ``DataFrame.sample`` raise for requesting more rows than exist.
batch_train = train.sample(min(500000, len(train)))
batch_train.head()

In [None]:
# Learn the TF-IDF vocabulary on the training batch only, using the custom
# tokenizer and ignoring terms that occur in more than 90% of the documents.
tfidf_vec = TfidfVectorizer(max_df=0.9, tokenizer=my_tokenizer)
tfidf = tfidf_vec.fit_transform(batch_train["info"])

# Fit the max-abs scaler on the training matrix, then apply it to that matrix.
scaler_tfidf = MaxAbsScaler().fit(tfidf)
tfidf = scaler_tfidf.transform(tfidf)

# The test set goes through the already-fitted vectorizer and scaler.
tfidf_test = scaler_tfidf.transform(tfidf_vec.transform(test["info"]))

In [None]:
# Train a linear SGD classifier (modified Huber loss) on the scaled TF-IDF
# features of the training batch; ``fit`` returns the fitted estimator itself.
sgd_tfidf = SGDClassifier(loss='modified_huber').fit(
    tfidf,
    batch_train["Category"],
)

# Predict a category for every row of the test matrix.
prediction = sgd_tfidf.predict(tfidf_test)

In [None]:
# Pair each test item id with its predicted category by position. The
# predictions were produced from the rows of ``test`` in order, so building
# the frame from the raw values avoids depending on how ``test`` is indexed.
answer = pd.DataFrame({
    "Id": test["itemid"].to_numpy(),
    "Category": prediction,
})

# Write the submission file without the row index.
answer.to_csv("submission.csv", index=False)