In [27]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import re

import warnings
warnings.filterwarnings("ignore")

### Loading data

In [28]:
test_data = pd.read_csv("C://Users/н/Desktop/rosstat/test_dataset_test.csv")
train_data = pd.read_csv("C://Users/н/Desktop/rosstat/train_dataset_train.csv")

sample_solution = pd.read_csv("C://Users/н/Desktop/rosstat/sample_solution.csv")
#categories = pd.read_excel("C://Users/н/Desktop/rosstat/categories.xlsx")

In [29]:
train_data.drop('id', axis=1, inplace=True)

### Preparing training data

In [30]:
def contents_fat(text):
    fats = re.findall(r'([2]{1}[.,]*[5-9]{1}[%пмдж]{1})|([3]{1}[.,]*[0-4]{1}[%пмдж]{1})|([3]{1}[.,]*[5]{1}[^1-9]{1}[%пмдж]{0,1})', text)
    try:
        return 1 if len(set(fats[0]).difference({''})) > 0 else 0
    except: return 0

In [31]:
def prepare_text(text):
    text = str(text).lower()
    words = re.findall(r'[a-zа-яё]*', text)
    words.append(str('3.2%' * contents_fat(text)))
    return ' '.join([word for word in words if len(word) > 1])

In [32]:
train_df = pd.DataFrame()

In [33]:
train_df['preprocessed'] = train_data['name'].apply(lambda name: prepare_text(name))

### Dividing data into train and validation datasets

In [34]:
group = train_data['groups']
train_data.drop('groups', axis=1, inplace=True)

In [35]:
X_train, X_val, y_train, y_val = train_test_split(train_df, group, train_size=0.7, stratify=group)

### Creating & fitting TF-IDF vectorizer

In [36]:
vec = TfidfVectorizer(min_df=17)

In [37]:
X_train_tf = vec.fit_transform(X_train['preprocessed'])
X_val_tf = vec.transform(X_val['preprocessed'])

### Building classifiers

#### 1. LogisticRegression

In [38]:
from sklearn.linear_model import LogisticRegression

In [39]:
clf = LogisticRegression(
    penalty='l2', 
    n_jobs=-1,
    class_weight='balanced',
    C=1.25
)

clf.fit(X_train_tf, y_train)

In [40]:
y_pred = clf.predict(X_val_tf)

In [41]:
#score = 0.922120993656991 (penalty='l2')
#score = 0.9620979569067207 (penalty='l2', n_jobs=-1, class_weight='balanced')
#score = 0.9642140616272239 (prev + C=1.25)

#### 2. CatBoostClassifier

In [127]:
from catboost import CatBoostClassifier

In [128]:
model = CatBoostClassifier(
    iterations=30,
    learning_rate=0.37,
)

In [None]:
model.fit(X_train_tf, y_train)

In [155]:
y_pred = model.predict(X_val_tf)

In [156]:
y_pred = [y_pred[i][0] for i in range(len(y_pred))]

In [157]:
# score = 0.8786602983964963 (iterations=30, learning_rate=0.35)

### Accuracy assessment

In [42]:
from sklearn.metrics import recall_score

In [43]:
score = recall_score(y_val, y_pred, average='macro')

In [44]:
score

0.9645154182428093

### Getting solution file

In [45]:
test_data = test_data.loc[:282226]

In [46]:
test_tf = vec.transform(test_data['name'].apply(lambda name: prepare_text(name)))
test_pred = pd.Series(clf.predict(test_tf), dtype='float64')

In [47]:
pd.DataFrame({'id': test_data['id'], 'groups': test_pred}).to_csv('test_solution.csv', index=False)

In [48]:
test_solution = pd.read_csv('C://Users/н/Desktop/rosstat/test_solution.csv')

In [50]:
sample_solution

Unnamed: 0,id,groups
0,2271768,0.0
1,5495560,0.0
2,4390924,0.0
3,1951592,0.0
4,1037434,0.0
...,...,...
282222,4745550,0.0
282223,339082,0.0
282224,1327360,0.0
282225,679104,0.0
