In [None]:
!unzip /content/profession.zip -d my_data

In [None]:
import os
import sqlite3
import pandas as pd

"""
Строчки для формирования pandas.DataFrame спарсенных данных
Спарсенные данные представляют собой 21 файл формата .dp
Каждый файл назван в соответствии с профессией
"""

# Directory with the unpacked per-profession SQLite files.
PROFESSIONS_DIR = '/content/my_data/professions'

tables = []

# os.listdir returns entries in arbitrary order; sorting makes the row order,
# and therefore the seeded shuffle below, reproducible.
professions = sorted(os.listdir(PROFESSIONS_DIR))
for profession in professions:
    path = os.path.join(PROFESSIONS_DIR, profession)
    if not os.path.isfile(path):
        # Only regular files can be databases; skip stray sub-directories.
        continue
    dat = sqlite3.connect(path)
    try:
        results = pd.read_sql_query("SELECT * FROM mytable", dat)
    finally:
        # Close the connection even if the query fails.
        dat.close()
    # The file name without its extension is the class label.
    results['profession'] = os.path.splitext(profession)[0]
    tables.append(results)

df = pd.concat(tables, ignore_index=True)
# Shuffle all rows with a fixed seed. The full frame is kept in `df`;
# only the preview on the last line is limited to five rows.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

In [None]:
!pip install pymorphy2
!pip install navec
!wget -c https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar

In [None]:
from catboost import CatBoostClassifier, Pool
import numpy as np
import pymorphy2
from navec import Navec
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases
# load the 'punkt' resource, newer ones load 'punkt_tab', so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')

class PreProcessing():
    """Turn raw profile rows into CatBoost-ready features and run inference.

    ``transform`` keeps the three education columns (``chair_name``, ``name``,
    ``faculty_name``) as plain strings, so they can be handed to CatBoost as
    categorical features, and replaces the ``groups`` text with the mean Navec
    embedding of its lemmatised tokens.

    NOTE(review): the text that gets embedded is the concatenation of all four
    columns, not only ``groups``. This is kept exactly as in the original
    pipeline; confirm that it is intended.
    """

    # Columns returned unchanged (apart from the missing-value placeholder).
    CATEGORICAL_COLUMNS = ['chair_name', 'name', 'faculty_name']
    # Free-text column that is replaced by embedding features.
    TEXT_COLUMN = 'groups'
    # Placeholder written in place of empty or missing cells.
    MISSING_VALUE = 'No information available'

    def __init__(self, navec_path='navec_hudlit_v1_12B_500K_300d_100q.tar'):
        """Load the morphological analyser, tokenizer, embeddings and an empty model.

        Args:
            navec_path: Path to the Navec archive. Defaults to the file
                downloaded earlier in the notebook.
        """
        self.morph = pymorphy2.MorphAnalyzer()
        self.tokenize = word_tokenize
        self.navec = Navec.load(navec_path)
        self.model = CatBoostClassifier()

    def transform(self, df, mode='train'):
        """Build the feature frame (and labels) from raw rows.

        Args:
            df: Frame containing ``chair_name``, ``name``, ``faculty_name`` and
                ``groups`` string columns, plus ``profession`` when
                ``mode == 'train'``.
            mode: ``'train'`` returns the ``profession`` column as labels;
                any other value returns ``None`` for the labels.

        Returns:
            ``(features, labels)``. ``features`` has the same index as ``df``,
            the three categorical columns, and one ``word2vec_feature_<i>``
            column per embedding dimension.
        """
        labels = df['profession'] if mode == 'train' else None

        columns = self.CATEGORICAL_COLUMNS + [self.TEXT_COLUMN]
        # Missing cells are treated like empty strings, so both end up as the
        # same placeholder instead of breaking the string join below.
        features = df[columns].fillna('').replace('', self.MISSING_VALUE)

        # The same words repeat across rows, so each one is parsed only once.
        lemma_cache = {}

        def lemmatize(word):
            if word not in lemma_cache:
                lemma_cache[word] = self.morph.parse(word)[0].normal_form
            return lemma_cache[word]

        unknown = np.asarray(self.navec['<unk>'])
        vectors = []
        for row in features.itertuples(index=False, name=None):
            text = ' '.join(row)
            lemmas = ' '.join(lemmatize(word) for word in text.split())
            tokens = ' '.join(self.tokenize(lemmas)).split()
            if tokens:
                embedded = [
                    self.navec[token] if token in self.navec else unknown
                    for token in tokens
                ]
                vectors.append(np.mean(embedded, axis=0))
            else:
                # A text with no tokens has no mean; treat it as one unknown word.
                vectors.append(unknown)

        if vectors:
            matrix = np.vstack(vectors)
        else:
            matrix = np.empty((0, unknown.shape[0]), dtype=unknown.dtype)

        # Build all embedding columns at once rather than inserting them one by
        # one; the width follows the embedding size instead of a fixed 300.
        embedding = pd.DataFrame(
            matrix,
            index=features.index,
            columns=[f'word2vec_feature_{i}' for i in range(matrix.shape[1])],
        )
        result = pd.concat([features[self.CATEGORICAL_COLUMNS], embedding], axis=1)
        return result, labels

    def predict(self, df, model_path='model'):
        """Load a saved CatBoost model and return class probabilities for ``df``.

        Args:
            df: Feature frame as produced by ``transform``.
            model_path: File written by ``save_model``. Defaults to ``'model'``,
                the name this notebook saves the trained classifier under.

        Returns:
            The result of ``predict_proba`` on the loaded model.
        """
        self.model.load_model(model_path)
        return self.model.predict_proba(df)

In [None]:
preproc = PreProcessing()
# mode='train' makes transform() return the `profession` column as labels;
# the train/validation split further down needs them under the name `labels`.
data, labels = preproc.transform(df, mode='train')

In [None]:
# Install CatBoost with pip.
# NOTE(review): catboost is already imported by an earlier cell, so on a fresh
# kernel the package has to be installed before that import runs; confirm the
# cell order.
!pip install catboost

In [None]:
from sklearn.model_selection import train_test_split

# A fixed seed gives the same split, and therefore comparable metrics, on every
# run; stratifying keeps the class proportions equal in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    data, labels, random_state=42, stratify=labels
)

In [None]:
from catboost import CatBoostClassifier, Pool

# String-valued columns that CatBoost should treat as categorical features.
categorical = ['chair_name', 'name', 'faculty_name']

train_pool = Pool(data=X_train, label=y_train, cat_features=categorical)
test_pool = Pool(data=X_val, label=y_val, cat_features=categorical)

# Multi-class classifier; accuracy on the validation pool is reported
# every 50 iterations.
cb = CatBoostClassifier(eval_metric='Accuracy', loss_function='MultiClass')
cb.fit(train_pool, eval_set=test_pool, verbose=50)

Learning rate set to 0.114166
0:	learn: 0.1280488	test: 0.1130611	best: 0.1130611 (0)	total: 3.65s	remaining: 1h 48s
50:	learn: 0.3660384	test: 0.2789581	best: 0.2789581 (50)	total: 3m 41s	remaining: 1h 8m 39s
100:	learn: 0.4612589	test: 0.3271753	best: 0.3271753 (100)	total: 7m 21s	remaining: 1h 5m 33s
150:	learn: 0.5261764	test: 0.3559948	best: 0.3559948 (150)	total: 10m 53s	remaining: 1h 1m 13s
200:	learn: 0.5868441	test: 0.3657861	best: 0.3659708 (196)	total: 14m 26s	remaining: 57m 24s
250:	learn: 0.6283567	test: 0.3748384	best: 0.3752078 (249)	total: 18m 2s	remaining: 53m 50s
300:	learn: 0.6642646	test: 0.3816737	best: 0.3820432 (295)	total: 21m 43s	remaining: 50m 27s
350:	learn: 0.6964154	test: 0.3857380	best: 0.3875854 (347)	total: 25m 25s	remaining: 46m 59s
400:	learn: 0.7222222	test: 0.3886939	best: 0.3910955 (390)	total: 29m 4s	remaining: 43m 25s
450:	learn: 0.7479059	test: 0.3920192	best: 0.3940514 (439)	total: 32m 41s	remaining: 39m 47s
500:	learn: 0.7706332	test: 0.3971919

<catboost.core.CatBoostClassifier at 0x7b9a40fa5330>

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Predicted class labels (not probabilities) for the validation rows.
y_pred = cb.predict(X_val, prediction_type='Class')

In [None]:
# Report the validation F1 score under each averaging scheme.
for caption, averaging in (
    ("Macro F1-Score: ", "macro"),
    ("Micro F1-Score: ", "micro"),
    ("Weighted F1-Score: ", "weighted"),
):
    score = f1_score(y_val, y_pred, average=averaging)
    print(caption, score)

Macro F1-Score:  0.4081806736992642
Micro F1-Score:  0.4164049510437835
Weighted F1-Score:  0.41278643006139554


In [None]:
# Persist the trained classifier to the file 'model' in CatBoost's binary format.
cb.save_model(fname='model', format='cbm')

In [None]:
# `cb` is the classifier trained above; no variable named `model` is defined
# anywhere in this notebook.
cb.classes_

array(['agronom', 'analitik', 'architect', 'biology', 'bloger',
       'buhgalter', 'doctor', 'fotograph', 'hudozhnik', 'parikmaher',
       'pchelovod', 'perevodchik', 'povar', 'prodavec', 'programist',
       'slesar', 'smm', 'stroitel', 'sysadmin', 'teacher', 'yurist'],
      dtype=object)

In [None]:
# Class probabilities for the second row of the feature frame, in the order of
# cb.classes_. Uses the trained classifier `cb`; `model` is not defined in this
# notebook.
cb.predict_proba(data)[1]

array([0.02227923, 0.00397339, 0.0561539 , 0.00952572, 0.01547192,
       0.04253163, 0.00759451, 0.03554479, 0.02731024, 0.01861499,
       0.00667605, 0.01333834, 0.57056178, 0.03376192, 0.0090315 ,
       0.00550969, 0.02991629, 0.02112361, 0.01324047, 0.00546591,
       0.05237415])