<a href="https://colab.research.google.com/github/Reustlin/technical_task/blob/main/task_categorii.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Виды продукции (брать только виды продукции, для которых в датасете есть не менее 500 примеров):

* Баранина
* Ягнятина
* Индейка
* Говядина
* Свинина
* Кура
* Цыпленок
* Гусь
* Буйволятина
* Оленина
* Конина
* Телятина
* Кролик
* Утка
* Куропатка
* Перепел
* Глухарь
* Страус
* Заяц
* Кенгуру
* Изюбр
* Кабан
* Коза
* Косуля
* Лось
* Марал
* Медвежатина
* Бобер
* Цесарка
* Нутрия
* Рябчик
* Тетерев
* Фазан
* Як



In [4]:
!pip install pymorphy2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2)
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt>=0.6 (from pymorphy2)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13707 sha

In [5]:
import pandas as pd
import numpy as np
import re

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

import pymorphy2
import nltk

from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('meatinfo.csv', sep = ';')

In [3]:
data.head()

Unnamed: 0,text,mtype
0,12 частей баранина 12 частей баранина,Баранина
1,"Баранина, 12 частей, зам. цена 260 руб.",Баранина
2,"Баранина, 12 частей, зам. цена 315 руб.",Баранина
3,"Баранина, 12 частей, охл.",Баранина
4,"Баранина, 12 частей, охл. цена 220 руб.",Баранина


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17893 entries, 0 to 17892
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    17893 non-null  object
 1   mtype   17892 non-null  object
dtypes: object(2)
memory usage: 279.7+ KB


In [5]:
inf = data.groupby('mtype').size().reset_index(name='Count').sort_values(by='Count', ascending=False)

In [6]:
inf

Unnamed: 0,mtype,Count
11,Говядина,8422
33,Свинина,3050
23,Кура,1571
16,Индейка,1337
7,Баранина,1116
40,Цыпленок,942
22,Кролик,334
37,Утка,195
29,Оленина,193
20,Конина,176


In [7]:
data = data.apply(lambda x: x.str.lower() if x.dtype == 'object' else x)

In [8]:
data = data.query('mtype in ["баранина", "говядина", "индейка", "кура", "свинина", "цыпленок"]')

In [9]:
data['mtype'].unique()

array(['баранина', 'индейка', 'говядина', 'свинина', 'кура', 'цыпленок'],
      dtype=object)

Выводы по анализу данных:
1. Есть позиции, где данные плохо заполнены, но их не так много + они не выведут какую-то категорию в значение 500 наименований
2. после обработки данных у нас получается следующее количество категорий: 
    * баранина
    * говядина
    * индейка
    * кура
    * свинина
    * цыпленок

In [10]:
# Define the product classification model
class ProductClassificationModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(ProductClassificationModel, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.gru(embedded)
        output = self.fc(output[:, -1, :])
        return output


In [11]:
# Define the dataset class
class ProductDataset(Dataset):
    def __init__(self, data, labels, char_to_index):
        self.data = data
        self.labels = labels
        self.char_to_index = char_to_index
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        text = self.data[index]
        label = self.labels[index]
        
        # Convert text to numerical sequence
        sequence = [self.char_to_index[char] for char in text]
        
        return torch.tensor(sequence, dtype=torch.long), torch.tensor(label, dtype=torch.long)


In [12]:
# Extract the text data and labels from the loaded dataset
train_data = data['text'].tolist()
train_labels = data['mtype'].tolist()

In [13]:
# Create the character vocabulary and index mapping
chars = sorted(list(set("".join(train_data))))
char_to_index = {char: index for index, char in enumerate(chars)}

In [14]:
# Convert labels to numerical values
selected_products = ["баранина", "говядина", "индейка", "кура", "свинина", "цыпленок"]
label_to_index = {label: index for index, label in enumerate(selected_products)}
train_labels = [label_to_index[label] for label in train_labels]

In [15]:
# Split the dataset into train and test sets
train_data, test_data, train_labels, test_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)


In [16]:
# Define model parameters
input_size = len(chars)
hidden_size = 128
num_classes = len(selected_products)

In [17]:
# Create the model
model = ProductClassificationModel(input_size, hidden_size, num_classes)

In [18]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.05)

In [19]:
# Create the dataset and dataloader
#train_dataset = ProductDataset(train_data, train_labels, char_to_index)
#train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=lambda x: (pad_sequence([i[0] for i in x], batch_first=True), torch.stack([i[1] for i in x])))


In [20]:
# Create the dataset and dataloaders for train and test sets
train_dataset = ProductDataset(train_data, train_labels, char_to_index)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, 
                              collate_fn=lambda x: (pad_sequence([i[0] for i in x], batch_first=True), torch.stack([i[1] for i in x])))

test_dataset = ProductDataset(test_data, test_labels, char_to_index)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, 
                             collate_fn=lambda x: (pad_sequence([i[0] for i in x], batch_first=True), torch.stack([i[1] for i in x])))


In [21]:
%%time
# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in train_dataloader:
        optimizer.zero_grad()
        
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss}")


Epoch 1/10, Loss: 454.85530780255795
Epoch 2/10, Loss: 308.64812633395195
Epoch 3/10, Loss: 424.48013877123594
Epoch 4/10, Loss: 294.4394370839
Epoch 5/10, Loss: 330.30525102466345
Epoch 6/10, Loss: 368.98950226046145
Epoch 7/10, Loss: 392.5196315944195
Epoch 8/10, Loss: 352.97341907024384
Epoch 9/10, Loss: 315.7093812797684
Epoch 10/10, Loss: 329.47712165117264
CPU times: user 49min 13s, sys: 42 s, total: 49min 55s
Wall time: 49min 6s


In [22]:
# Test the model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_dataloader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")

Test Accuracy: 72.43%


In [1]:

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
def preprocess_text(text):
    # Remove special characters and digits
    text = re.sub(r"[^а-яА-Яa-zA-Z\s]", "", text)

    # Convert to lowercase
    text = text.lower()

    # Remove extra whitespaces
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [24]:
morph = pymorphy2.MorphAnalyzer()

def tokenize_and_encode(text):
    tokens = nltk.word_tokenize(text)  # Разбить текст на токены (слова)
    lemmas = [morph.parse(token)[0].normal_form for token in tokens]  # Применить лемматизацию к каждому слову
    encoded_tokens = [char_to_index.get(char, unknown_char_index) for char in lemmas]  # Кодирование слов в числовые индексы
    return tokens

In [25]:
# Test samples
test_samples = [
    "Свинина блочная 2 сорт в наличии ООО 'АгроСоюз' реализует блочную свинину 2 сорт (80/20). Свободный объем 8 тонн. Самовывоз или доставка. Все подробности по телефону.",
    "Куриная разделка Продам кур и куриную разделку гост и халяль по хорошей цене .Тел:",
    "Говяжью мукозу Продам говяжью мукозу в охл и замороженном виде. Есть объем."
]

# Create a dataframe
df = pd.DataFrame({"text": test_samples})

# Preprocess the text data
df["text"] = df["text"].apply(preprocess_text)

# Tokenize and encode the text data
df["encoded_text"] = df["text"].apply(tokenize_and_encode)

# Define the dataset
class TextDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe["encoded_text"].tolist()

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)

# Create an instance of the dataset
test_dataset = TextDataset(df)

# Define the dataloader
batch_size = 1
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Set the model in evaluation mode
model.eval()

# Run inference on the test dataset
predictions = []

model.eval()
with torch.no_grad():
    for inputs in test_dataloader:
        inputs = torch.tensor(inputs, dtype=torch.long)  # Convert inputs to a tensor
        inputs = inputs.unsqueeze(0)  # Add an extra dimension to the tensor
        outputs = model(inputs)
        predicted_labels = [selected_products[index.item()] for index in outputs.argmax(dim=1)]
        predictions.extend(predicted_labels)

# Add predictions to the dataframe
df["predicted_product"] = predictions

# Print the dataframe
print(df)


NameError: ignored