In [None]:
# !pip install -q transformers accelerate bitsandbytes

# DeepSeek model

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import time
from sklearn.metrics import classification_report, roc_auc_score
import warnings
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder
from transformers import AutoModel, AutoTokenizer
import torch
import ast
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
sklearn.set_config(transform_output="pandas")
warnings.filterwarnings('ignore')

In [2]:
# Load the labelled tickets and keep only the token-list column plus a clean
# integer cluster label.
df = (
    pd.read_csv('../../../tasks_support_system_ai/notebooks/NLP/labelled_data_1.csv')
    .drop(columns=['Unnamed: 0.1', 'Column1', 'Unnamed: 0', 'topic'])
    # Discard rows whose label is the '?' placeholder or is missing altogether.
    .loc[lambda frame: frame['cluster'].ne('?') & frame['cluster'].notna()]
    # Cast the label to int and shift it down by one.
    .assign(cluster=lambda frame: frame['cluster'].astype(int) - 1)
    .loc[:, ['texts_cmb', 'cluster']]
)
df

Unnamed: 0,texts_cmb,cluster
0,"['жалоба', 'абонент', 'просьба', 'ответ', 'зап...",1
1,"['жалоба', 'абонент', 'абонент', 'утверждать',...",0
2,"['жалоба', 'абонент', 'абонент', 'утверждать',...",0
3,"['жалоба', 'абонент', 'абонент', 'утверждать',...",0
4,"['жалоба', 'абонент', 'абонент', 'утверждать',...",0
...,...,...
10267,"['жалоба', 'абонент', 'абонент', 'утверждать',...",0
10268,"['жалоба', 'абонент', 'абонент', 'утверждать',...",0
10269,"['сбой', 'активация', 'просьба', 'помочь', 'ре...",2
10270,"['жалоба', 'абонент', 'абонент', 'утверждать',...",0


In [3]:
# Every texts_cmb entry is the string form of a Python list of tokens: parse it
# and glue the tokens into one space-separated string.
df["texts"] = [" ".join(ast.literal_eval(raw_tokens)) for raw_tokens in df["texts_cmb"]]
df = df.drop(columns=["texts_cmb"])

# Fixed random_state keeps the 80/20 split reproducible between runs.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

### Предобученная модель

In [7]:
model_name = "deepseek-ai/deepseek-coder-1.3b-base"

# 4-bit NF4 quantization with double quantization switched on.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# The tokenizer and the backbone are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

base_model = AutoModel.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    device_map="auto",
    quantization_config=bnb_config,
)

In [9]:
# Display the loaded backbone's module tree (layer types and shapes) as the cell output.
base_model

LlamaModel(
  (embed_tokens): Embedding(32256, 2048)
  (layers): ModuleList(
    (0-23): 24 x LlamaDecoderLayer(
      (self_attn): LlamaAttention(
        (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
        (k_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
        (v_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
        (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
      )
      (mlp): LlamaMLP(
        (gate_proj): Linear4bit(in_features=2048, out_features=5504, bias=False)
        (up_proj): Linear4bit(in_features=2048, out_features=5504, bias=False)
        (down_proj): Linear4bit(in_features=5504, out_features=2048, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
      (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
    )
  )
  (norm): LlamaRMSNorm((2048,), eps=1e-06)
  (rotary_emb): LlamaRotaryEmbedding()
)

In [11]:
# Number of distinct cluster labels present in df; used as the classifier's output size.
num_classes = df["cluster"].nunique()

In [13]:
class CustomDeepSeekClassifier(nn.Module):
    """Sequence classifier: a transformer backbone followed by a small linear head.

    The backbone's final hidden states are pooled into one vector per sequence,
    passed through dropout and LayerNorm, and projected to ``num_classes`` logits.

    Args:
        base_model: Backbone module. It is called with ``input_ids``,
            ``attention_mask``, ``output_hidden_states=True`` and
            ``return_dict=True`` and must return an object exposing
            ``hidden_states``.
        hidden_size: Width of the backbone's hidden states.
        num_classes: Number of output classes.
        pooling: How the sequence is reduced to a single vector:
            ``"first"`` (default, the original behaviour) takes position 0,
            ``"last"`` takes the last position whose attention mask is 1,
            ``"mean"`` averages the positions whose attention mask is 1.

    Raises:
        ValueError: If ``pooling`` is not one of the supported modes.

    NOTE(review): the backbone printed in this notebook is a ``LlamaModel``,
    which is presumably causal; if so, position 0 cannot see later tokens and
    ``"last"`` or ``"mean"`` pooling may be the better choice — confirm before
    retraining.
    """

    _POOLING_MODES = ("first", "last", "mean")

    def __init__(self, base_model, hidden_size, num_classes, pooling="first"):
        super().__init__()
        if pooling not in self._POOLING_MODES:
            raise ValueError(
                f"pooling must be one of {self._POOLING_MODES}, got {pooling!r}"
            )
        self.base_model = base_model
        self.pooling = pooling
        # Infer the head's dtype from the backbone's first parameter.
        dtype = next(base_model.parameters()).dtype

        self.dropout = nn.Dropout(0.1)
        self.norm = nn.LayerNorm(hidden_size).to(dtype)  # stabilisation
        self.classifier = nn.Linear(hidden_size, num_classes).to(dtype)

    def _pool(self, hidden_states, attention_mask):
        """Reduce ``(batch, seq, hidden)`` states to ``(batch, hidden)``."""
        # getattr keeps instances created before `pooling` existed working.
        mode = getattr(self, "pooling", "first")
        if mode == "first":
            return hidden_states[:, 0, :]

        if attention_mask is None:
            # Without a mask every position counts as a real token.
            if mode == "last":
                return hidden_states[:, -1, :]
            return hidden_states.mean(dim=1)

        mask = attention_mask.to(hidden_states.device)
        if mode == "last":
            # Index of the right-most position with mask == 1; this works for
            # both left- and right-padded batches.
            positions = torch.arange(mask.shape[1], device=mask.device)
            last_index = (mask.long() * positions).argmax(dim=1)
            batch_index = torch.arange(hidden_states.shape[0], device=mask.device)
            return hidden_states[batch_index, last_index]

        weights = mask.unsqueeze(-1).to(hidden_states.dtype)
        # clamp avoids division by zero for a row whose mask is all zeros.
        return (hidden_states * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)

    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
        """Compute class logits and, when labels are given, the cross-entropy loss.

        Args:
            input_ids: Token ids forwarded to the backbone.
            attention_mask: Optional mask forwarded to the backbone and used
                by ``"last"``/``"mean"`` pooling.
            labels: Optional class indices. When omitted no loss is computed.
            **kwargs: Extra keyword arguments forwarded to the backbone.

        Returns:
            dict with ``"logits"`` and ``"loss"``; ``"loss"`` is ``None`` when
            ``labels`` is ``None``.
        """
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
            **kwargs
        )

        last_hidden_state = outputs.hidden_states[-1]
        pooled = self._pool(last_hidden_state, attention_mask)
        pooled = self.dropout(pooled)
        pooled = self.norm(pooled)
        logits = self.classifier(pooled)

        loss = None
        if labels is not None:
            # The loss is computed in float32 regardless of the head's dtype.
            loss = nn.functional.cross_entropy(logits.float(), labels.long())
        return {"loss": loss, "logits": logits}


In [15]:
from torch.utils.data import Dataset

class ClusterDataset(Dataset):
    """Map-style dataset that pairs pre-tokenized texts with their cluster labels.

    All texts are tokenized once, up front, with truncation and padding enabled.

    Args:
        texts: Iterable of input strings.
        labels: Iterable of class labels, aligned positionally with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=...)``;
            its result must support ``["input_ids"]`` and ``["attention_mask"]``
            indexing, each indexable by sample position.
        max_length: Maximum sequence length passed to the tokenizer.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialize both inputs as plain lists so that __getitem__ is always
        # positional. A pandas Series with a non-default index would otherwise
        # be looked up by index label.
        texts = list(texts)
        labels = list(labels)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tensors for sample ``idx`` under the keys the training loop reads."""
        return {
            "input_ids": torch.tensor(self.encodings["input_ids"][idx]),
            "attention_mask": torch.tensor(self.encodings["attention_mask"][idx]),
            "labels": torch.tensor(self.labels[idx])
        }


In [17]:
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
import torch.nn.functional as F
from tqdm import tqdm

def train_epoch(model, dataloader, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask, labels=labels)``
            that returns a mapping containing a ``"loss"`` tensor.
        dataloader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
            It does not need to implement ``__len__``.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        float: Sum of per-batch losses divided by the number of batches, or
        ``nan`` when ``dataloader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        output = model(input_ids, attention_mask, labels=labels)
        loss = output['loss']

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Count batches explicitly instead of relying on len(dataloader), so
        # generators and other un-sized iterables work as well.
        num_batches += 1

    if num_batches == 0:
        # No batches means the mean loss is undefined.
        return float("nan")
    return total_loss / num_batches

def evaluate(model, dataloader, device):
    """Print a per-class classification report for ``model`` on ``dataloader``.

    The model is switched to eval mode and run without gradient tracking.
    Predictions are the argmax over the returned ``"logits"``. Nothing is
    returned; the report is written to stdout.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in dataloader:
            labels = batch["labels"].to(device)
            # Labels are passed through because the model is called with them
            # here exactly as during training.
            result = model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                labels=labels,
            )
            batch_preds = result['logits'].argmax(dim=1)

            predicted += list(batch_preds.cpu().numpy())
            expected += list(labels.cpu().numpy())

    print(classification_report(expected, predicted))


In [19]:
# Training hyperparameters, named so they can be tuned in one place.
BATCH_SIZE = 8
NUM_EPOCHS = 10
LEARNING_RATE = 2e-5

train_dataset = ClusterDataset(train_df["texts"].tolist(), train_df["cluster"].tolist(), tokenizer)
test_dataset = ClusterDataset(test_df["texts"].tolist(), test_df["cluster"].tolist(), tokenizer)

# Only the training batches are shuffled; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Take the head's input width from the backbone config rather than hard-coding
# it, and reuse the class count computed earlier in the notebook.
model = CustomDeepSeekClassifier(
    base_model,
    hidden_size=base_model.config.hidden_size,
    num_classes=num_classes,
)
model.to(device)

# Hand the optimizer only the parameters that can receive gradients.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=LEARNING_RATE)

for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch + 1}")
    train_loss = train_epoch(model, train_loader, optimizer, device)
    print(f"Train Loss: {train_loss:.4f}")
    

Epoch 1
Train Loss: 1.6373
Epoch 2
Train Loss: 1.3904
Epoch 3
Train Loss: 1.2976
Epoch 4
Train Loss: 1.2356
Epoch 5
Train Loss: 1.1873
Epoch 6
Train Loss: 1.1344
Epoch 7
Train Loss: 1.0957
Epoch 8
Train Loss: 1.0626
Epoch 9
Train Loss: 1.0347
Epoch 10
Train Loss: 1.0101


In [28]:
# Print the classification report for the trained model on the held-out test split.
evaluate(model, test_loader, device)

              precision    recall  f1-score   support

           0       0.58      1.00      0.74       787
           1       0.92      0.65      0.76        34
           2       0.57      0.13      0.22        90
           3       0.84      0.49      0.62       413
           4       1.00      0.41      0.58        95
           5       1.00      0.09      0.17       128
           6       1.00      0.12      0.21        69
           7       0.94      0.47      0.63        72
           8       0.96      0.88      0.92       246
           9       0.79      0.56      0.65        81

    accuracy                           0.68      2015
   macro avg       0.86      0.48      0.55      2015
weighted avg       0.77      0.68      0.64      2015



- Модель в целом предсказывает с приемлемым качеством: accuracy = 0.68 для многоклассовой задачи с 10 кластерами. Однако macro-F1 составляет лишь 0.55, то есть есть явные зоны для улучшения.

- Некоторые классы сильно преобладают по количеству примеров (например, класс 0 — 787, класс 1 — 34), что влияет на метрики:

    - Класс 0: высокая полнота (recall = 1.00) — модель почти всегда предсказывает его, но точность (precision = 0.58) невысока — много ложных срабатываний.

    - Классы 5, 6: высокие precision (1.00), но очень низкий recall (0.09, 0.12) — модель почти не предсказывает эти классы, но когда предсказывает, попадает точно.
    - Класс 2: все метрики низкие (precision = 0.57, recall = 0.13, f1 = 0.22) — модель почти не умеет его находить.

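    - Классы 4, 7: похожая, но менее выраженная проблема — при высокой precision (1.00 и 0.94) recall остаётся низким (0.41 и 0.47), то есть значительная часть примеров этих классов не обнаруживается.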


**Рекомендации по улучшению:**

- Применить class weights в CrossEntropyLoss или WeightedRandomSampler в DataLoader, чтобы компенсировать дисбаланс классов. Увеличить количество эпох обучения: train loss на 10-й эпохе (1.0101) всё ещё снижался.

- Аугментация малых классов или oversampling.

- Продолжить fine-tuning модели и использовать learning rate scheduler.

In [32]:
import os

# Directory that receives the serialized model; created if it does not exist yet.
save_path = "../../../tasks_support_system_ai/notebooks/NLP"
os.makedirs(save_path, exist_ok=True)
# Serializes the whole module object (not just a state_dict) to full_model.pt.
# NOTE(review): this is the only artifact this cell writes — no tokenizer files
# and no separate weights file are saved here.
torch.save(model, os.path.join(save_path, "full_model.pt"))

In [None]:
# Rebuild the classifier around a freshly loaded quantized backbone and restore
# its weights from a state_dict file.
# NOTE(review): the visible notebook only writes "full_model.pt" into save_path;
# nothing shown here saves tokenizer files or "pytorch_model.bin" there. Confirm
# those artifacts exist before running this cell, otherwise the two loads below
# are expected to fail.
tokenizer = AutoTokenizer.from_pretrained(save_path)

base_model = AutoModel.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float32
)

# hidden_size=2048 matches the Embedding(32256, 2048) shown in the backbone repr.
model = CustomDeepSeekClassifier(base_model, hidden_size=2048, num_classes=num_classes)
model.load_state_dict(torch.load(os.path.join(save_path, "pytorch_model.bin")))
model.to(device)
model.eval()

In [None]:
# Restore the classifier that was written with torch.save(model, ...).
# The file holds a fully pickled module, not a state_dict, so it must be loaded
# with weights_only=False; recent PyTorch releases default to weights_only=True,
# which rejects arbitrary pickled objects.
# SECURITY: unpickling can execute arbitrary code — only load files you created
# or trust. CustomDeepSeekClassifier must already be defined in this kernel.
model = torch.load(
    os.path.join(save_path, "full_model.pt"),
    map_location=device,  # allows loading on a machine without the original device
    weights_only=False,
)
model.to(device)
model.eval()

In [34]:
def classify_texts(texts: list[str], model, tokenizer, device, max_length: int = 128):
    """Predict a class index for each input text.

    Args:
        texts: Texts to classify. A single string is treated as one text.
        model: Callable invoked as
            ``model(input_ids=..., attention_mask=..., labels=...)`` returning
            a mapping with a ``"logits"`` tensor of shape ``(batch, classes)``.
            This function does not change the model's train/eval mode; the
            caller is expected to have called ``model.eval()``.
        tokenizer: Callable invoked with ``truncation=True``, ``padding=True``,
            ``return_tensors="pt"`` and ``max_length``; must return a mapping
            with ``"input_ids"`` and ``"attention_mask"`` tensors.
        device: Device the inputs are moved to.
        max_length: Maximum token length passed to the tokenizer.

    Returns:
        numpy.ndarray: One predicted class index per text (empty for empty input).
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        return np.empty(0, dtype=np.int64)

    encodings = tokenizer(
        texts, truncation=True, padding=True, return_tensors="pt", max_length=max_length
    )
    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)
    # The classifier in this notebook computes a loss from `labels`
    # unconditionally, so placeholder zeros are passed; the resulting loss is
    # ignored.
    placeholder_labels = torch.zeros(len(texts), dtype=torch.long, device=device)

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=placeholder_labels,
        )
        predictions = torch.argmax(outputs["logits"], dim=1).cpu().numpy()

    return predictions

# Smoke-test the helper on two short example requests and print the predicted
# cluster indices.
texts = ["Жалоба", "Заведите номер в blacklist"]
preds = classify_texts(texts, model, tokenizer, device)
print(preds)


[0 3]
