In [None]:
import math
import os
import random
import sys
from pathlib import Path

import numpy as np
import pandas as pd

import torch

# Report the runtime environment. The interpreter version is read from the
# public `sys` module; `os.sys` only works because `os` happens to import `sys`
# internally and is not a documented attribute.
print("Версия Python:", sys.version)
print("Версия PyTorch:", torch.__version__)
print("CUDA доступна:", torch.cuda.is_available())

# Single device handle used by every model and tensor in the notebook:
# the GPU when CUDA is available, otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE


In [None]:
def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value passed to every generator (default 42).

    The CUDA generators are seeded as well, but only when CUDA is available.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed the Python, NumPy and PyTorch RNGs once, before any data or model code runs.
set_seed(42)

In [None]:
#pip install transformers datasets accelerate sentencepiece

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from datasets import Dataset

# Checkpoint identifier handed to `from_pretrained` for both tokenizer and model
# (presumably a Hugging Face Hub id — confirm it is reachable from this environment).
MODEL_NAME = "Vikhrmodels/Vikhr-Gemma-2B-instruct"  
print("Модель:", MODEL_NAME)

# Load the tokenizer and the causal-LM weights, then move the model to DEVICE.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)

# `tokenize_function` pads to a fixed length, so a pad token must exist.
# Reuse EOS only when the tokenizer does not already define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Root folder for every CSV/audio input of the notebook, relative to the
# working directory. Later sections reuse DATA_DIR.
DATA_DIR = Path("./data")  
train_llm_path = DATA_DIR / "train_llm.csv"
test_llm_path = DATA_DIR / "test_llm.csv"

# Later cells read the "prompt" and "response" columns of the train file
# and the "prompt" column of the test file.
train_llm = pd.read_csv(train_llm_path)
test_llm = pd.read_csv(test_llm_path)

print("train_llm shape:", train_llm.shape)
print("test_llm shape:", test_llm.shape)
train_llm.head()


In [None]:
def build_instruction(example):
    """Render one training example as the instruction text used for fine-tuning.

    Args:
        example: Mapping (dict or DataFrame row) with the keys ``'prompt'``
            (the text to classify) and ``'response'`` (the target label).

    Returns:
        The filled-in template as a single string. The template body, including
        the leading spaces on its continuation lines, is part of the training
        format and must not be reformatted.
    """
    text = example['prompt']
    answer = example['response']
    return f"""Задача: Классифицируй текст на одну из двух категорий.
            Текст: {text}

            Классифицируй текст как:
            - 1 (положительный), если текст относится к положительному классу
            - 0 (отрицательный), если текст относится к отрицательному классу

            Ответ: {answer}"""

# Render every training row into its full instruction text (prompt + expected answer).
train_texts = [build_instruction(row) for _, row in train_llm.iterrows()]

# NOTE(review): `Dataset` here is `datasets.Dataset`. A later cell re-imports the
# name `Dataset` from `torch.utils.data`, so re-running this cell after that one
# will fail on `from_dict` until the `datasets` import cell is executed again.
train_dataset = Dataset.from_dict({"text": train_texts})

def tokenize_function(batch):
    """Tokenize a batch of instruction texts into fixed-length sequences.

    Args:
        batch: Mapping with a ``"text"`` entry holding the strings to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch; inputs
        are truncated and padded to exactly 512 tokens.
    """
    encode_options = {
        "truncation": True,
        "max_length": 512,
        "padding": "max_length",
    }
    texts = batch["text"]
    return tokenizer(texts, **encode_options)

# Tokenize in batches and drop the raw "text" column so only the tokenizer's
# output columns remain in the dataset.
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_train


In [None]:
# Checkpoints and trainer state are written here.
output_dir = "./llm_russian_baseline"

# mlm=False selects the causal language-modeling objective (no token masking).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# 2 samples per device x 4 accumulation steps = 8 samples per optimizer step per device.
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=1,          # increase if needed
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    warmup_steps=50,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Fine-tunes `model` in place; the same object is used for generation afterwards.
trainer.train()


In [None]:
# Put the fine-tuned model into evaluation mode before generating answers.
model.eval()

def build_prompt_for_inference(prompt_text: str) -> str:
    """Build the generation prompt in exactly the format used for training.

    The previous hand-written "Инструкция: ... Ответ:" prompt did not match the
    template produced by ``build_instruction``, so the fine-tuned model was
    queried in a format it never saw. The prompt is now derived from
    ``build_instruction`` itself with an empty response, which keeps the two
    formats in sync by construction.

    Args:
        prompt_text: The text to classify.

    Returns:
        The training template filled with ``prompt_text`` and ending right
        after the answer marker ("Ответ:"), with no trailing whitespace, so the
        model's continuation is the answer.
    """
    template = build_instruction({"prompt": prompt_text, "response": ""})
    # The template ends with "Ответ: " + response; drop the dangling space.
    return template.rstrip()

def generate_answer(prompt_text: str, max_new_tokens: int = 64, do_sample: bool = False) -> str:
    """Generate the model's answer for one input text.

    Args:
        prompt_text: The text to classify; wrapped by
            ``build_prompt_for_inference`` before tokenization.
        max_new_tokens: Upper bound on the number of generated tokens.
        do_sample: When False (default) decoding is greedy, so the same input
            always yields the same answer — the right choice for a 0/1 label.
            When True, nucleus sampling (top_p=0.9, temperature=0.8) is used.

    Returns:
        The decoded continuation only (prompt excluded), stripped of
        surrounding whitespace.
    """
    input_text = build_prompt_for_inference(prompt_text)
    inputs = tokenizer(
        input_text,
        return_tensors="pt"
    ).to(DEVICE)
    # Number of prompt tokens; everything after this offset is newly generated.
    prompt_len = inputs["input_ids"].shape[1]

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        # Sampling knobs are only meaningful (and only accepted silently) when sampling.
        generation_kwargs.update(top_p=0.9, temperature=0.8)

    with torch.no_grad():
        output_ids = model.generate(**inputs, **generation_kwargs)

    # Decode only the continuation. Decoding the whole sequence and splitting on
    # the first "Ответ:" returns the wrong text whenever the prompt itself
    # contains that marker.
    new_token_ids = output_ids[0, prompt_len:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()


# Generate one answer per test row, in order, so the list lines up with test_llm.
test_predictions = []
for i, row in test_llm.iterrows():
    ans = generate_answer(row["prompt"])
    test_predictions.append(ans)
    # `i` is the DataFrame index label; with the default index from read_csv
    # this prints the first three rows as a visual sanity check.
    if i < 3:
        print("PROMPT:", row["prompt"])
        print("ANSWER:", ans)
        print("=" * 50)

# The submission is the test frame plus a "response" column with the generated answers.
submission_llm = test_llm.copy()
submission_llm["response"] = test_predictions

submission_llm_path = Path("./submission_llm.csv")
submission_llm.to_csv(submission_llm_path, index=False)
submission_llm_path


#Звук

In [None]:
# torchaudio is required by the audio section: report a hint and re-raise the
# original ImportError if it is missing, otherwise print the installed version.
try:
    import torchaudio
except ImportError:
    print("torchaudio не найден. При необходимости установи или замени на librosa.")
    raise
else:
    print("torchaudio версия:", torchaudio.__version__)


In [None]:
# Metadata tables for the audio task. Later cells read the "path" column
# (resolved relative to DATA_DIR) and, for the train file, the "label" column.
train_audio_path = DATA_DIR / "train_audio.csv"
test_audio_path = DATA_DIR / "test_audio.csv"

train_audio = pd.read_csv(train_audio_path)
test_audio = pd.read_csv(test_audio_path)

print(train_audio.head())
print(test_audio.head())

# Map each distinct label to a class index 0..K-1. Sorting the labels makes the
# mapping independent of row order. id2label is the inverse, used to decode predictions.
label2id = {lbl: i for i, lbl in enumerate(sorted(train_audio["label"].unique()))}
id2label = {i: lbl for lbl, i in label2id.items()}

num_classes = len(label2id)
num_classes, label2id


In [None]:
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

class AudioDataset(Dataset):
    """Dataset that turns audio files into normalized log-mel spectrograms.

    Each item is loaded from ``data_dir / row["path"]``, resampled to
    ``sample_rate`` when the file uses a different rate, mixed down to mono,
    converted to a 64-band mel spectrogram in dB and standardized to zero mean
    and unit variance per clip.

    Args:
        df: Table with a ``"path"`` column and, for labelled data, a
            ``"label"`` column.
        data_dir: Directory the relative paths in ``df`` are resolved against.
        label2id: Mapping from label value to class index; required to get
            labels back.
        train: When True and ``label2id`` is given, items are
            ``(spectrogram, label_id)`` pairs; otherwise just the spectrogram.
        sample_rate: Target sampling rate in Hz.

    NOTE(review): the spectrogram width depends on the clip length. Batching
    with the default collate function requires equally sized items — confirm
    all clips have the same duration or pad/crop before batching.
    """

    def __init__(self, df, data_dir: Path, label2id=None, train: bool = True, sample_rate: int = 16000):
        self.df = df.reset_index(drop=True)
        self.data_dir = data_dir
        self.train = train
        self.label2id = label2id
        self.sample_rate = sample_rate

        # Resamplers are created lazily, one per source rate actually seen in
        # the data. The source rate is only known after a file is loaded, so a
        # single transform built up front with `orig_freq=None` cannot work.
        self._resamplers = {}
        self.melspec = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=1024,
            hop_length=256,
            n_mels=64
        )
        self.ampl2db = torchaudio.transforms.AmplitudeToDB()

    def __len__(self):
        return len(self.df)

    def _resample(self, waveform, orig_freq: int):
        """Resample ``waveform`` from ``orig_freq`` to ``self.sample_rate``."""
        resampler = self._resamplers.get(orig_freq)
        if resampler is None:
            resampler = torchaudio.transforms.Resample(orig_freq=orig_freq, new_freq=self.sample_rate)
            self._resamplers[orig_freq] = resampler
        return resampler(waveform)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        wav_path = self.data_dir / row["path"]

        waveform, sr = torchaudio.load(str(wav_path))
        if sr != self.sample_rate:
            waveform = self._resample(waveform, int(sr))

        # Mix multi-channel audio down to a single channel.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        melspec = self.melspec(waveform)
        melspec_db = self.ampl2db(melspec)
        # Per-clip standardization; the epsilon guards against a constant spectrogram.
        melspec_db = (melspec_db - melspec_db.mean()) / (melspec_db.std() + 1e-6)

        if self.train and self.label2id is not None:
            label = self.label2id[row["label"]]
            return melspec_db, label
        else:
            return melspec_db


In [None]:
class SimpleAudioCNN(nn.Module):
    """Small convolutional classifier over single-channel 2-D inputs.

    Three 3x3 convolutions (16, 32, 64 channels) with ReLU, 2x2 max-pooling
    after the first two, adaptive average pooling to 8x8, then a 128-unit
    hidden layer and a linear layer producing ``num_classes`` logits.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes: int):
        super().__init__()
        widths = (1, 16, 32, 64)
        self.conv1, self.conv2, self.conv3 = (
            nn.Conv2d(c_in, c_out, kernel_size=3, padding=1)
            for c_in, c_out in zip(widths, widths[1:])
        )
        # Fixed 8x8 output makes the classifier head independent of input size.
        self.pool = nn.AdaptiveAvgPool2d((8, 8))
        self.fc1 = nn.Linear(widths[-1] * 8 * 8, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return unnormalized class scores of shape (batch, num_classes)."""
        for conv in (self.conv1, self.conv2):
            x = F.max_pool2d(F.relu(conv(x)), 2)
        features = self.pool(F.relu(self.conv3(x)))
        flat = features.flatten(1)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled clips for validation, stratified by label so the
# class proportions are preserved; the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(train_audio, test_size=0.2, stratify=train_audio["label"], random_state=42)

# val_ds is built with train=True because it also needs labels; only the test
# set is unlabelled and yields bare spectrograms.
train_ds = AudioDataset(train_df, DATA_DIR, label2id=label2id, train=True)
val_ds = AudioDataset(val_df, DATA_DIR, label2id=label2id, train=True)
test_ds = AudioDataset(test_audio, DATA_DIR, label2id=None, train=False)

# Only the training loader shuffles; evaluation order must stay aligned with the tables.
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False)

# Module-level objects that train_epoch / eval_epoch read as globals.
audio_model = SimpleAudioCNN(num_classes=num_classes).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(audio_model.parameters(), lr=1e-3)

def train_epoch(model, loader):
    """Run one optimization pass over ``loader``.

    Uses the module-level ``optimizer``, ``criterion`` and ``DEVICE``.

    Args:
        model: Network to train; switched to training mode.
        loader: Iterable of ``(inputs, targets)`` batches.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen, with each batch loss
        weighted by its batch size.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0, 0, 0
    for inputs, targets in loader:
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
        batch_size = inputs.size(0)

        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item() * batch_size
        n_correct += (logits.argmax(dim=1) == targets).sum().item()
        n_seen += batch_size
    return loss_sum / n_seen, n_correct / n_seen

def eval_epoch(model, loader):
    """Evaluate ``model`` on ``loader`` without updating any weights.

    Uses the module-level ``criterion`` and ``DEVICE``.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        loader: Iterable of ``(inputs, targets)`` batches.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen, with each batch loss
        weighted by its batch size.
    """
    model.eval()
    loss_sum = n_correct = n_seen = 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(DEVICE)
            targets = targets.to(DEVICE)
            batch_size = inputs.size(0)

            logits = model(inputs)
            loss_sum += criterion(logits, targets).item() * batch_size
            n_correct += (logits.argmax(dim=1) == targets).sum().item()
            n_seen += batch_size
    return loss_sum / n_seen, n_correct / n_seen

# Number of full passes over the training split.
EPOCHS = 3
for epoch in range(1, EPOCHS + 1):
    # One training pass followed by a validation pass; metrics are only printed,
    # no checkpoint selection or early stopping is performed.
    tr_loss, tr_acc = train_epoch(audio_model, train_loader)
    val_loss, val_acc = eval_epoch(audio_model, val_loader)
    print(f"Epoch {epoch}: train_loss={tr_loss:.4f}, train_acc={tr_acc:.4f}, val_loss={val_loss:.4f}, val_acc={val_acc:.4f}")


In [None]:
# Predict a class index for every test clip. test_loader does not shuffle, so
# predictions stay aligned with the rows of test_audio.
audio_model.eval()
test_preds_ids = []
with torch.no_grad():
    for x in test_loader:
        x = x.to(DEVICE)
        logits = audio_model(x)
        preds = logits.argmax(dim=1).cpu().numpy().tolist()
        test_preds_ids.extend(preds)

# Decode class indices back to the original label values.
test_labels = [id2label[i] for i in test_preds_ids]

submission_audio = test_audio.copy()
submission_audio["label"] = test_labels
submission_audio_path = Path("./submission_audio.csv")
submission_audio.to_csv(submission_audio_path, index=False)
submission_audio_path


In [None]:
# The recommender section is optional: it only runs when both CSV files exist.
train_rec_path = DATA_DIR / "train_rec.csv"
test_rec_path = DATA_DIR / "test_rec.csv"

if train_rec_path.exists() and test_rec_path.exists():
    # Later cells read the "user_id", "item_id" and (train only) "rating" columns.
    train_rec = pd.read_csv(train_rec_path)
    test_rec = pd.read_csv(test_rec_path)

    print(train_rec.head())
    print(test_rec.head())
else:
    # NOTE(review): nothing is unset here, so a `train_rec` left over from an
    # earlier run of this kernel would still enable the following cells.
    print("Файлы train_rec/test_rec не найдены. Пропусти этот блок или добавь свои файлы.")

In [None]:
# Mean-rating baseline; runs only when the recommender data was loaded.
if 'train_rec' in globals():
    # global_mean is also the cold-start fallback of the matrix-factorization cell.
    global_mean = train_rec["rating"].mean()

    # Per-item and per-user average ratings, indexed by item_id / user_id.
    item_mean = train_rec.groupby("item_id")["rating"].mean()
    user_mean = train_rec.groupby("user_id")["rating"].mean()

    def predict_baseline(u, i):
        # Fallback chain: item average -> user average -> global average.
        # `in` on a Series tests membership in its index (the ids).
        if i in item_mean:
            return item_mean[i]
        elif u in user_mean:
            return user_mean[u]
        else:
            return global_mean

    preds = []
    for _, row in test_rec.iterrows():
        u = row["user_id"]
        i = row["item_id"]
        preds.append(predict_baseline(u, i))

    submission_rec = test_rec.copy()
    submission_rec["rating"] = preds
    submission_rec_path = Path("./submission_rec_baseline.csv")
    submission_rec.to_csv(submission_rec_path, index=False)
    # NOTE(review): this expression sits inside the `if`, so the notebook does
    # not display it as cell output.
    submission_rec_path


In [None]:
# Matrix-factorization model; runs only when the recommender data was loaded.
# NOTE(review): this cell uses `nn` imported in the audio section and
# `global_mean` from the baseline cell, so those cells must have run first.
if 'train_rec' in globals():
    # Fix the id -> contiguous index mapping (embedding rows) from the training data.
    unique_users = train_rec["user_id"].unique()
    unique_items = train_rec["item_id"].unique()

    user2idx = {u: idx for idx, u in enumerate(unique_users)}
    item2idx = {i: idx for idx, i in enumerate(unique_items)}

    # Adds the index columns to train_rec in place.
    train_rec["user_idx"] = train_rec["user_id"].map(user2idx)
    train_rec["item_idx"] = train_rec["item_id"].map(item2idx)

    class RecDataset(torch.utils.data.Dataset):
        """Dataset of (user index, item index, rating) training triples.

        Args:
            df: Table with ``"user_idx"``, ``"item_idx"`` and ``"rating"`` columns.
        """

        def __init__(self, df):
            self.user_idx = df["user_idx"].values
            self.item_idx = df["item_idx"].values
            self.rating = df["rating"].values.astype("float32")

        def __len__(self):
            return len(self.rating)

        def __getitem__(self, idx):
            """Return ``(user_idx, item_idx, rating)`` as long, long and float32 tensors."""
            columns = (
                (self.user_idx, torch.long),
                (self.item_idx, torch.long),
                (self.rating, torch.float32),
            )
            return tuple(torch.tensor(values[idx], dtype=dtype) for values, dtype in columns)

    class MFModel(nn.Module):
        """Biased matrix factorization: ``<p_u, q_i> + b_u + b_i + b``.

        Args:
            n_users: Number of rows in the user embedding tables.
            n_items: Number of rows in the item embedding tables.
            n_factors: Dimension of the latent vectors.
        """

        def __init__(self, n_users, n_items, n_factors=32):
            super().__init__()
            self.user_emb = nn.Embedding(n_users, n_factors)
            self.item_emb = nn.Embedding(n_items, n_factors)
            self.user_bias = nn.Embedding(n_users, 1)
            self.item_bias = nn.Embedding(n_items, 1)
            self.global_bias = nn.Parameter(torch.zeros(1))

        def forward(self, user_idx, item_idx):
            """Return one predicted rating per (user_idx, item_idx) pair."""
            user_vec = self.user_emb(user_idx)
            item_vec = self.item_emb(item_idx)
            score = (user_vec * item_vec).sum(dim=1)
            # Bias lookups have shape (batch, 1); drop the trailing axis before adding.
            score = score + self.user_bias(user_idx).squeeze(-1)
            score = score + self.item_bias(item_idx).squeeze(-1)
            return score + self.global_bias

    # The whole training table, served in shuffled mini-batches of 1024 interactions.
    rec_ds = RecDataset(train_rec)
    rec_loader = torch.utils.data.DataLoader(rec_ds, batch_size=1024, shuffle=True)

    # Embedding table sizes come from the ids seen in training.
    n_users = len(unique_users)
    n_items = len(unique_items)
    mf_model = MFModel(n_users, n_items).to(DEVICE)
    mf_optimizer = torch.optim.Adam(mf_model.parameters(), lr=1e-2)
    mf_criterion = nn.MSELoss()

    EPOCHS_MF = 5  # tunable
    for epoch in range(1, EPOCHS_MF + 1):
        mf_model.train()
        total_loss = 0
        total_count = 0
        for u_idx, i_idx, r in rec_loader:
            u_idx = u_idx.to(DEVICE)
            i_idx = i_idx.to(DEVICE)
            r = r.to(DEVICE)

            mf_optimizer.zero_grad()
            pred = mf_model(u_idx, i_idx)
            loss = mf_criterion(pred, r)
            loss.backward()
            mf_optimizer.step()

            # Weight each batch's mean loss by its size so the epoch average is per sample.
            total_loss += loss.item() * r.size(0)
            total_count += r.size(0)
        print(f"[MF] Epoch {epoch}: train_loss={total_loss / total_count:.4f}")

    # Test predictions (if the item or user was not seen in training, fall back to global_mean).
    preds_mf = []
    mf_model.eval()
    with torch.no_grad():
        for _, row in test_rec.iterrows():
            u = row["user_id"]
            i = row["item_id"]
            # The model only has embeddings for ids present in the training data.
            if (u in user2idx) and (i in item2idx):
                u_idx = torch.tensor([user2idx[u]], dtype=torch.long).to(DEVICE)
                i_idx = torch.tensor([item2idx[i]], dtype=torch.long).to(DEVICE)
                r_hat = mf_model(u_idx, i_idx).item()
            else:
                # global_mean is defined in the mean-rating baseline cell.
                r_hat = global_mean
            preds_mf.append(r_hat)

    submission_rec_mf = test_rec.copy()
    submission_rec_mf["rating"] = preds_mf
    submission_rec_mf_path = Path("./submission_rec_mf.csv")
    submission_rec_mf.to_csv(submission_rec_mf_path, index=False)
    # NOTE(review): this expression sits inside the `if`, so the notebook does
    # not display it as cell output.
    submission_rec_mf_path
