In [1]:
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!pip install pyspark

✅ SparkSession активна: 4.0.1


In [2]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("LoadProcessedData")
    .getOrCreate()
)

print("✅ SparkSession активна:", spark.version)

ModuleNotFoundError: No module named 'google'

In [2]:
from google.colab import drive
drive.mount('/content/drive')

# Location of the processed data on the mounted Drive.
output_path = '/content/drive/MyDrive/Data/pro-ed_data'
# Read all part files at once.
df = spark.read.format("json").load(output_path)

df.printSchema()
df.show(n=5)

In [4]:
# Keep only the text and its emotion label, dropping rows where either is null.
df_nlp = (
    df.select("text", "emotion")
    .where(df.text.isNotNull() & df.emotion.isNotNull())
)


-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Row count per emotion label, i.e. the class distribution before sampling.
df_nlp.groupBy("emotion").count().show()


In [None]:
from pyspark.sql import functions as F
# Per-emotion sampling fractions used for stratified down-sampling.
# NOTE: sampleBy treats any emotion value that is missing from this dict as
# fraction 0, so rows with an unlisted label are dropped entirely.
fractions = {
    "joy": 0.2,
    "sadness": 0.2,
    "anger": 0.25,
    "fear": 0.8,
    "love": 0.8,
    "surprise": 1.0
}

# Fixed seed for repeatable sampling. The result is cached because several
# later actions (counts, toPandas) reuse it; without the cache every action
# would re-read the source data and redo the sampling.
df_small = df_nlp.sampleBy(
    "emotion",
    fractions=fractions,
    seed=42
).cache()

df_small.groupBy("emotion").count().show()


In [None]:
# Total number of rows kept after the stratified sampling.
df_small.count()

In [None]:
# Per-emotion row counts of the sampled data (same query as the one that ends the sampling cell).
df_small.groupBy("emotion").count().show()

In [None]:
# Convert the sampled Spark DataFrame into a pandas DataFrame for the sklearn/torch steps.
df_pd = df_small.toPandas()
# Sanity checks: overall shape and per-emotion counts after the conversion.
print(df_pd.shape)
print(df_pd["emotion"].value_counts())


In [None]:
!pip install -U scikit-learn
import sklearn
print(sklearn.__version__)

In [None]:
!pip install -U "transformers[torch]" accelerate


In [None]:
import torch
import sklearn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn emotion names into integer class ids stored in a new "label" column.
le = LabelEncoder()
df_pd["label"] = le.fit_transform(df_pd["emotion"])

# Two-way mappings between class id and class name; a class's id is its
# position in le.classes_.
id2label = dict(enumerate(le.classes_))
label2id = {name: idx for idx, name in id2label.items()}

print(label2id)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation, stratified on the label column;
# the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(
    df_pd, test_size=0.1, random_state=42, stratify=df_pd["label"]
)

print("Train:", train_df.shape)
print("Val:", val_df.shape)


In [None]:
import numpy as np
import torch
from sklearn.utils.class_weight import compute_class_weight

# "balanced" class weights computed from the label frequencies of the training split.
train_labels = train_df["label"]
balanced_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_labels),
    y=train_labels,
)

# Float tensor so it can be handed to a torch loss as its per-class `weight`.
class_weights = torch.tensor(balanced_weights, dtype=torch.float)
print("Class weights:", class_weights)


In [None]:
from transformers import AutoTokenizer
from torch.utils.data import Dataset

# Pretrained checkpoint name; used here to load the tokenizer and again below
# when the classification model is loaded.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

class LyricsDataset(Dataset):
    """Torch dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Texts to classify. Anything with ``.tolist()`` (e.g. a pandas
            Series) or any other iterable of strings.
        labels: Integer class ids, one per text, in the same order.
        tokenizer: Optional tokenizer callable. When omitted, the module-level
            ``tokenizer`` is looked up at item-access time (the original behavior).
        max_length: Length every example is truncated/padded to. Defaults to 128.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=128):
        # Accept pandas/NumPy containers as well as plain Python iterables.
        self.texts = texts.tolist() if hasattr(texts, "tolist") else list(texts)
        self.labels = labels.tolist() if hasattr(labels, "tolist") else list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self._tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return ``input_ids``, ``attention_mask`` and ``labels``."""
        # Fall back to the global tokenizer only when none was injected.
        tok = self._tokenizer if self._tokenizer is not None else tokenizer
        enc = tok(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        # return_tensors="pt" yields a leading batch dimension of 1; squeeze it
        # away so each item is a single example.
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }


In [None]:
from transformers import AutoModelForSequenceClassification

# Size the classification head to the number of emotion classes and pass the
# id/name mappings along to from_pretrained.
label_config = {
    "num_labels": len(label2id),
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForSequenceClassification.from_pretrained(model_name, **label_config)


In [None]:
from transformers import Trainer
import torch.nn as nn

class WeightedTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    Args:
        class_weights: Tensor of per-class weights, passed as ``weight`` to
            ``nn.CrossEntropyLoss``.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute weighted cross-entropy between the model's logits and ``inputs["labels"]``.

        Args:
            model: Model to run the forward pass on.
            inputs: Batch dict; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of just the loss.
            **kwargs: Extra arguments supplied by the Trainer; accepted and ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs["labels"]
        # Run the forward pass without labels so the model does not also compute
        # its own (unweighted) loss. A filtered copy is used instead of pop() so
        # the caller's batch dict is left untouched.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.logits

        # Move the weights to wherever the logits live (CPU or GPU).
        loss_fct = nn.CrossEntropyLoss(
            weight=self.class_weights.to(logits.device)
        )
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss



In [None]:
import torch
# Report whether CUDA is usable and, if it is, the name of device 0.
has_gpu = torch.cuda.is_available()
print(has_gpu)
print(torch.cuda.get_device_name(0) if has_gpu else "NO GPU")


In [None]:
from transformers import TrainingArguments

# Training configuration: one epoch, small batches, no external experiment logging.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=1,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    # instead of failing when no GPU is attached.
    fp16=torch.cuda.is_available(),
    logging_steps=200,
    report_to="none"
)

trainer = WeightedTrainer(
    class_weights=class_weights,
    model=model,
    args=training_args,
    train_dataset=LyricsDataset(train_df.text, train_df.label),
    eval_dataset=LyricsDataset(val_df.text, val_df.label),
    # `tokenizer=` is deprecated on Trainer in favour of `processing_class=`
    # (this notebook installs the latest transformers release).
    processing_class=tokenizer
)

trainer.train()


In [None]:
from google.colab import drive
# Drive is already mounted by the data-loading cell; presumably repeated here so
# this part can be run on its own before saving the model — TODO confirm.
drive.mount('/content/drive')

In [None]:
# Evaluate on the validation dataset. No compute_metrics function is attached
# to the trainer at this point, so only the trainer's default outputs are returned.
metrics = trainer.evaluate()
metrics


In [None]:
import numpy as np
from sklearn.metrics import f1_score, classification_report

def compute_metrics(pred):
    """Compute macro- and weighted-average F1 for a set of predictions.

    Args:
        pred: Object exposing ``predictions`` (per-class scores with classes
            along axis 1) and ``label_ids`` (true class ids).

    Returns:
        dict with the keys ``"macro_f1"`` and ``"weighted_f1"``.
    """
    y_true = pred.label_ids
    # The predicted class is the highest-scoring column of each row.
    y_hat = np.argmax(pred.predictions, axis=1)

    scores = {}
    scores["macro_f1"] = f1_score(y_true, y_hat, average="macro")
    scores["weighted_f1"] = f1_score(y_true, y_hat, average="weighted")
    return scores

# Attach the metric function to the already-built trainer and evaluate again so
# the F1 scores are part of the returned metrics.
trainer.compute_metrics = compute_metrics
metrics = trainer.evaluate()
metrics



In [None]:
# Predict on the validation dataset and take the highest-scoring class per example.
preds = trainer.predict(trainer.eval_dataset)
y_true = preds.label_ids
y_pred = np.argmax(preds.predictions, axis=1)

# Pass `labels` explicitly so the report rows always line up with target_names,
# even if some class never appears in y_true / y_pred (otherwise sklearn raises
# on the length mismatch).
class_ids = list(range(len(id2label)))
print(
    classification_report(
        y_true,
        y_pred,
        labels=class_ids,
        target_names=[id2label[i] for i in class_ids]
    )
)

In [None]:
# Destination folder on the mounted Google Drive.
save_path = "/content/drive/MyDrive/lyrics_emotion_bert"

# Write the trained model and the tokenizer into the same folder.
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)