<a href="https://colab.research.google.com/github/Vicky-0222/NLP/blob/master/lab5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Современные инструменты и библиотеки
### Transformers, NLTK, Sentence-Transformers



## Collect data

#### Установка и импорт необходимых библиотек

In [None]:
!pip install pymorphy3

In [None]:
import pandas as pd
import numpy as np
import pymorphy3
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import SpectralClustering
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Download NLTK's 'punkt_tab' resource; word_tokenize (used in the
# preprocessing step) presumably needs it - verify for the installed nltk version.
nltk.download('punkt_tab')

#### Загрузка данных

In [None]:
# Read the movie table and discard every row that has a missing value in any column.
df = pd.read_csv('/content/movie.csv').dropna()

In [None]:
# 'overview' holds the texts to process, 'name' the category of each row.
texts, categories = df['overview'], df['name']

#### Визуализация

In [None]:
# Bar chart with the number of rows per category, bars ordered by value_counts().
category_order = categories.value_counts().index
fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(data=df, x='name', order=category_order, ax=ax)
ax.set_title('Распределение категорий')
ax.set_xlabel('Категории')
# Vertical tick labels so that the category names do not overlap.
ax.tick_params(axis='x', labelrotation=90)
ax.set_ylabel('Количество')
plt.show()

## Prepare data

#### Токенизация и лемматизация

In [None]:
# Single pymorphy3 analyzer instance, created once and reused for every token.
morph = pymorphy3.MorphAnalyzer()


def preprocessing(text):
    """Lower-case, tokenize and lemmatize a text.

    The text is split with nltk's word_tokenize. Only tokens for which
    str.isalpha() is true are kept, and each of them is replaced by the
    normal form of the first parse returned by the analyzer.

    Args:
        text: the input string.

    Returns:
        The lemmas joined by single spaces.
    """
    lemmas = []
    for word in word_tokenize(text.lower()):
        # Skip punctuation, numbers and any other non-alphabetic token.
        if not word.isalpha():
            continue
        first_parse = morph.parse(word)[0]
        lemmas.append(first_parse.normal_form)
    return ' '.join(lemmas)

In [None]:
# Run the preprocessing function on every overview and print the resulting series.
preprocessed_texts = texts.map(preprocessing)
print(preprocessed_texts)

#### Векторизация через TFIDF

In [None]:
# Turn the lemmatized texts into a TF-IDF matrix with the vocabulary capped at 5000 terms.
MAX_TFIDF_FEATURES = 5000
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
X = vectorizer.fit_transform(preprocessed_texts)

#### Кластеризация с помощью Spectral Clustering

In [None]:
# Standardize the TF-IDF features; the matrix is converted to a dense array first.
dense_features = X.toarray()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(dense_features)
# The unscaled dense copy is not needed any more; release it.
del dense_features

In [None]:
# Spectral clustering with a nearest-neighbours affinity and a fixed random seed.
spectral_params = {
    'n_clusters': 19,
    'affinity': 'nearest_neighbors',
    'n_neighbors': 10,
    'random_state': 42,
}
spectral = SpectralClustering(**spectral_params)
labels = spectral.fit_predict(X_scaled)

#### Визуализация

In [None]:
from sklearn.decomposition import PCA

# Project the standardized features onto two principal components for plotting.
# random_state is fixed (same seed as the rest of the notebook) because PCA's
# automatic solver choice can fall on a randomized SVD for large inputs, which
# would otherwise make the projection differ between runs.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

In [None]:
# Scatter plot of the two PCA components, coloured by cluster label.
first_component, second_component = X_pca[:, 0], X_pca[:, 1]
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=first_component,
    y=second_component,
    hue=labels,
    palette='viridis',
    legend='full',
    ax=ax,
)
ax.set_title('Spectral Clustering (PCA projection)')
plt.show()

### Сравнение результатов с реальной разметкой

In [None]:
# The silhouette score is computed only when more than one distinct label exists.
if len(set(labels)) <= 1:
    print("Все точки в одном кластере!")
else:
    score = silhouette_score(X_scaled, labels)
    print(f"Silhouette Score: {score:.3f}")

# Store the cluster id of every row and print, for each cluster in ascending
# order, the five most frequent values of 'name'.
df['cluster'] = labels
for cluster, members in df.groupby('cluster'):
    print(f"\n Кластер {cluster}:")
    print(members['name'].value_counts().head(5))

## Разделение на train, test и val выборки

In [None]:
# Hold out 30 % of the rows, then halve the hold-out into validation and test parts.
train_data, temp_data = train_test_split(df, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Print the three parts in the order train, test, validation.
for split_frame in (train_data, test_data, val_data):
    print(split_frame)


## Подбор модели на HuggingFace

#### Установка и импорт необходимых библиотек

In [None]:
!pip install transformers datasets evaluate

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

#### Подготовка данных

In [None]:
# Encode the cluster ids as integer class labels.
# The encoder is fitted on the cluster ids of all three splits: when fitted on
# the training split alone, transform() raises ValueError as soon as a cluster
# occurs only in the validation or test split.
label_encoder = LabelEncoder()
label_encoder.fit(
    pd.concat([train_data['cluster'], val_data['cluster'], test_data['cluster']])
)

# assign() returns new frames instead of writing a column into the objects
# produced by train_test_split, which avoids pandas' SettingWithCopyWarning
# and keeps the cell safe to re-run.
train_data = train_data.assign(label=label_encoder.transform(train_data['cluster']))
val_data = val_data.assign(label=label_encoder.transform(val_data['cluster']))
test_data = test_data.assign(label=label_encoder.transform(test_data['cluster']))

In [None]:
# Wrap the text column and the integer label of every split in a HuggingFace Dataset.
hf_columns = ['overview', 'label']
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_frame[hf_columns])
    for split_frame in (train_data, val_data, test_data)
)

#### Токенизация данных

In [None]:
# Checkpoint shared by the tokenizer and the classification model.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the "overview" field of a batch of examples.

    Truncation is enabled and padding uses the "max_length" strategy.
    """
    overviews = examples["overview"]
    return tokenizer(overviews, padding="max_length", truncation=True)


# Tokenize each split in batched mode.
tokenized_train, tokenized_val, tokenized_test = (
    dataset.map(tokenize_function, batched=True)
    for dataset in (train_dataset, val_dataset, test_dataset)
)

#### Метрики предсказания

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for a set of predictions.

    Args:
        eval_pred: a pair that unpacks into (logits, labels).

    Returns:
        A dict with the keys 'accuracy' and 'f1'.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted)
    weighted_f1 = f1_score(references, predicted, average='weighted')
    return {'accuracy': accuracy, 'f1': weighted_f1}

#### Определение модели и параметров

In [None]:
# Classification head sized to the number of classes known to the label encoder.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_encoder.classes_),
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    # Evaluate on the validation set after every epoch. With the default
    # strategy ("no") nothing is evaluated during training, so
    # metric_for_best_model would have no effect.
    # NOTE: this argument is called `evaluation_strategy` in transformers < 4.41.
    eval_strategy="epoch",
    # load_best_model_at_end requires the save strategy to match the evaluation strategy.
    save_strategy="epoch",
    # After training, restore the checkpoint with the best validation F1.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

## Обучение, валидация во время обучения и тестирование модели

In [None]:
# Bundle the model, the training configuration, the train/validation datasets
# and the metric function, then start training.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

trainer.train()

In [None]:
# Evaluate on the held-out test split and report both metrics with three decimals.
test_results = trainer.evaluate(tokenized_test)
test_accuracy = test_results['eval_accuracy']
test_f1 = test_results['eval_f1']
print(f"Test Accuracy: {test_accuracy:.3f}")
print(f"Test F1-score: {test_f1:.3f}")