In [147]:
import pandas as pd
from deeppavlov import build_model, configs
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC




In [149]:
# Load the review dataset: a tab-separated, UTF-8 encoded file.
df = pd.read_csv(
    'women-clothing-accessories.3-class.balanced.csv',
    sep='\t',
    encoding="utf-8",
    engine='python',
)
print(df.head())

# Number of missing values in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

# How many reviews carry each sentiment label.
class_counts = df['sentiment'].value_counts()
print(class_counts)



                                              review sentiment
0  качество плохое пошив ужасный (горловина напер...  negative
1  Товар отдали другому человеку, я не получила п...  negative
2  Ужасная синтетика! Тонкая, ничего общего с пре...  negative
3  товар не пришел, продавец продлил защиту без м...  negative
4      Кофточка голая синтетика, носить не возможно.  negative
review       0
sentiment    0
dtype: int64
negative    30000
neautral    30000
positive    30000
Name: sentiment, dtype: int64


In [151]:
# Hold out 20% of the reviews as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Bag-of-words features with a vocabulary capped at 5000 entries.
# The vocabulary is learned from the training texts only and then reused for the test texts.
vectorizer = CountVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)



In [155]:
# Model 1: k-nearest neighbours (k = 5) trained on the bag-of-words features.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_vec, y_train)
y_pred_knn = knn_model.predict(X_test_vec)

# Overall accuracy first, then the per-class precision/recall/F1 table.
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print("KNN Accuracy:", knn_accuracy)
print(classification_report(y_test, y_pred_knn))

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNN Accuracy: 0.6203333333333333
              precision    recall  f1-score   support

    neautral       0.49      0.68      0.57      6060
    negative       0.65      0.52      0.58      5942
    positive       0.80      0.66      0.72      5998

    accuracy                           0.62     18000
   macro avg       0.65      0.62      0.63     18000
weighted avg       0.65      0.62      0.63     18000



In [156]:

# Model 2: gradient-boosted trees (100 estimators, fixed seed) on the bag-of-words features.
# X_train_vec / X_test_vec are the matrices built in the vectorization cell;
# the vectorizer is not re-fitted here because doing so on the same X_train
# would only repeat that work and yield the same features.
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train_vec, y_train)
y_pred_gb = gb_model.predict(X_test_vec)

# Overall accuracy followed by the per-class report.
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))
print(classification_report(y_test, y_pred_gb))


Gradient Boosting Accuracy: 0.6751666666666667
              precision    recall  f1-score   support

    neautral       0.54      0.68      0.60      6060
    negative       0.71      0.63      0.67      5942
    positive       0.84      0.71      0.77      5998

    accuracy                           0.68     18000
   macro avg       0.70      0.67      0.68     18000
weighted avg       0.70      0.68      0.68     18000



In [207]:
# Model 3: linear support-vector classifier (LinearSVC is imported together with
# the other dependencies in the first cell).
# class_weight='balanced' weights each class inversely to its frequency in y_train.
# The dataset itself is balanced (30000 reviews per class), so this only evens out
# the small skew introduced by the random train/test split.
svm_model = LinearSVC(random_state=42, class_weight='balanced')
svm_model.fit(X_train_vec, y_train)
y_pred_svm = svm_model.predict(X_test_vec)

# Overall accuracy followed by the per-class report.
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))




SVM Accuracy: 0.7217777777777777
              precision    recall  f1-score   support

    neautral       0.62      0.61      0.62      6060
    negative       0.71      0.71      0.71      5942
    positive       0.84      0.85      0.84      5998

    accuracy                           0.72     18000
   macro avg       0.72      0.72      0.72     18000
weighted avg       0.72      0.72      0.72     18000



In [209]:
# Evaluation sample for the pre-trained model: the first 100 rows of df.
# .copy() makes it an independent frame, so columns can be added to it without
# pandas raising SettingWithCopyWarning (df.head() returns a slice of df).
new_df = df.head(100).copy()

# Load the pre-trained sentiment model; download=True lets DeepPavlov fetch the
# model files when they are needed.
sentiment_model = build_model(configs.classifiers.rusentiment_convers_bert, download=True)

# Helper that maps one review text to a sentiment label.
def get_sentiment(text):
    """Return the sentiment label for ``text``.

    Values that are not strings, and strings that are empty or contain only
    whitespace, are labelled "neutral" without calling the model. Any other
    text is sent to ``sentiment_model`` as a one-element batch and the first
    element of the result is returned.
    """
    has_content = isinstance(text, str) and bool(text.strip())
    if has_content:
        return sentiment_model([text])[0]
    return "neutral"


# Predict a label for every review and align the ground-truth spelling with the
# predictions: the dataset writes the neutral class as "neautral", whereas
# get_sentiment's fallback (and the report produced for this model) uses "neutral",
# so without the mapping a neutral review could never be counted as correct.
# .assign() returns a new frame instead of writing into the df.head() slice,
# which avoids pandas' SettingWithCopyWarning and keeps the cell re-runnable.
new_df = new_df.assign(
    predicted_sentiment=new_df["review"].apply(get_sentiment),
    sentiment=new_df["sentiment"].replace({"neautral": "neutral"}),
)

# Metrics. zero_division=0 reports 0 for labels that have no predicted or no true
# samples (the value sklearn already falls back to) without emitting a warning each time.
print("DeepPavlov Accuracy:", accuracy_score(new_df["sentiment"], new_df["predicted_sentiment"]))
print(classification_report(new_df["sentiment"], new_df["predicted_sentiment"], zero_division=0))

2025-03-20 22:21:40.834 INFO in 'deeppavlov.download'['download'] at line 138: Skipped http://files.deeppavlov.ai/v1/classifiers/rusentiment_convers_bert/rusentiment_convers_bert_torch.tar.gz download because of matching hashes
Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertFor

DeepPavlov Accuracy: 0.8
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         2
     neutral       0.00      0.00      0.00         1
    positive       0.67      1.00      0.80         2

    accuracy                           0.80         5
   macro avg       0.56      0.67      0.60         5
weighted avg       0.67      0.80      0.72         5



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["predicted_sentiment"] = new_df["review"].apply(get_sentiment)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [211]:

# Collect the accuracy of every model for a side-by-side comparison.
# Keys are plain labels without a trailing ':' or space, because the code that
# prints this dict appends ": " itself (otherwise the output reads "SVM Accuracy:: ...").
results = {
    'SVM Accuracy': accuracy_score(y_test, y_pred_svm),
    'KNN Accuracy': accuracy_score(y_test, y_pred_knn),
    'Gradient Boosting Accuracy': accuracy_score(y_test, y_pred_gb),
    # NOTE: measured on new_df (the first rows of df), not on the held-out test
    # split used for the three models above, so it is not directly comparable.
    'DeepPavlov Accuracy': accuracy_score(new_df["sentiment"], new_df["predicted_sentiment"]),
}

In [213]:

# Print one "<name>: <accuracy>" line per model, accuracy rounded to four decimals.
for label, score in results.items():
    print("{}: {:.4f}".format(label, score))

SVM Accuracy:: 0.7218
KNN Accuracy:: 0.6203
Gradient Boosting: : 0.6752
DeepPavlov Accuracy: : 0.8000
