# Проект "Анализ тональности" для датасета twitter.csv

In [1]:
import re, string
import nltk
from nltk.corpus import stopwords
import numpy as np
import spacy

В данном проекте мы будем работать с датасетом твитов - twitter.csv

In [9]:
import pandas as pd
df = pd.read_csv('twitter.csv', encoding='utf-8')
df.head(5)

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


В данном датасете первая строка является названием столбцов, поэтому следует опустить её вниз и добавить название стобцов. Также лучше сразу заменить значения NULL пустыми строками, чтобы избежать дальнейших ошибок.

In [10]:
df.columns = df.iloc[0]
df = df[1:]
df.loc[-1] = df.columns
df.index = df.index + 1
df.sort_index()
df.columns = ['id', 'entity', 'sentiment', 'text']
df = df.reset_index(drop=True)
df.index += 1
pd.set_option('display.max_colwidth', None)
df['text'] = df['text'].fillna('')
df.head(5)

Unnamed: 0,id,entity,sentiment,text
1,2401,Borderlands,Positive,"im getting on borderlands and i will kill you all,"
2,2401,Borderlands,Positive,"im coming on borderlands and i will murder you all,"
3,2401,Borderlands,Positive,"im getting on borderlands 2 and i will murder you me all,"
4,2401,Borderlands,Positive,"im getting into borderlands and i can murder you all,"
5,2402,Borderlands,Positive,So I spent a few hours making something for fun. . . If you don't know I am a HUGE @Borderlands fan and Maya is one of my favorite characters. So I decided to make myself a wallpaper for my PC. . Here is the original image versus the creation I made :) Enjoy! pic.twitter.com/mLsI5wf9Jg


# 1. Реализуем sentiment analysis с помощью rule-based метода

Считаем словарь положительных и отрицательных слов английского языка и добавим их в один общий словарь

In [11]:
with open('positive-words.txt', "r", encoding='utf-8') as f:
  pos = f.readlines()
pos = [i.strip() for i in pos]

In [12]:
with open('negative-words.txt', "r", encoding='utf-8', errors='ignore') as f1:
  neg = f1.readlines()
neg = [i.strip() for i in neg]

In [13]:
sdict = {}
for i in pos:
  sdict[i] = 1
for i in neg:
  sdict[i] = -1

In [14]:
def clean_text(text):
    if not isinstance(text, str): #если не строка, возвращаем пустую строку
      return ''
    text = re.sub(r'\d+', '', text)  #удаляем цифры
    text = text.lower()  #нижний регистр
    text = re.sub(r'@\w+', '', text)  #удаляем никнеймы
    text = re.sub(r'\[.*?\]', ' ', text)  #удаляем текст в квадратных скобках
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  #удаляем ссылки
    text = re.sub(r'<.*?>+', ' ', text)  #удаляем HTML-теги
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)
    #удаляем пунктуацию
    text = re.sub(r'\n', ' ', text)  #удаляем переносы строк
    text = re.sub(r'\s+', ' ', text).strip() #удаляем лишние пробелы
    return text.strip()

Загружаем модель англйского языка для работы с токенизацией и лемматизацией. Также загружаем стоп-слова, которые мы будем удалять из текстов.

In [None]:
spacy_nlp = spacy.load("en_core_web_sm")
nltk.download('stopwords')
sw = stopwords.words('english')
nsw = ["doesn't", "didn't", "not", "haven't", "hasn't",
                       "isn't", "wasn't", "aren't", "won't", "mustn't",
                       "shouldn't", "can't", "couldn't", "cannot",
                       "needn't", "weren't", "don't", "no", "nor", "hadn't",
        "wouldn't"]
sw = [x for x in sw if x not in nsw]

In [15]:
def lemmatize_spacy(text): # лемматизация текстов
    doc = spacy_nlp(text)
    return [token.lemma_ for token in doc]

def remove_stopwords(tokens): # удаление стоп-слов
    return [t for t in tokens if t not in sw]

Применяем вышеперечисленные функции к датасету и создаем три стобца. В последнем из них будет представлен лемматизированный и очищенный список слов, из которых составлен каждый твит - df['best'].

In [16]:
df['clean_text'] = df['text'].apply(clean_text)
df['lem'] = df['clean_text'].apply(lemmatize_spacy)
df['best'] = df['lem'].apply(remove_stopwords)

Функция calculate-sentiment расчитывает коэффициент score, для каждого твита. Сначала проверяем на наличие интенсификаторов, которые на один увеличивают вес следующего слова по модулю. Далее проверяем на отрицание: если оно имеется, то вес следующего слова меняется на противоположный. Проверяем слово в словаре и добавляем его вес к общему score для твита, иначе добавляем 0.

In [17]:
def calculate_sentiment(text_tokens, sentiment_dict):
    score = 0
    for i, token in enumerate(text_tokens):
        sentiment = sentiment_dict.get(token, "Neutral")
        if token in  ["very", "extremely", "highly", "incredibly", "absolutely",
    "totally", "completely", "utterly", "really", "seriously"]:
            if i + 1 < len(text_tokens):
                next_token = text_tokens[i + 1]
                next_sentiment = sentiment_dict.get(next_token, "Neutral")
                if next_sentiment == 1:
                    score += 2
                elif next_sentiment == -1:
                    score -= 2
        elif token in ["doesn't", "didn't", "not", "haven't", "hasn't",
                       "isn't", "wasn't", "aren't", "won't", "mustn't",
                       "shouldn't", "can't", "couldn't", "cannot",
                       "needn't", "weren't", "don't", "no", "nor", "hadn't",
        "wouldn't"]:
            if i + 1 < len(text_tokens):
                next_token = text_tokens[i + 1]
                next_sentiment = sentiment_dict.get(next_token, "Neutral")
                if next_sentiment == 1:
                    score -= 1
                elif next_sentiment == -1:
                    score += 1
                continue
        if sentiment == 1:
            score += 1
        elif sentiment == -1:
            score -= 1
    return score

Ставим соответствующую метку для твита, в зависимости от score (больше нуля, меньше нуля или равно нулю)

In [18]:
def categorize_sentiment(score):
    if score > 0:
        return "Positive"
    elif score < 0:
        return "Negative"
    else:
        return "Neutral"

Применяем вышеперечисленные функции к датасету и создаём новый столбец sentiment_category, куда запишем результат анализа тональности для каждого твита

In [19]:
df['sentiment_score'] = df['best'].apply(lambda x: calculate_sentiment(x,
                                                                        sdict))
df['sentiment_category'] = df['sentiment_score'].apply(categorize_sentiment)
df.head(10)

Unnamed: 0,id,entity,sentiment,text,clean_text,lem,best,sentiment_score,sentiment_category
1,2401,Borderlands,Positive,"im getting on borderlands and i will kill you all,",im getting on borderlands and i will kill you all,"[I, m, get, on, borderland, and, I, will, kill, you, all]","[I, get, borderland, I, kill]",-1,Negative
2,2401,Borderlands,Positive,"im coming on borderlands and i will murder you all,",im coming on borderlands and i will murder you all,"[I, m, come, on, borderland, and, I, will, murder, you, all]","[I, come, borderland, I, murder]",-1,Negative
3,2401,Borderlands,Positive,"im getting on borderlands 2 and i will murder you me all,",im getting on borderlands and i will murder you me all,"[I, m, get, on, borderland, and, I, will, murder, you, I, all]","[I, get, borderland, I, murder, I]",-1,Negative
4,2401,Borderlands,Positive,"im getting into borderlands and i can murder you all,",im getting into borderlands and i can murder you all,"[I, m, get, into, borderland, and, I, can, murder, you, all]","[I, get, borderland, I, murder]",-1,Negative
5,2402,Borderlands,Positive,So I spent a few hours making something for fun. . . If you don't know I am a HUGE @Borderlands fan and Maya is one of my favorite characters. So I decided to make myself a wallpaper for my PC. . Here is the original image versus the creation I made :) Enjoy! pic.twitter.com/mLsI5wf9Jg,so i spent a few hours making something for fun if you dont know i am a huge fan and maya is one of my favorite characters so i decided to make myself a wallpaper for my pc here is the original image versus the creation i made enjoy pictwittercommlsiwfjg,"[so, I, spend, a, few, hour, make, something, for, fun, if, you, do, not, know, I, be, a, huge, fan, and, maya, be, one, of, my, favorite, character, so, I, decide, to, make, myself, a, wallpaper, for, my, pc, here, be, the, original, image, versus, the, creation, I, make, enjoy, pictwittercommlsiwfjg]","[I, spend, hour, make, something, fun, not, know, I, huge, fan, maya, one, favorite, character, I, decide, make, wallpaper, pc, original, image, versus, creation, I, make, enjoy, pictwittercommlsiwfjg]",3,Positive
6,2402,Borderlands,Positive,"So I spent a couple of hours doing something for fun... If you don't know that I'm a huge @ Borderlands fan and Maya is one of my favorite characters, I decided to make a wallpaper for my PC.. Here's the original picture compared to the creation I made:) Have fun! pic.twitter.com / mLsI5wf9Jg",so i spent a couple of hours doing something for fun if you dont know that im a huge borderlands fan and maya is one of my favorite characters i decided to make a wallpaper for my pc heres the original picture compared to the creation i made have fun pictwittercom mlsiwfjg,"[so, I, spend, a, couple, of, hour, do, something, for, fun, if, you, do, not, know, that, I, m, a, huge, borderland, fan, and, maya, be, one, of, my, favorite, character, I, decide, to, make, a, wallpaper, for, my, pc, here, the, original, picture, compare, to, the, creation, I, make, have, fun, pictwittercom, mlsiwfjg]","[I, spend, couple, hour, something, fun, not, know, I, huge, borderland, fan, maya, one, favorite, character, I, decide, make, wallpaper, pc, original, picture, compare, creation, I, make, fun, pictwittercom, mlsiwfjg]",3,Positive
7,2402,Borderlands,Positive,So I spent a few hours doing something for fun... If you don't know I'm a HUGE @ Borderlands fan and Maya is one of my favorite characters.,so i spent a few hours doing something for fun if you dont know im a huge borderlands fan and maya is one of my favorite characters,"[so, I, spend, a, few, hour, do, something, for, fun, if, you, do, not, know, I, m, a, huge, borderland, fan, and, maya, be, one, of, my, favorite, character]","[I, spend, hour, something, fun, not, know, I, huge, borderland, fan, maya, one, favorite, character]",2,Positive
8,2402,Borderlands,Positive,So I spent a few hours making something for fun. . . If you don't know I am a HUGE RhandlerR fan and Maya is one of my favorite characters. So I decided to make myself a wallpaper for my PC. . Here is the original image versus the creation I made :) Enjoy! pic.twitter.com/mLsI5wf9Jg,so i spent a few hours making something for fun if you dont know i am a huge rhandlerr fan and maya is one of my favorite characters so i decided to make myself a wallpaper for my pc here is the original image versus the creation i made enjoy pictwittercommlsiwfjg,"[so, I, spend, a, few, hour, make, something, for, fun, if, you, do, not, know, I, be, a, huge, rhandlerr, fan, and, maya, be, one, of, my, favorite, character, so, I, decide, to, make, myself, a, wallpaper, for, my, pc, here, be, the, original, image, versus, the, creation, I, make, enjoy, pictwittercommlsiwfjg]","[I, spend, hour, make, something, fun, not, know, I, huge, rhandlerr, fan, maya, one, favorite, character, I, decide, make, wallpaper, pc, original, image, versus, creation, I, make, enjoy, pictwittercommlsiwfjg]",3,Positive
9,2402,Borderlands,Positive,2010 So I spent a few hours making something for fun. . . If you don't know I am a HUGE RhandlerR fan and Maya is one of my favorite characters. So I decided to make myself a wallpaper for my PC. . Here is the original image versus the creation I made :) Enjoy! pic.twitter.com/mLsI5wf9Jg,so i spent a few hours making something for fun if you dont know i am a huge rhandlerr fan and maya is one of my favorite characters so i decided to make myself a wallpaper for my pc here is the original image versus the creation i made enjoy pictwittercommlsiwfjg,"[so, I, spend, a, few, hour, make, something, for, fun, if, you, do, not, know, I, be, a, huge, rhandlerr, fan, and, maya, be, one, of, my, favorite, character, so, I, decide, to, make, myself, a, wallpaper, for, my, pc, here, be, the, original, image, versus, the, creation, I, make, enjoy, pictwittercommlsiwfjg]","[I, spend, hour, make, something, fun, not, know, I, huge, rhandlerr, fan, maya, one, favorite, character, I, decide, make, wallpaper, pc, original, image, versus, creation, I, make, enjoy, pictwittercommlsiwfjg]",3,Positive
10,2402,Borderlands,Positive,was,was,[be],[],0,Neutral


Расчитываем точность c помощью sklearn accuracy_score:

In [20]:
from sklearn.metrics import accuracy_score
print(accuracy_score(df['sentiment'], df['sentiment_category']))

0.4091803805519476


# 2. Теперь попробуем провести анализ тональности с помощью ML метода

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer() #создаём матрицу частот слов
X_train_counts = count_vect.fit_transform(df['text'])

Преобразуем матрицу частот слов в формат TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

Применяем алгоритм наивного Байеса к TF-IDF частотам и имеющейся информации о тоне твита

In [None]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, df['sentiment'])

Создаём pipeline для объединения всех процессов в один

In [None]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
 ('tfidf', TfidfTransformer()),
  ('clf', MultinomialNB())])

Разделяем данные на обучающую и тестовую выборки в отношении 70:30

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    df['text'], df['sentiment'], test_size=0.3, random_state=42)

Создаём новый Pipeline с помощью SGDClassifier, который использует стохастический градиентный спуск

In [None]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier(loss='hinge', penalty='l2',
            alpha=1e-3, random_state=42,
              max_iter=6, tol=None))])

Вычислим среднюю точность предсказаний

In [None]:
text_clf.fit(df['text'], df['sentiment'])
predicted = text_clf.predict(x_test)
np.mean(predicted == y_test)

0.6552555233206874

С помощью Grid search ищем наилучшие гиперпараметры для нашей модели, чтобы получить наибольшую точность предсказаний

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'vect__max_features': [1000, 5000, None],
    'clf__loss': ['hinge', 'log_loss'],
    'clf__penalty': ['l2', 'l1', 'elasticnet'],
    'clf__alpha': [1e-3, 1e-4],
    'clf__max_iter': [5, 10],
}
grid_search = GridSearchCV(text_clf, param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(df['text'], df['sentiment'])
predicted = grid_search.predict(x_test)
accuracy = np.mean(predicted == y_test)
print("Accuracy:", accuracy)
print("Best parameters found: ", grid_search.best_params_)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Accuracy: 0.6753403258201295
Best parameters found:  {'clf__alpha': 0.0001, 'clf__loss': 'log_loss', 'clf__max_iter': 10, 'clf__penalty': 'l2', 'vect__max_features': None}


# 3. Реализуем sentiment analysis с помощью модели hugging face.
Результат записываем в отдельный столбец model. Поскольку функция sent_analyze возвращает список, дополнительно создаём функцию, которая будет преобразовать список в строки для того, чтобы потом провести оценку качества модели.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model_name = "tabularisai/multilingual-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def sent_analyze(texts):
    inputs = tokenizer(texts, return_tensors="pt", truncation=True,
                       padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    sentiment_map = {0: "Negative", 1: "Negative", 2: "Neutral",
                     3: "Positive", 4: "Positive"}
    return [sentiment_map[p] for p in torch.argmax(probabilities,
                                                   dim=-1).tolist()]

df["model"] = df["text"].apply(lambda x: sent_analyze(x))
df["model"] = df["model"].apply(lambda x: x[0])

Оценим среднюю точность

In [None]:
from sklearn.metrics import accuracy_score
print(accuracy_score(df['sentiment'], df['model']))

0.4088188428114246


# Сравнение результатов разных методов анализа тональности


1.   Rule-based method - 0.409
2.   ML method - 0.655 (best=0.675)
3.   Hugging Face Model - 0.408

Таким образом получается, что rule-based method неэффективно распознает тональность текста, в частности из-за того, что финальное значение score, равное 0, сложно получить (соответственно, скорее всего имеется много несовпадений по тональности Neutral).

Метод машинного обучения показывает наилучшие результаты среди представленных. Точность 0.655 является относительно хорошей, особенно по сравнению с другими методами. В процессе настройки или на других данных этот метод может достигать даже более высокой точности. Это свидетельствует о том, что метод имеет потенциал для улучшения через дообучение или оптимизацию гиперпараметров.

Показатель 0.408 также низкий и близок к результату rule-based метода. Это может указывать на то, что в данной конкретной задаче модель Hugging Face не смогла эффективно извлечь контекст или особенности текста. Также у модели есть ограничение на то, что она может принимать текст в максимум 512 символов, тогда как в данном датафрейме максимальная длина может достигать 957 символов.




In [23]:
max_length = df['text'].str.len().max()
print(max_length)

957
