# Анализ тональности текста (tweet'а)

Цель: построить классификатор для задачи анализа тональности текстов на основе сверточных нейронных сетей и LSTM модели.

Данные: сообщения из Twitter за 2016 год.

## Загрузка данных

In [1]:
import pandas as pd

df = pd.read_csv('./data/data.csv', header=None, usecols=[1, 2], names=['label', 'text'], sep='\t')

In [2]:
df.head()

Unnamed: 0,label,text
0,neutral,"Picturehouse's, Pink Floyd's, 'Roger Waters: T..."
1,neutral,Order Go Set a Watchman in store or through ou...
2,negative,If these runway renovations at the airport pre...
3,neutral,If you could ask an onstage interview question...
4,positive,A portion of book sales from our Harper Lee/Go...


In [3]:
df['label'].value_counts().plot(kind='bar');

## Подготовка данных

In [4]:
MAX_TEXT_LENGTH = 140

# удаление слишком длинных записей
df = df[df['text'].map(len) <= MAX_TEXT_LENGTH]

In [5]:
# количество записей для каждой метки
NUMBER_PER_LABEL = min(df['label'].value_counts())

# семплирование на равные части
df = df.groupby('label').apply(lambda x: x.sample(NUMBER_PER_LABEL))

In [6]:
# one-hot encoding
df = df.join(pd.get_dummies(df['label']))

In [7]:
# замена ссылок на токен "URL"
df['text'] = df['text'].str.replace(r'https?://\S+', 'URL')

# замена упоминаний пользователя на токен "USER"
df['text'] = df['text'].str.replace(r'@[^\s]+', 'USER')

# удаление пунктуации
df['text'] = df['text'].str.replace(r'[^a-zA-Z0-9]+', ' ')

# приведение к нижнему регистру
df['text'] = df['text'].str.lower()

In [8]:
# токенизация
import nltk

df['tokens'] = df['text'].apply(nltk.word_tokenize)

In [9]:
# лемматизация
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
lemmatize = lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
df['tokens'] = df['tokens'].apply(lemmatize)

In [10]:
# удаление стоп-слов
from nltk.corpus import stopwords

STOP_WORDS = set(stopwords.words('english'))

delete_stop_words = lambda words: [word for word in words if word not in STOP_WORDS]
df['tokens'] = df['tokens'].apply(delete_stop_words)

In [11]:
# удаление редких слов
from collections import Counter

MINIMAL_WORD_FREQUENCY = 3

counter = Counter()
for row in df['tokens']:
    counter.update(row)

delete_rare_words = lambda words: [word for word in words if counter[word] >= MINIMAL_WORD_FREQUENCY]
df['tokens'] = df['tokens'].apply(delete_rare_words)

In [12]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,label,text,negative,neutral,positive,tokens
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
negative,15222,negative,regardless of whether they are muslims i don t...,1,0,0,"[regardless, whether, muslim, understand, fren..."
negative,17610,negative,so i sat through that for nintendo s announcem...,1,0,0,"[sat, announcement, make, one, like, even, tra..."
negative,6289,negative,managed to loose my keys on 6th puke my guts u...,1,0,0,"[managed, loose, key, 6th, puke, next, day, so..."
negative,17932,negative,the guy who predicted the end of the world mov...,1,0,0,"[guy, predicted, end, world, moved, date, oct,..."
negative,4288,negative,yakub coverage outrage over notice to channels...,1,0,0,"[yakub, coverage, outrage, notice, channel, jo..."


### Разбиение выборки

In [13]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.1)

y_train, y_test = df_train[['negative', 'neutral', 'positive']].values, df_test[['negative', 'neutral', 'positive']].values

### Векторизация текстов

In [14]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# максимальное количество слов в тексте 
MAX_SEQUENCE_LENGTH = df['tokens'].map(len).max()

# количество слов в словаре
NUM_WORDS = 5000

def get_sequences(tokenizer, texts):
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# создание и обучение токенизатора
tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(df_train['tokens'].tolist())

# преобразование текстов в массивы идентификаторов токенов
x_train = get_sequences(tokenizer, df_train['tokens'].values)
x_test = get_sequences(tokenizer, df_test['tokens'].values)

Using TensorFlow backend.


### Векторизация слов

In [15]:
import multiprocessing

import numpy as np

import gensim
from gensim.models import Word2Vec

# размерность вектора слова
EMBEDDING_DIM = 200

# создание и обучение модели Word2Vec 
word2vec = Word2Vec(df['tokens'].values, size=EMBEDDING_DIM, window=5, min_count=3, workers=multiprocessing.cpu_count())

# построение матрицы весов embedding-слоя
embedding_matrix = np.zeros((NUM_WORDS, EMBEDDING_DIM))

for word, i in tokenizer.word_index.items():
    if i == NUM_WORDS:
        break
    embedding_matrix[i] = word2vec.wv[word]

## Построение классификаторов

### Объявление метрик

In [16]:
from keras import backend as K

def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return true_positives / (predicted_positives + K.epsilon())

def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())

def f1(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * ((p * r) / (p + r + K.epsilon()))

metrics = [precision, recall, f1]

### LSTM

In [17]:
from keras.layers import Input, Dense, Dropout, LSTM
from keras.layers.embeddings import Embedding
from keras.models import Sequential

def build_lstm(max_sequence_length, num_words, embedding_dim, embedding_matrix, labels_index):
    model = Sequential()
    model.add(Embedding(num_words, embedding_dim, input_length=max_sequence_length))
    model.add(LSTM(embedding_dim))
    model.add(Dense(labels_index, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=metrics)
    return model

In [18]:
lstm = build_lstm(MAX_SEQUENCE_LENGTH, NUM_WORDS, EMBEDDING_DIM, embedding_matrix, 3) 
lstm.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 200)           1000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               320800    
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 603       
Total params: 1,321,403
Trainable params: 1,321,403
Non-trainable params: 0
_________________________________________________________________


In [19]:
history = lstm.fit(x_train, y_train, batch_size=32, epochs=5, validation_split=0.1)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 9576 samples, validate on 1064 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
from sklearn.metrics import classification_report

predicted = np.round(lstm.predict(x_test))
lstm_metrics = classification_report(y_test, predicted, digits=4,  zero_division=0)
print(lstm_metrics)

              precision    recall  f1-score   support

           0     0.6146    0.6289    0.6217       388
           1     0.4851    0.3879    0.4311       379
           2     0.6379    0.6562    0.6469       416

   micro avg     0.5887    0.5613    0.5746      1183
   macro avg     0.5792    0.5577    0.5666      1183
weighted avg     0.5813    0.5613    0.5695      1183
 samples avg     0.5613    0.5613    0.5613      1183



### CNN

In [21]:
from keras.layers import concatenate
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import GlobalMaxPooling1D
from keras.models import Model

def build_cnn(max_sequence_length, num_words, embedding_dim, embedding_matrix, labels_index):
    input_layer = Input(shape=(max_sequence_length,), dtype='int32')
    encoder_layer = Embedding(num_words, embedding_dim, input_length=max_sequence_length)(input_layer)
    
    filter_layers = []
    
    for filter_size in range(2, 7):
        layer = Conv1D(filters=200, kernel_size=filter_size, activation='relu')(encoder_layer)
        layer = GlobalMaxPooling1D()(layer)
        filter_layers.append(layer)
    
    x = concatenate(filter_layers, axis=1)
    x = Dropout(0.1)(x)  
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    
    output_layer = Dense(labels_index, activation='softmax')(x)
    
    model = Model(input_layer, output_layer)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=metrics)
    return model

In [22]:
cnn = build_cnn(MAX_SEQUENCE_LENGTH, NUM_WORDS, EMBEDDING_DIM, embedding_matrix, 3) 
cnn.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 25, 200)      1000000     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 24, 200)      80200       embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 23, 200)      120200      embedding_2[0][0]                
____________________________________________________________________________________________

In [23]:
history = cnn.fit(x_train, y_train, batch_size=32, epochs=5, validation_split=0.1)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 9576 samples, validate on 1064 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [24]:
predicted = np.round(cnn.predict(x_test))
cnn_metrics = classification_report(y_test, predicted, digits=4,  zero_division=0)
print(cnn_metrics)

              precision    recall  f1-score   support

           0     0.6434    0.6649    0.6540       388
           1     0.4805    0.4222    0.4494       379
           2     0.6526    0.6683    0.6603       416

   micro avg     0.6000    0.5883    0.5941      1183
   macro avg     0.5922    0.5851    0.5879      1183
weighted avg     0.5944    0.5883    0.5907      1183
 samples avg     0.5883    0.5883    0.5883      1183



In [25]:
output = 'output.txt'
with open(output, 'a') as f:
    f.write('2016, 3 класса, LSTM')
    f.write('\n')        
    f.write(lstm_metrics)
    f.write('\n')        
    f.write('2016, 3 класса, CNN')
    f.write('\n')        
    f.write(cnn_metrics)
    f.write('\n')