# Проект по оценке комментариев (отзывов) к фильмам

Необходимо добиться:
- присвоения рейтинга (от 1 до 10)
- присвоения статуса комментария (положительный или отрицательный).

### Подключаем библиотеки

In [47]:
import numpy as np
import pandas as pd
import glob
import os
from joblib import dump, load

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

### Подготавливаем train датасет 

In [3]:
dir_train_pos = []
dir_train_pos.append(r"aclImdb\train\pos")
dir_train_neg = []
dir_train_neg.append(r"aclImdb\train\neg")

In [4]:
text_train_pos = 'aclImdb_text_train_pos.txt'
lab_train_pos = 'aclImdb_labels_train_pos.txt'
text_train_neg = 'aclImdb_text_train_neg.txt'
lab_train_neg = 'aclImdb_labels_train_neg.txt'

In [5]:
def gettext(txt, dirs): # функция для получения текста отзыва   
    with open(txt , 'w', encoding = 'utf-8') as text: 
        text.write('comment' + "\n")
        for d in dirs:
            files = glob.glob(d + '\\*.txt')
            for f in files:
                content = ''
                with open(f, 'r', encoding = 'utf-8') as fo:
                    content = fo.read()
                text.write(content + "\n")

In [6]:
def getLabel(txt, dirs): # функция для получения оценки отзыва   
    with open(txt , 'w', encoding = 'utf-8') as labels:
        labels.write('rating' + "\n")
        for d in dirs:
            files = glob.glob(d + '\\*.txt')
            for f in files:
                label = os.path.basename(f)
                info = os.path.splitext(label)
                filename = info[0]
                fileInfo = filename.split("_")
                labels.write(fileInfo[1] + "\n")

In [7]:
# получаем "позитивный" датасет
gettext(text_train_pos, dir_train_pos) 
getLabel(lab_train_pos, dir_train_pos)
data_train_pos = pd.read_csv("aclImdb_text_train_pos.txt", sep='delimiter', engine='python') 
labels_train_pos = pd.read_csv("aclImdb_labels_train_pos.txt", sep='delimiter', engine='python')
train_dataset_pos = data_train_pos.join(labels_train_pos)
train_dataset_pos['neg/pos'] = '1' 

In [8]:
# получаем "негативный" датасет
gettext(text_train_neg, dir_train_neg)
getLabel(lab_train_neg, dir_train_neg)
data_train_neg = pd.read_csv("aclImdb_text_train_neg.txt", sep='delimiter', engine='python') 
labels_train_neg = pd.read_csv("aclImdb_labels_train_neg.txt", sep='delimiter', engine='python')
train_dataset_neg = data_train_neg.join(labels_train_neg)
train_dataset_neg['neg/pos'] = '0' 

In [9]:
train_dataset = pd.concat([train_dataset_pos, train_dataset_neg], axis=0,) # объединение позитивных и негативных
train_dataset = train_dataset.sample(frac=1).reset_index(drop=True) #случаная перестановка строк, чтобы не шли подряд позитивные, потом негативные

In [10]:
train_dataset # 0 - негативный отзыв, 1 - позитивный отзыв

Unnamed: 0,comment,rating,neg/pos
0,Tony Scott directs a thriller sports flick tha...,4,0
1,Hollywood Hotel was the last movie musical tha...,4,0
2,"THis movie shows us once again, how genius the...",9,1
3,Canadian director Vincenzo Natali took the art...,10,1
4,I liked the film. Some of the action scenes we...,8,1
...,...,...,...
24995,"This movie's heart was in the right place, no ...",3,0
24996,This flick reminds me some really bad science-...,3,0
24997,This move reminded my of Tales from the Crypt ...,3,0
24998,"Tony Scott can make good films and bad, person...",9,1


In [11]:
train_dataset['rating'].unique() # в датасете оценках нет 5 и 6

array([ 4,  9, 10,  8,  1,  7,  2,  3], dtype=int64)

In [12]:
train_dataset.to_pickle("train_dataset.pkl") # cохраняем

### Подготавливаем test датасет (то же самое)

In [13]:
dir_test_pos = []
dir_test_pos.append(r"aclImdb\test\pos")
dir_test_neg = []
dir_test_neg.append(r"aclImdb\test\neg")

In [14]:
text_test_pos = 'aclImdb_text_test_pos.txt'
lab_test_pos = 'aclImdb_labels_test_pos.txt'
text_test_neg = 'aclImdb_text_test_neg.txt'
lab_test_neg = 'aclImdb_labels_test_neg.txt'

In [15]:
gettext(text_test_pos, dir_test_pos)
getLabel(lab_test_pos, dir_test_pos)
data_test_pos = pd.read_csv("aclImdb_text_test_pos.txt", sep='delimiter', engine='python') 
labels_test_pos = pd.read_csv("aclImdb_labels_test_pos.txt", sep='delimiter', engine='python')
test_dataset_pos = data_test_pos.join(labels_test_pos)
test_dataset_pos['neg/pos'] = '1' 

In [16]:
gettext(text_test_neg, dir_test_neg)
getLabel(lab_test_neg, dir_test_neg)
data_test_neg = pd.read_csv("aclImdb_text_test_neg.txt", sep='delimiter', engine='python') 
labels_test_neg = pd.read_csv("aclImdb_labels_test_neg.txt", sep='delimiter', engine='python')
test_dataset_neg = data_test_neg.join(labels_test_neg)
test_dataset_neg['neg/pos'] = '0' 

In [17]:
test_dataset = pd.concat([test_dataset_pos, test_dataset_neg], axis=0,)
test_dataset = test_dataset.sample(frac=1).reset_index(drop=True)

In [18]:
test_dataset

Unnamed: 0,comment,rating,neg/pos
0,Seven months since a revelatory viewing of Fac...,8,1
1,"Giorgino is a long, excruciating journey from ...",10,1
2,I don't remember too much about this movie exc...,3,0
3,"In the dusty little town of Furlough in Texas,...",1,0
4,"In Everything Is Illuminated, Elijah Wood play...",10,1
...,...,...,...
24995,The Knowledge is a typical British comedy for ...,7,1
24996,this is more than a Sat. afternoon special. Ex...,10,1
24997,The plot of this boils down to Ah-nuld versus ...,3,0
24998,Paul Thomas Anderson's stylish and compelling ...,10,1


In [19]:
test_dataset.to_pickle("test_dataset.pkl")

### Обрабатываем получившиеся датасеты

In [21]:
df_train = pd.read_pickle("train_dataset.pkl") #загружаем датафреймы
df_test = pd.read_pickle("test_dataset.pkl")

In [22]:
df_train['neg/pos'] = df_train['neg/pos'].astype(object).astype(int) # приводим object к int
df_test['neg/pos'] = df_test['neg/pos'].astype(object).astype(int)

In [24]:
sw = set(get_stop_words("en")) 
sw.add('br')
puncts = set(punctuation)
morpher = MorphAnalyzer()

In [26]:
def preprocess_text(txt): # функция для преобразования текст (удаление стоп-слов, удаление пунктуации, лемматизация)
    txt = str(txt)
    txt = "".join(c for c in txt if c not in puncts)
    txt = txt.lower()
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

In [27]:
df_train['comment'] = df_train['comment'].apply(preprocess_text)
df_test['comment'] = df_test['comment'].apply(preprocess_text)

In [None]:
max_words = 5000 # количество слов в нашем словаре токенов

In [29]:
train_corpus = " ".join(df_train["comment"]) # получаем словарь наиболее встречающихся слов
train_corpus = train_corpus.lower()

tokens = word_tokenize(train_corpus)
tokens_filtered = [word for word in tokens if word.isalnum()]

dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}
vocabulary

{'movie': 1,
 'film': 2,
 'one': 3,
 'like': 4,
 'just': 5,
 'good': 6,
 'even': 7,
 'time': 8,
 'really': 9,
 'story': 10,
 'see': 11,
 'can': 12,
 'much': 13,
 'well': 14,
 'get': 15,
 'will': 16,
 'also': 17,
 'people': 18,
 'bad': 19,
 'great': 20,
 'first': 21,
 'dont': 22,
 'made': 23,
 'movies': 24,
 'make': 25,
 'films': 26,
 'way': 27,
 'characters': 28,
 'think': 29,
 'watch': 30,
 'two': 31,
 'many': 32,
 'seen': 33,
 'character': 34,
 'never': 35,
 'little': 36,
 'acting': 37,
 'plot': 38,
 'best': 39,
 'love': 40,
 'know': 41,
 'life': 42,
 'show': 43,
 'ever': 44,
 'still': 45,
 'better': 46,
 'end': 47,
 'say': 48,
 'man': 49,
 'scene': 50,
 'scenes': 51,
 'go': 52,
 'something': 53,
 'im': 54,
 'back': 55,
 'doesnt': 56,
 'real': 57,
 'watching': 58,
 'years': 59,
 'though': 60,
 'now': 61,
 'thing': 62,
 'actors': 63,
 'didnt': 64,
 'another': 65,
 'new': 66,
 'actually': 67,
 'nothing': 68,
 'makes': 69,
 'find': 70,
 'work': 71,
 'funny': 72,
 'look': 73,
 'old': 74,

In [None]:
max_len = 60 

In [31]:
def text_to_sequence(text, maxlen): #функция для токенизации (получаем массивы)
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])

    padding = [0] * (maxlen-len(result))
    return result[-maxlen:] + padding

In [32]:
x_train = np.asarray([text_to_sequence(text, max_len) for text in df_train["comment"]])
x_test = np.asarray([text_to_sequence(text, max_len) for text in df_test["comment"]])

In [76]:
class DataWrapper(Dataset): #класс, чтобы получить тенсоры для обертки в datalolader
    def __init__(self, data, target, transform=None):
        self.data = torch.from_numpy(data).long()
        self.target = torch.from_numpy(target).long()
        self.transform = transform
        
    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]
        
        if self.transform:
            x = self.transform(x)
            
        return x, y
    
    def __len__(self):
        return len(self.data)

In [34]:
train_dataset = DataWrapper(x_train, df_train['neg/pos'].values)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True) #получаем объект loader

val_dataset = DataWrapper(x_test, df_test['neg/pos'].values)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=True)

### Инициализация модели для бинарной классификации отзывов (положительный/отрицательный)

In [160]:
class Mod(nn.Module):
    def __init__(self, vocab_size=5000, embedding_dim=128, out_channel=128, num_classes=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv_1 = nn.Conv1d(embedding_dim, out_channel, kernel_size=2)
        self.conv_2 = nn.Conv1d(embedding_dim, out_channel, kernel_size=3)
        self.pool = nn.MaxPool1d(2)
        self.relu = nn.ReLU()
        self.linear_1 = nn.Linear(out_channel, out_channel // 2)
        self.linear_2 = nn.Linear(out_channel // 2, num_classes)
        self.dp = nn.Dropout(0.2)
        
    def forward(self, x):        
        output = self.embedding(x)
        output = self.dp(output)
        output = output.permute(0, 2, 1)
        output = self.conv_1(output)
        output = self.relu(output)
        output = self.pool(output)

        output = self.conv_2(output)
        output = self.relu(output)
        output = self.pool(output)
        output = torch.max(output, axis=2).values
        output = self.linear_1(output)
        output = self.relu(output)
        output = self.dp(output)
        output = self.linear_2(output)
        output = F.sigmoid(output)
        return output

In [161]:
model = Mod()

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [162]:
print(model) #параметры модели
print("Parameters:", sum([param.nelement() for param in model.parameters()]))

Mod(
  (embedding): Embedding(5000, 128)
  (conv_1): Conv1d(128, 128, kernel_size=(2,), stride=(1,))
  (conv_2): Conv1d(128, 128, kernel_size=(3,), stride=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (relu): ReLU()
  (linear_1): Linear(in_features=128, out_features=64, bias=True)
  (linear_2): Linear(in_features=64, out_features=1, bias=True)
  (dp): Dropout(p=0.2, inplace=False)
)
Parameters: 730497


### Обучение

In [163]:
epochs = 5 #колчество "итераций"
print_batch_n = 100
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.BCELoss() # используем бинарную кроссэнтропию

In [164]:
model = model.to(device)
model.train()
th = 0.5

train_loss_history = []
test_loss_history = []

for epoch in range(epochs):  
    running_items, running_right = 0.0, 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data[0].to(device), data[1].to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        
        loss = criterion(outputs, labels.float().view(-1, 1))
        loss.backward()
        optimizer.step()

        loss = loss.item()
        running_items += len(labels)
        pred_labels = torch.squeeze((outputs > th).int())
        running_right += (labels == pred_labels).sum()
        
    model.eval()
    
    print(f'Epoch [{epoch + 1}/{epochs}]. ' \
            f'Step [{i + 1}/{len(train_loader)}]. ' \
            f'Loss: {loss:.3f}. ' \
            f'Acc: {running_right / running_items:.3f}', end='. ')
    running_loss, running_items, running_right = 0.0, 0.0, 0.0
    train_loss_history.append(loss)

    test_running_right, test_running_total, test_loss = 0.0, 0.0, 0.0
    for j, data in enumerate(val_loader):
        test_labels = data[1].to(device)
        test_outputs = model(data[0].to(device))
        
        test_loss = criterion(test_outputs, test_labels.float().view(-1, 1))
        test_running_total += len(data[1])
        pred_test_labels = torch.squeeze((test_outputs > th).int())
        test_running_right += (test_labels == pred_test_labels).sum()
    
    test_loss_history.append(test_loss.item())
    print(f'Test loss: {test_loss:.3f}. Test acc: {test_running_right / test_running_total:.3f}')
    
    model.train()
        
print('Training is finished!')

Epoch [1/5]. Step [49/49]. Loss: 0.478. Acc: 0.653. Test loss: 0.244. Test acc: 0.784
Epoch [2/5]. Step [49/49]. Loss: 0.473. Acc: 0.815. Test loss: 0.617. Test acc: 0.826
Epoch [3/5]. Step [49/49]. Loss: 0.340. Acc: 0.858. Test loss: 0.039. Test acc: 0.825
Epoch [4/5]. Step [49/49]. Loss: 0.334. Acc: 0.879. Test loss: 0.597. Test acc: 0.825
Epoch [5/5]. Step [49/49]. Loss: 0.230. Acc: 0.904. Test loss: 0.485. Test acc: 0.823
Training is finished!


In [167]:
torch.save(model,'model_1.pth') # сохраняем модель

In [168]:
torch.save(model.state_dict(), 'model_1weights.pth') #cохраняем веса

### Инициализация модели для мультиклассовой классификации отзывов (от 1 до 10)

In [45]:
clf = GradientBoostingClassifier(max_depth=17, 
                                     min_samples_leaf=10,
                                     random_state=42,
                                     n_estimators=700) # параметры подбираем с помощью кросс-валидации
clf.fit(x_train, df_train['rating'].values)
y_pred = clf.predict(x_test)

In [48]:
print(accuracy_score(df_test['rating'].values, y_pred))

0.23084


In [49]:
# результат лучше получить не удалось, возможно из-за отсутствия оценок 5,6 (вряд ли)

In [50]:
dump(clf, 'model_2.joblib') # сохраняем данные

['model_2.joblib']