In [2]:
import pandas as pd
import numpy as np
import re
import json
import os
import warnings
warnings.filterwarnings('ignore')
import nltk

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from tensorflow.keras.layers import Dense, LSTM, Input, Dropout, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences


from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')
nltk.download('wordnet')

from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords_set = set(stopwords.words('english'))

wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package omw-1.4 to C:\Users\Владислав
[nltk_data]     Дзюба\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Владислав
[nltk_data]     Дзюба\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Владислав
[nltk_data]     Дзюба\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Реализуем алгоритм логистической регрессии

In [4]:
### функция для предобработки текстов
def preprocess(link,status):
    ## открываем папку по названию link
    content = os.listdir(link)
    data_list = []
    
    ### циклом добавляем содержимое папки в список, предварительно очищая от мусорных символов
    for i in content:
        with open(f'{link}/{i}','r',encoding='utf-8') as t:
            data = t.read().lower()
            data = re.sub('[^a-z ]','', data)
            data_list.append(data)
    
    ## создаем датасет
    df = pd.DataFrame({'Comments':data_list,'ID':content})
    ### присваеваем статус комментариям
    df['Rating']=status
    df['evaluation'] = df['ID'].apply(lambda x: x[-6:-4]).apply(lambda y: y.replace('_','') if '_' in y else y)
    
    
    ### выкидываем стоп-слова    
    df['Comments'] = df['Comments'].apply(lambda x: [i for i in x.split(' ') if i not in stopwords_set])
    
    ### лемматизируем слова отзывов
    df['Comments'] = df['Comments'].apply(lambda b: [wordnet_lemmatizer.lemmatize(word) for word in b])
    
    ### обьединяем каждый отзывыв из списка обратно в строку
    df['Comments'] = df['Comments'].apply(lambda s: ' '.join(s))
    
    return df

In [5]:
### обработаем все 4 датасета: тестовые и тренировочные
df_neg_train = preprocess('neg_train',0)
df_pos_train = preprocess('pos_train',1)
df_neg_test = preprocess('neg_test',0)
df_pos_test = preprocess('pos_test',1)

In [6]:
### обьединим все в один дата сэт
new_df = pd.concat([df_pos_train, df_neg_train,df_pos_test,df_neg_test],ignore_index=True)
new_df = new_df.sample(frac=1,random_state = 42).reset_index(drop=True)
new_df.drop_duplicates()
new_df.head(20)

Unnamed: 0,Comments,ID,Rating,evaluation
0,first saw ad like oh go he done high school mu...,6449_8.txt,1,8
1,girl folly sort halfcomedy halfmockumentary lo...,7235_8.txt,1,8
2,started watching show first season beginning p...,1017_8.txt,1,8
3,interesting usual porn movie fantasy adventure...,9954_8.txt,1,8
4,suppose film supposed cool looking back ...,11791_2.txt,0,2
5,poor film certainly belongs make feature film ...,3452_2.txt,0,2
6,saw movie new later rented japan three year af...,8491_7.txt,1,7
7,meandering tale mob revenge simply interesting...,954_4.txt,0,4
8,first didnt like acting really hamlet standard...,2480_8.txt,1,8
9,spectacle hard fault nihon chinbotsu japanese ...,9513_9.txt,1,9


In [7]:
### обучим вектораейзер на корпусе всех слов со всех отзывов
data_corp = [ " ".join(new_df[new_df['Rating'] == l]['Comments'].tolist()) for l in list(new_df.Rating.unique()) ]
vectorizer = TfidfVectorizer()#ngram_range=(1,2)
vectorizer.fit(data_corp)

res_tfidf = vectorizer.transform(new_df['Comments'].tolist())
res_tfidf

<50000x164761 sparse matrix of type '<class 'numpy.float64'>'
	with 4877267 stored elements in Compressed Sparse Row format>

In [8]:
### стандартная процедура
X_tr, X_ts, y_tr, y_ts=train_test_split(res_tfidf, new_df['Rating'], test_size=0.2, random_state=42)

In [9]:
### обучаем модель и смотрим результат
lr = LogisticRegression(solver='liblinear',penalty='l2').fit(X_tr, y_tr)

y_pred_test = lr.predict(X_ts)
y_pred_train = lr.predict(X_tr)



print('train:',f1_score(y_tr, y_pred_train))
print('test:',f1_score(y_ts, y_pred_test))

train: 0.9086444251519286
test: 0.8826870348609479


In [22]:
### сохраняем модель
rom joblib import dump, load

dump(lr, 'vectorizer_model.joblib') 
dump(vectorizer, 'vectorizer_model.joblib') 

### Реализуем такой же сентимент-анализ на LSTM сети

In [10]:
### пределим сколько у нас положительных и отрицательных отзывов
count_pos = len(df_pos_train)+len(df_pos_test)
count_neg = len(df_neg_train)+len(df_neg_test)

total_count = count_pos+count_neg

In [12]:
### зададим параметры для токенизации, 10000 слов кажется оптимальным, но не самым лучшим
maxWordsCount = 10000
tokenizer = Tokenizer(num_words=maxWordsCount, filters='!–"—#$%&amp;()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r«»', lower=True, split=' ', char_level=False)
tokenizer.fit_on_texts(new_df['Comments'])

In [13]:
dist = list(tokenizer.word_counts.items())
print(dist[:10])
print(new_df['Comments'][0][:100])

[('first', 17202), ('s', 120731), ('w', 115108), ('d', 91861), ('like', 40556), ('oh', 3047), ('go', 17412), ('he', 23228), ('done', 5856), ('high', 3954)]
first saw ad like oh go he done high school musical cant coast along he making appearance disney sho


In [15]:
#### определим длинну каждого текста, под которую они будут подгонятся нулями или обрезаться
max_text_len = 150
data = tokenizer.texts_to_sequences(new_df['Comments'])
data_pad = pad_sequences(data, maxlen=max_text_len)
print(data_pad)

[[   0    0    0 ...  177   34   72]
 [   0    0    0 ... 5598 1186    8]
 [   0    0    0 ...  103  111   34]
 ...
 [  58   45    7 ...  279   57 1031]
 [   0    0    0 ...  247    6  882]
 [3305   12 4969 ...    7   64 2156]]


In [16]:
### создадим целевую переменную в нужно для подачи в сеть формате
prep_y = []
for i in new_df['Rating']:
    if i == 1:
        prep_y.append([1,0])
    else:
        prep_y.append([0,1])


In [17]:
X = data_pad
Y = np.array(prep_y)
print(X.shape, Y.shape)

(50000, 150) (50000, 2)


In [24]:
model = Sequential()
model.add(Embedding(maxWordsCount, 128, input_length = max_text_len))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64))
model.add(Dense(2, activation='softmax'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 150, 128)          1280000   
                                                                 
 lstm_2 (LSTM)               (None, 150, 128)          131584    
                                                                 
 lstm_3 (LSTM)               (None, 64)                49408     
                                                                 
 dense_1 (Dense)             (None, 2)                 130       
                                                                 
Total params: 1,461,122
Trainable params: 1,461,122
Non-trainable params: 0
_________________________________________________________________


In [25]:
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=Adam(0.0001))

In [26]:
from tensorflow.keras.callbacks import ModelCheckpoint
model_lstm_save_path = '32_model_lstm.h5' ### для автоматического сохранения лучшей модели
checkpoint_callback_lstm = ModelCheckpoint(model_lstm_save_path, 
                                      monitor='val_accuracy',
                                      save_best_only=True,
                                      verbose=1)

In [27]:
history = model.fit(X, Y, batch_size=32, epochs=5,validation_split=0.2,callbacks=[checkpoint_callback_lstm])#

Epoch 1/5
Epoch 1: val_accuracy improved from -inf to 0.86590, saving model to 32_model_lstm.h5
Epoch 2/5
Epoch 2: val_accuracy did not improve from 0.86590
Epoch 3/5
Epoch 3: val_accuracy improved from 0.86590 to 0.86620, saving model to 32_model_lstm.h5
Epoch 4/5
Epoch 4: val_accuracy improved from 0.86620 to 0.86680, saving model to 32_model_lstm.h5
Epoch 5/5
Epoch 5: val_accuracy did not improve from 0.86680


In [92]:
### Отметка первого нейрона это положительный отзыв, значит np.argmax с выходом 0 значит, что отзыв положительный
t = "I liked the movie I will watch it again very cool".lower()
data2 = tokenizer.texts_to_sequences([t])

data_pad2 = pad_sequences(data2, maxlen=max_text_len)


res = model.predict(data_pad2)
print(res, np.argmax(res), sep='\n')

[[0.7361074  0.26389262]]
0


In [98]:
### На основании значения выходного нейрона можно распределять значения оценки по 10-бальной шкале

if np.argmax(res)==0:
    status='Положительный:'
    if 0.6 <res[0][0]< 0.75:
        score = '7/10'
    elif 0.75 <res[0][0]< 0.9:
        score = '8/10'
    else:
        score = '9/10'
else:
    status='Отрицательный'
    ### по аналогии можно прописать градацию
    
print(status,score)

Положительный: 7/10


In [102]:
dump(tokenizer, 'tokenizer.joblib') 

['tokenizer.joblib']