<a href="https://colab.research.google.com/github/Shurara-san/NLP/blob/Lesson8/Lesson8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [123]:
# pip install stop-words

In [124]:
# pip install pymorphy2

In [125]:
import pandas as pd
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re

In [126]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [127]:
df_train = pd.read_csv("/content/drive/MyDrive/NLP/data/train.csv")
df_test = pd.read_csv("/content/drive/MyDrive/NLP/data/test.csv")
df_val = pd.read_csv("/content/drive/MyDrive/NLP/data/val.csv")

In [128]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

df_train['text'] = df_train['text'].apply(preprocess_text)
df_val['text'] = df_val['text'].apply(preprocess_text)
df_test['text'] = df_test['text'].apply(preprocess_text)

In [129]:
import numpy as np
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, SimpleRNN, LSTM, GRU, Masking, Flatten, Reshape
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard 
#from keras.objectives import categorical_crossentropy
from keras.callbacks import EarlyStopping  

In [130]:
text_corpus_train = df_train['text'].values
text_corpus_valid = df_val['text'].values
text_corpus_test = df_test['text'].values

In [131]:
tokenizer = Tokenizer(num_words=None, 
                     filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower = False, split = ' ')
tokenizer.fit_on_texts(text_corpus_train)

sequences_train = tokenizer.texts_to_sequences(text_corpus_train)
sequences_val = tokenizer.texts_to_sequences(text_corpus_valid)
sequences_test = tokenizer.texts_to_sequences(text_corpus_test)

word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in text_corpus_train])

X_train = pad_sequences(sequences_train, maxlen=training_length)
X_valid = pad_sequences(sequences_val, maxlen=training_length)

In [132]:
y_train = df_train['class'].values
y_val = df_val['class'].values

In [133]:
X_valid.shape

(22683, 27)

### RNN модель с урока

In [134]:
model_RNN = Sequential()

model_RNN.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model_RNN.add(Masking(mask_value=0.0))

model_RNN.add(SimpleRNN(64))
model_RNN.add(Dense(64, activation='relu'))
model_RNN.add(Dropout(0.5))
model_RNN.add(Dense(1, activation='sigmoid'))

model_RNN.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [135]:
early_stopping=EarlyStopping(monitor='val_loss')  


history = model_RNN.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10


In [136]:
score = model_RNN.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.6214226484298706
Test accuracy: 0.7331041097640991


### CNN модель

In [137]:
model_CNN = Sequential()

model_CNN.add(
  Embedding(input_dim=word_count,
            input_length=training_length,
            output_dim=30,
            trainable=True,
            mask_zero=True))
 
model_CNN.add(Conv1D(64, kernel_size=(30), activation='tanh', padding="same"))
model_CNN.add(GlobalMaxPool1D())
model_CNN.add(Dense(64, activation='tanh'))
model_CNN.add(Dropout(0.5))
model_CNN.add(Dense(1, activation='softmax'))

model_CNN.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [138]:
early_stopping=EarlyStopping(monitor='val_loss')  


history = model_CNN.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10


In [139]:
score = model_CNN.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.6220514178276062
Test accuracy: 0.5047392249107361


### CNN -> RNN модель

In [140]:
model_CNN_RNN = Sequential()

model_CNN_RNN.add(
  Embedding(input_dim=word_count,
            input_length=training_length,
            output_dim=30,
            trainable=True,
            mask_zero=True))
 
model_CNN_RNN.add(Conv1D(64, kernel_size=(7), activation='tanh', input_shape=(None, 27, 1), padding="same"))
model_CNN_RNN.add(GlobalMaxPool1D())
model_CNN_RNN.add(Reshape((64,1)))
model_CNN_RNN.add(SimpleRNN(64))
model_CNN_RNN.add(Dense(64, activation='tanh'))
model_CNN_RNN.add(Dropout(0.5))
model_CNN_RNN.add(Dense(1, activation='softmax'))

model_CNN_RNN.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [141]:
early_stopping=EarlyStopping(monitor='val_loss')  


history = model_CNN_RNN.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10


In [142]:
score = model_CNN_RNN.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.5637281537055969
Test accuracy: 0.5047392249107361


### RNN -> CNN модель

In [143]:
model_RNN_CNN = Sequential()

model_RNN_CNN.add(
  Embedding(input_dim=word_count,
            input_length=training_length,
            output_dim=30,
            trainable=True,
            mask_zero=True))
 
model_RNN_CNN.add(SimpleRNN(64, input_shape=(None, 27, 1)))
model_RNN_CNN.add(Reshape((64,1)))
model_RNN_CNN.add(Conv1D(64, kernel_size=(7), activation='tanh', padding="same"))
model_RNN_CNN.add(GlobalMaxPool1D())
model_RNN_CNN.add(Dense(64, activation='tanh'))
model_RNN_CNN.add(Dropout(0.5))
model_RNN_CNN.add(Dense(1, activation='softmax'))

model_RNN_CNN.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [144]:
early_stopping=EarlyStopping(monitor='val_loss')  


history = model_RNN_CNN.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10


In [145]:
score = model_CNN_RNN.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.5637281537055969
Test accuracy: 0.5047392249107361
