In [None]:
''' tensorflow 2.0 keras text classification '''

In [None]:
import sys
import warnings
# Suppress every warning category for the rest of the session. This keeps cell
# output short, but it also hides deprecation and data-handling warnings raised
# by the libraries used in later cells.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd

# Read the tab-separated labeled training file and discard its `id` column
# in the same expression; only the remaining columns are used downstream.
df1 = pd.read_csv(
    '../input/word2vec-nlp-tutorial/labeledTrainData.tsv',
    sep="\t",
).drop(columns=['id'])
df1.head()

In [None]:
# Second review source, decoded as latin-1.
df2 = pd.read_csv(
    '../input/imdb-review-dataset/imdb_master.csv',
    encoding="latin-1",
)
df2.head()

In [None]:
# Remove the three columns that are not used for modelling, then rename the
# two columns that remain (positionally) so they match the names used by df1.
df2 = df2.drop(columns=['Unnamed: 0', 'type', 'file'])
df2.columns = ['review', 'sentiment']
df2.head()

In [None]:
# Keep only rows whose label is not 'unsup', then encode the remaining
# string labels as integers: 'pos' -> 1, 'neg' -> 0.
df2 = (
    df2.loc[df2['sentiment'] != 'unsup']
       .assign(sentiment=lambda frame: frame['sentiment'].map({'pos' : 1, 'neg': 0}))
)
df2.head()

In [None]:
# Stack both sources into one frame with a fresh 0..n-1 index.
# NOTE(review): the two sources may contain some of the same reviews; if so,
# duplicates end up in both the training and validation portions - confirm.
df = pd.concat([df1, df2], ignore_index=True)
df.head()

In [None]:
# Key step: strip punctuation and stop words, lemmatize, and build the text feature column.
import re
from nltk.stem import WordNetLemmatizer # lemmatizer (reduces inflected forms)
from nltk.corpus import stopwords # stop-word list

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize one review into a space-separated string of lemmatized tokens.

    Steps: remove every character that is neither a word character nor
    whitespace, lowercase, split on any whitespace, drop English stop words,
    lemmatize what is left, and re-join with single spaces.

    Args:
        text: the raw review string.

    Returns:
        The cleaned review as a single string (may be empty).
    """
    # `flags=` must be passed by keyword: the 4th positional argument of
    # re.sub is `count`, so a positional re.UNICODE (== 32) would silently
    # limit the substitution to the first 32 matches.
    text = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)
    # split() with no argument splits on any whitespace run, so tabs/newlines
    # separate tokens and no empty tokens are produced.
    tokens = text.lower().split()
    # Filter stop words BEFORE lemmatizing: the lemmatizer can mangle stop
    # words (e.g. "was" -> "wa") so that they no longer match the stop list.
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

df['Processed_Reviews'] = df['review'].apply(clean_text)
df.head()

In [None]:
# Average number of space-separated tokens per cleaned review.
df['Processed_Reviews'].str.split(" ").str.len().mean()

In [None]:
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.layers import Convolution1D
from keras.models import Model, Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [None]:
# Vocabulary size limit passed to the tokenizer (and reused as the embedding
# input dimension when the model is built).
max_features = 6000
tokenizer = Tokenizer(num_words = max_features)
# Fit on the cleaned review column - the same column that is converted to
# sequences on the next line. (The previous column name did not exist in df.)
tokenizer.fit_on_texts(df['Processed_Reviews'])
list_tokenized_train = tokenizer.texts_to_sequences(df['Processed_Reviews'])

# padding: every sequence is padded/truncated to exactly `maxlen` token ids
maxlen = 130
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
# Targets as a NumPy array, the input form documented for Keras `fit`.
y = df['sentiment'].to_numpy()

In [None]:
# build model
def build_model(steps_per_epoch=None, batch_size=100, validation_split=0.2):
    """Build and compile the bidirectional-LSTM binary sentiment classifier.

    Architecture: Embedding(max_features, 128) -> Bidirectional LSTM(32)
    returning the full sequence -> global max pooling -> Dense(20, relu, L2)
    -> Dropout(0.05) -> Dense(1, sigmoid). Compiled with Adam on an
    inverse-time-decay learning-rate schedule and binary cross-entropy.

    Args:
        steps_per_epoch: number of optimizer steps in one epoch; it scales the
            learning-rate decay. When None it is derived from the notebook
            global `X_t` together with `batch_size` and `validation_split`,
            so `X_t` must already exist in that case.
        batch_size: batch size assumed when deriving `steps_per_epoch`.
        validation_split: fraction of samples held out for validation, assumed
            when deriving `steps_per_epoch`.

    Returns:
        The compiled `Sequential` model.
    """
    if steps_per_epoch is None:
        n_train = int(len(X_t) * (1 - validation_split))
        steps_per_epoch = max(1, n_train // batch_size)

    embed_size = 128
    model = Sequential()
    model.add(Embedding(max_features, embed_size))
    # Bidirectional LSTM; return_sequences=True returns the hidden state h_t
    # of every timestep so the pooling layer below can reduce over time.
    model.add(Bidirectional(LSTM(32, return_sequences = True)))
    model.add(GlobalMaxPool1D())
    # kernel_regularizer may be either l2 or l1
    model.add(Dense(20, kernel_regularizer=regularizers.l2(0.0001), activation="relu"))
    # dropout rate
    model.add(Dropout(0.05))
    model.add(Dense(1, activation="sigmoid"))

    # lr(step) = 0.001 / (1 + decay_rate * step / decay_steps): with
    # decay_steps = 1000 epochs' worth of steps the rate halves after 1000
    # epochs. Uses the already-imported `keras.optimizers` module, so no
    # separate `tf` import is required.
    lr_schedule = optimizers.schedules.InverseTimeDecay(
        0.001,
        decay_steps=steps_per_epoch*1000,
        decay_rate=1,
        staircase=False)

    model.compile(
        optimizer=optimizers.Adam(lr_schedule),
        loss='binary_crossentropy',
        metrics=['accuracy'])

    return model

In [None]:
batch_size = 100
epochs = 3

# Instantiate the network; without this call `model` is never defined.
model = build_model()

# `patience` is the number of epochs without val_loss improvement that are
# tolerated before training is stopped.
# NOTE(review): with epochs=3 a patience of 10 can never trigger; lower
# `patience` or raise `epochs` if early stopping is meant to be active.
early_stop = EarlyStopping(monitor='val_loss', patience=10)

# validation_split holds out the LAST 20% of the rows of X_t / y (taken before
# shuffling), so the validation set depends on the row order of the data.
history = model.fit(
    X_t,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    verbose=2,
    callbacks=[early_stop],
)

In [None]:
from sklearn.metrics import f1_score, confusion_matrix

# quoting=3 (QUOTE_NONE) keeps the literal double quotes in every field.
df_test = pd.read_csv("../input/word2vec-nlp-tutorial/testData.tsv", header=0, delimiter="\t", quoting=3)
df_test["review"] = df_test["review"].apply(clean_text)
# The id is split on "_" and the second part parsed as an integer score;
# scores >= 5 are labeled positive (1), everything else negative (0).
# NOTE(review): presumably that suffix is the review's star rating - confirm.
df_test["sentiment"] = df_test["id"].map(lambda x: 1 if int(x.strip('"').split("_")[1]) >= 5 else 0)
y_test = df_test["sentiment"]

# Reuse the tokenizer fitted on the training text and the same `maxlen`.
list_sentences_test = df_test["review"]
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

prediction = model.predict(X_te)
# Threshold the sigmoid outputs and flatten (n, 1) -> (n,) integer labels.
y_pred = (prediction > 0.5).astype(int).ravel()

# scikit-learn metrics take (y_true, y_pred) in that order; with the arguments
# swapped the confusion matrix comes out transposed.
print('F1-score: {0}'.format(f1_score(y_test, y_pred)))
print('Confusion matrix:')
confusion_matrix(y_test, y_pred)