In [None]:
import time
start_time = time.time()
import os
# Thread-pool limits such as OMP_NUM_THREADS are generally read when the native
# numeric libraries initialise, so export the value before NumPy, pandas and
# scikit-learn are imported instead of after them.
os.environ["OMP_NUM_THREADS"] = "12"
from sklearn.model_selection import train_test_split
import sys, os, re, csv, codecs, numpy as np, pandas as pd
# Seeds NumPy's global random generator only.
np.random.seed(32)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input,  CuDNNLSTM, Embedding, Dropout, Activation, Conv1D, CuDNNGRU
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from tensorflow.keras import backend as K
from keras.engine import InputSpec, Layer

import logging
from sklearn.metrics import roc_auc_score
from tensorflow.keras.callbacks import Callback

In [None]:
# Pre-trained word vectors (absolute, machine-specific path) and the three
# CSV splits, read in the order train, test, validation.
EMBEDDING_FILE = '/home/xq/data/embed/wiki.zh.vec'
train, test, val = (
    pd.read_csv(csv_path)
    for csv_path in ("../inputs/train.csv", "../inputs/testa.csv", "../inputs/vali.csv")
)
# Text of every split: missing entries are replaced by the placeholder "无"
# and the result is lower-cased.
X_train, x_val, X_test = (
    frame["content"].fillna("无").str.lower()
    for frame in (train, val, test)
)

In [None]:
# Label column names: every column of the training frame after the first two.
columns = list(train.columns[2:])

In [None]:
from keras.utils.np_utils import to_categorical
from typing import List
def get_y(df:pd.DataFrame, cols:List[str]=columns) -> List[np.array]:
    """Return one one-hot target matrix per label column.

    For every name in ``cols`` the column's values are shifted by +2 and
    handed to ``to_categorical`` with ``num_classes=4``.

    Args:
        df: frame holding the label columns.
        cols: label column names; defaults to the notebook-level ``columns``
            as it was when this function was defined.

    Returns:
        A list of arrays, in the same order as ``cols``.
    """
    return [to_categorical(df[name].values + 2, num_classes=4) for name in cols]

In [None]:
# One-hot targets of the training split, one array per label column.
y_train = get_y(df=train)

In [None]:
# One-hot targets of the validation split, one array per label column.
y_val = get_y(df=val)

In [None]:
# Vocabulary cap given to the tokenizer, length every sequence is padded to,
# and width of one embedding vector.
max_features = 50000
maxlen = 200
embed_size = 300

In [None]:
# Fit the vocabulary on the training and test texts together, then turn both
# splits into integer sequences padded to `maxlen`.
# NOTE(review): X_train / X_test are rebound from text to integer sequences
# here, so this cell cannot be re-run without re-running the loading cell.
# NOTE(review): Tokenizer splits on whitespace by default; if `content` is
# unsegmented Chinese text, confirm it is word-segmented upstream.
tok = Tokenizer(num_words=max_features)
tok.fit_on_texts([*X_train, *X_test])
X_train, X_test = (tok.texts_to_sequences(texts) for texts in (X_train, X_test))
x_train, x_test = (pad_sequences(seqs, maxlen=maxlen) for seqs in (X_train, X_test))

In [None]:
# Parse the pre-trained vectors (one "<token> <coefficients...>" row per line)
# into a {token: vector} lookup.
embeddings_index = {}
with open(EMBEDDING_FILE,encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        # Keep only rows shaped "<token> + embed_size numbers". This drops the
        # "<vocab_size> <dim>" header that .vec files commonly start with; its
        # 1-element vector would otherwise be broadcast over a whole matrix row
        # if the same token appeared in the vocabulary.
        if len(values) != embed_size + 1:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
word_index = tok.word_index
# Prepare the embedding matrix: row i holds the vector of the token with
# index i. Row 0 is never assigned and stays all-zeros.
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))
for word, i in word_index.items():
    # Indices beyond the matrix belong to tokens outside the kept vocabulary.
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words not found in the embedding index keep their all-zeros row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
from  tensorflow.keras.optimizers import Adam, RMSprop
from  tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, ReduceLROnPlateau
from  tensorflow.keras.layers import GRU, BatchNormalization, Conv1D, MaxPooling1D

file_path = "../ckpt/best_model_bigru_cnn_2.hdf5"
# The checkpoint callback writes to this path during training; create the
# folder up front so a missing directory cannot abort the run at the first save.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
# Keep only the weights with the lowest validation loss seen so far.
check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                              save_best_only = True, mode = "min")
# Stop after 10 epochs without a validation-loss improvement.
early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 10)
# Halve the learning rate after 2 epochs without improvement of the
# callback's default monitored quantity.
reduce_plateau = ReduceLROnPlateau(factor=0.5, patience=2, verbose = 1)

In [None]:
def build_model_0(lr = 0.0, lr_d = 0.0, units = 0, dr = 0.0, classes=20):
    """Build, train and reload the shared-feature BiGRU/BiLSTM + Conv1D model.

    A frozen embedding layer feeds a bidirectional CuDNNGRU branch and a
    bidirectional CuDNNLSTM branch. Each branch is followed by one Conv1D and
    by global average and global max pooling; the four pooled vectors are
    concatenated and shared by ``classes`` separate 4-way softmax heads.

    The model is fitted on the notebook-level ``x_train`` / ``y_train`` with
    the notebook-level callbacks; afterwards the file at ``file_path`` is
    loaded and returned.

    Args:
        lr: learning rate passed to Adam.
        lr_d: ``decay`` value passed to Adam.
        units: units passed to each recurrent layer.
        dr: rate of the spatial dropout applied to the embeddings.
        classes: number of softmax output heads.

    Returns:
        The model loaded from ``file_path`` once training has finished.
    """
    # NOTE(review): the callbacks and the checkpoint path are module-level
    # objects shared by every builder in this notebook; confirm that state or
    # files left by an earlier run cannot leak into this one.
    inp = Input(shape = (maxlen,))
    # Size the embedding table from the matrix itself: it has
    # min(max_features, vocabulary + 1) rows, so hard-coding max_features makes
    # the weights fail to load whenever the fitted vocabulary is smaller.
    embedded = Embedding(embedding_matrix.shape[0], embed_size, weights = [embedding_matrix], trainable = False)(inp)
    embedded = SpatialDropout1D(dr)(embedded)

    gru_seq = Bidirectional(CuDNNGRU(units, return_sequences = True))(embedded)
    gru_conv = Conv1D(128, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(gru_seq)

    lstm_seq = Bidirectional(CuDNNLSTM(units, return_sequences = True))(embedded)
    lstm_conv = Conv1D(128, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(lstm_seq)

    avg_pool1 = GlobalAveragePooling1D()(gru_conv)
    max_pool1 = GlobalMaxPooling1D()(gru_conv)

    avg_pool2 = GlobalAveragePooling1D()(lstm_conv)
    max_pool2 = GlobalMaxPooling1D()(lstm_conv)

    features = concatenate([avg_pool1, max_pool1, avg_pool2, max_pool2])
    # One independent softmax head per label, all reading the same features.
    outputs = [Dense(4, activation = "softmax")(features) for _ in range(classes)]
    model = Model(inputs = inp, outputs = outputs)
    model.compile(loss = "categorical_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    model.fit(x_train, y_train, batch_size = 256, epochs = 100,validation_split=0.05 ,
              verbose = 1, callbacks = [check_point, early_stop, reduce_plateau])
    # Return what the checkpoint callback stored, not the last-epoch weights.
    return load_model(file_path)
# Variant in which nothing is shared between the output heads from the convolutional layer onward.
def build_model_2(lr = 0.0, lr_d = 0.0, units = 0, dr = 0.4, classes=20):
    """Build, train and reload the BiGRU/BiLSTM model with per-head Conv1D layers.

    A frozen embedding layer feeds one bidirectional CuDNNGRU and one
    bidirectional CuDNNLSTM, both shared by all heads. Every one of the
    ``classes`` heads then owns its own pair of Conv1D layers (one per
    recurrent branch), global average/max pooling over each, and a 4-way
    softmax on the concatenated pooled vectors.

    The model is fitted on the notebook-level ``x_train`` / ``y_train`` with
    the notebook-level callbacks; afterwards the file at ``file_path`` is
    loaded and returned.

    Args:
        lr: learning rate passed to Adam.
        lr_d: ``decay`` value passed to Adam.
        units: units passed to each recurrent layer.
        dr: rate of the spatial dropout applied to the embeddings.
        classes: number of output heads.

    Returns:
        The model loaded from ``file_path`` once training has finished.
    """
    # NOTE(review): the callbacks and the checkpoint path are module-level
    # objects shared by every builder in this notebook; confirm that state or
    # files left by an earlier run cannot leak into this one.
    inp = Input(shape = (maxlen,))
    # Size the embedding table from the matrix itself: it has
    # min(max_features, vocabulary + 1) rows, so hard-coding max_features makes
    # the weights fail to load whenever the fitted vocabulary is smaller.
    embedded = Embedding(embedding_matrix.shape[0], embed_size, weights = [embedding_matrix], trainable = False)(inp)
    embedded = SpatialDropout1D(dr)(embedded)

    gru_seq = Bidirectional(CuDNNGRU(units, return_sequences = True))(embedded)
    lstm_seq = Bidirectional(CuDNNLSTM(units, return_sequences = True))(embedded)

    outputs = []
    for _ in range(classes):
        # Fresh convolution layers on every pass, so heads share no weights
        # beyond the recurrent layers.
        gru_conv = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(gru_seq)
        lstm_conv = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(lstm_seq)

        avg_pool1 = GlobalAveragePooling1D()(gru_conv)
        max_pool1 = GlobalMaxPooling1D()(gru_conv)

        avg_pool2 = GlobalAveragePooling1D()(lstm_conv)
        max_pool2 = GlobalMaxPooling1D()(lstm_conv)

        head_features = concatenate([avg_pool1, max_pool1, avg_pool2, max_pool2])
        outputs.append(Dense(4, activation = "softmax")(head_features))
    model = Model(inputs = inp, outputs = outputs)
    model.compile(loss = "categorical_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    model.fit(x_train, y_train, batch_size = 256, epochs = 100,validation_split=0.05 ,
              verbose = 1, callbacks = [check_point, early_stop, reduce_plateau])
    # Return what the checkpoint callback stored, not the last-epoch weights.
    return load_model(file_path)


In [None]:
# Train the per-head convolution variant; the returned model is the one
# reloaded from the checkpoint file.
model = build_model_2(
    lr = 1e-3,
    lr_d = 0,
    units = 128,
    dr = 0.2,
)

In [None]:
# Encode the validation texts with the already-fitted tokenizer.
# The sequences are derived from `val` (same fill/lower-case preprocessing as
# where x_val is first defined) instead of re-encoding `x_val` in place, so
# re-running this cell never feeds padded integers back into the tokenizer.
x_val = tok.texts_to_sequences(val["content"].fillna("无").str.lower())
x_val = pad_sequences(x_val, maxlen=maxlen)
# One prediction array per output head for each split.
test_pred = model.predict(x_test)
val_pred = model.predict(x_val)

In [None]:
from sklearn.metrics import f1_score
# Macro-averaged F1 of every output head on the validation split.
f1_list = []
for pred, true in zip(val_pred, y_val):
    # scikit-learn documents the order as f1_score(y_true, y_pred): ground
    # truth first, predictions second. Per-class F1 is symmetric in the two,
    # so the macro score itself should not move; the corrected order keeps
    # warnings and any later non-symmetric metric pointing the right way.
    F1 = f1_score(np.argmax(true, axis=1), np.argmax(pred, axis=1), average='macro')
    print(F1)
    f1_list.append(F1)
    

In [None]:
# Average of the per-head macro F1 scores (displayed as the cell output).
np.asarray(f1_list).mean()

In [None]:
def load_data_from_csv(file_name, header=0, encoding="utf-8"):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_name: path (or any source ``pd.read_csv`` accepts) of the CSV.
        header: forwarded to ``pd.read_csv`` as ``header``.
        encoding: forwarded to ``pd.read_csv`` as ``encoding``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(file_name, header=header, encoding=encoding)

In [None]:
# Reload the test frame and set every label column to the arg-max class of the
# matching output head, shifted by -2.
# NOTE(review): this file name differs from the CSV the test texts were read
# from earlier; confirm both hold the same rows in the same order.
test = load_data_from_csv("../inputs/sentiment_analysis_testa.csv")
for column, pred in zip(columns, test_pred):
    test[column] = pred.argmax(axis=1) - 2

In [None]:
# Show a bounded preview instead of dumping the whole prediction frame into the notebook output.
test.head()

In [None]:
# Create the destination folder first so the export cannot fail on a missing
# directory after training and prediction have already been paid for.
output_path = "../output/bigru-cnn-pooling3.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
test.to_csv(output_path, encoding="utf_8_sig", index=False)