# Text CNN Sentiment
### 1. Import dependencies

In [1]:
import math
import numpy as np
import pandas as pd
import os
import math
import random
import codecs
from pathlib import Path

import mindspore
import mindspore.dataset as ds
import mindspore.nn as nn
from mindspore import Tensor
from mindspore import context
from mindspore.train.model import Model
from mindspore.nn.metrics import Accuracy
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.ops import operations as ops

from easydict import EasyDict as edict

# Notebook-wide settings, exposed through attribute access (cfg.batch_size, ...).
# data_path, word_len, batch_size and epoch_size are consumed when the corpus
# and the training dataset are built further down; device_target / device_id
# are handed to the MindSpore context right below.
cfg = edict(
    name='movie review',
    pre_trained=False,
    num_classes=2,
    batch_size=64,
    epoch_size=4,
    weight_decay=3e-5,
    data_path='./data/',
    device_target='CPU',
    device_id=0,
    keep_checkpoint_max=1,
    checkpoint_path='./ckpt/train_textcnn-4_149.ckpt',
    word_len=51,
    vec_length=40,
)

# Run MindSpore in graph mode on the configured device.
context.set_context(
    mode=context.GRAPH_MODE,
    device_target=cfg.device_target,
    device_id=cfg.device_id,
)

### 2. Load Dataset

In [2]:
# Show the first five raw lines of each polarity file. readline() keeps the
# trailing newline, so every printed review is followed by a blank line.
for heading, review_file in (("Negative reivews:", "./data/rt-polarity.neg"),
                             ("Positive reivews:", "./data/rt-polarity.pos")):
    with open(review_file, 'r', encoding='utf-8') as f:
        print(heading)
        for i in range(5):
            print("[{0}]:{1}".format(i, f.readline()))

Negative reivews:
[0]:simplistic , silly and tedious . 

[1]:it's so laddish and juvenile , only teenage boys could possibly find it funny . 

[2]:exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . 

[3]:[garbus] discards the potential for pathological study , exhuming instead , the skewed melodrama of the circumstantial situation . 

[4]:a visually flashy but narratively opaque and emotionally vapid exercise in style and mystification . 

Positive reivews:
[0]:the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 

[1]:the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth . 

[2]:effective but too-tepid biopic

### 3. Define data generation class

In [3]:
class Generator():
    '''Random-access wrapper that serves (data, label) samples as int32 arrays.

    ``input_list`` is a sequence whose items hold the data at position 0 and
    the label at position 1.
    '''

    def __init__(self, input_list):
        self.input_list = input_list

    def __len__(self):
        # Number of samples available.
        return len(self.input_list)

    def __getitem__(self, item):
        # Convert one stored sample into a pair of int32 numpy arrays.
        sample = self.input_list[item]
        features = np.array(sample[0], dtype=np.int32)
        label = np.array(sample[1], dtype=np.int32)
        return (features, label)


# ✅ Step 2: Define MovieReview
class MovieReview:
    '''Movie review dataset.

    Reads the two review files found directly inside ``root_dir``, tokenizes
    every line, maps words to integer ids, pads/truncates each sentence to
    ``maxlen`` ids and splits the samples into ``self.train`` / ``self.test``.

    Args:
        root_dir: directory that must contain exactly two files. A file whose
            *name* contains ``'pos'`` provides positive reviews (label 1); the
            other file provides negative reviews (label 0).
        maxlen: number of word ids kept per sentence (shorter sentences are
            padded with 0, longer ones are truncated).
        split: fraction of the data kept for training, ``0 <= split < 1``.

    Raises:
        ValueError: if ``root_dir`` is not a directory, does not hold exactly
            two files, or ``split`` is outside ``[0, 1)``.

    NOTE(review): the padding value 0 is also the id given to the first word
    added to the vocabulary, so padding and that word are indistinguishable.
    Left unchanged because shifting ids would change ``get_dict_len()``.
    '''

    # Characters deleted before the dash rules are applied.
    _STRIP_BEFORE_DASHES = str.maketrans('', '', '\n"\'.,[]():')
    # Characters deleted after the dash rules are applied.
    _STRIP_AFTER_DASHES = str.maketrans('', '', '\\0123456789`=$/*;')

    def __init__(self, root_dir, maxlen, split):
        self.path = root_dir
        self.feelMap = {'neg': 0, 'pos': 1}
        self.files = []
        self.doConvert = False

        if not Path(self.path).is_dir():
            raise ValueError("please check the root_dir!")

        # Only the files directly inside root_dir are considered (no recursion).
        root, _, filenames = next(os.walk(self.path), (self.path, [], []))
        self.files = [os.path.join(root, each) for each in filenames]

        if len(self.files) != 2:
            raise ValueError("There are {} files in the root_dir".format(len(self.files)))

        # Corpus statistics gathered while reading. Note that self.maxlen is the
        # longest sentence seen, not the padding length passed as `maxlen`.
        self.word_num = 0
        self.minlen = float("inf")
        self.maxlen = float("-inf")
        self.Pos = []
        self.Neg = []

        for filename in self.files:
            self.read_data(filename)

        self.text2vec(maxlen=maxlen)
        self.split_dataset(split=split)

    @classmethod
    def _tokenize(cls, sentence):
        '''Strip punctuation/digits from one raw line and return its words.'''
        # The order of these steps matters (e.g. "-.-" must first become "--"
        # and then vanish), so it mirrors the original chain of replacements.
        sentence = sentence.translate(cls._STRIP_BEFORE_DASHES)
        sentence = sentence.replace('--', '').replace('-', ' ')
        sentence = sentence.translate(cls._STRIP_AFTER_DASHES)
        sentence = sentence.replace('<b>', '').replace('%', '')
        # Split on single spaces only and drop the empty fragments.
        return [word for word in sentence.split(' ') if word]

    def read_data(self, filePath):
        '''Read one review file and append its tokenized lines to Pos or Neg.'''
        # Decide the label from the file name only: looking at the whole path
        # would mislabel everything when a parent directory contains "pos".
        if 'pos' in os.path.basename(filePath):
            bucket, label = self.Pos, self.feelMap['pos']
        else:
            bucket, label = self.Neg, self.feelMap['neg']

        # NOTE(review): the file is opened with the platform default encoding;
        # confirm the data files decode correctly on every target platform.
        with open(filePath, 'r') as f:
            for line in f:
                sentence = self._tokenize(line)
                if not sentence:
                    continue
                self.word_num += len(sentence)
                self.maxlen = max(self.maxlen, len(sentence))
                self.minlen = min(self.minlen, len(sentence))
                bucket.append([sentence, label])

    def text2vec(self, maxlen):
        '''Replace each sentence (in place) by a list of `maxlen` word ids.

        Ids are assigned in order of first appearance, positives first. Words
        beyond position `maxlen` are dropped and never enter the vocabulary.
        '''
        self.Vocab = dict()

        for SentenceLabel in self.Pos + self.Neg:
            vector = [0] * maxlen
            for index, word in enumerate(SentenceLabel[0]):
                if index >= maxlen:
                    break
                # A new word receives the current vocabulary size as its id.
                vector[index] = self.Vocab.setdefault(word, len(self.Vocab))
            SentenceLabel[0] = vector
        self.doConvert = True

    @staticmethod
    def _hold_out(samples, split):
        '''Cut `samples` into consecutive chunks and hold one out for testing.

        Returns a ``(test_samples, train_samples)`` pair. Each chunk holds
        ``ceil((1 - split) * len(samples))`` items; the third chunk is the
        test set, or the last chunk when fewer than three exist.
        '''
        chunk_size = math.ceil((1 - split) * len(samples))
        if chunk_size == 0:
            # Nothing to split (empty input).
            return [], list(samples)
        # Stepping through the list covers every sample and never produces an
        # empty chunk, whatever float rounding does to 1 / (1 - split).
        chunks = [samples[start:start + chunk_size]
                  for start in range(0, len(samples), chunk_size)]
        held_out = chunks.pop(min(2, len(chunks) - 1))
        return held_out, [sample for chunk in chunks for sample in chunk]

    def split_dataset(self, split):
        '''Fill self.test and self.train (train is shuffled in place).

        Raises:
            ValueError: if ``split`` is not in ``[0, 1)``.
        '''
        if not 0 <= split < 1:
            raise ValueError("split must satisfy 0 <= split < 1, got {}".format(split))

        pos_test, pos_train = self._hold_out(self.Pos, split)
        neg_test, neg_train = self._hold_out(self.Neg, split)

        self.test = pos_test + neg_test
        self.train = pos_train + neg_train
        random.shuffle(self.train)

    def get_dict_len(self):
        '''Return the vocabulary size, or -1 if text2vec has not run yet.'''
        if self.doConvert:
            return len(self.Vocab)
        else:
            print("Haven't finished Text2Vec")
            return -1

    @staticmethod
    def _batched_dataset(samples, batch_size):
        '''Wrap `samples` in an unshuffled GeneratorDataset of full batches.'''
        dataset = ds.GeneratorDataset(
            source=Generator(input_list=samples),
            column_names=["data", "label"],
            shuffle=False
        )
        # drop_remainder=True discards the final partial batch.
        return dataset.batch(batch_size=batch_size, drop_remainder=True)

    def create_train_dataset(self, epoch_size, batch_size):
        '''Batched training dataset, repeated `epoch_size` times.'''
        dataset = self._batched_dataset(self.train, batch_size)
        dataset = dataset.repeat(epoch_size)
        return dataset

    def create_test_dataset(self, batch_size):
        '''Batched test dataset (the last partial batch is dropped).'''
        return self._batched_dataset(self.test, batch_size)

In [4]:
# Build the corpus (90% of the data kept for training), then the batched
# training dataset that is repeated once per epoch.
instance = MovieReview(
    root_dir=cfg.data_path,
    maxlen=cfg.word_len,
    split=0.9,
)
dataset = instance.create_train_dataset(
    epoch_size=cfg.epoch_size,
    batch_size=cfg.batch_size,
)
# NOTE(review): the dataset is already repeated epoch_size times, so this is
# presumably the step count over all epochs rather than per epoch - confirm.
batch_num = dataset.get_dataset_size()

In [5]:
# Report the vocabulary size, then display only the first training batch
# together with the second sentence of that batch.
vocab_size = instance.get_dict_len()
print("vocab_size:{0}".format(vocab_size))

item = dataset.create_dict_iterator()
for i, data in enumerate(item):
    if i >= 1:
        break
    print(data)
    print(data['data'][1])

vocab_size:18848
{'data': Tensor(shape=[64, 51], dtype=Int32, value=
[[ 1090,   411,  1329 ...     0,     0,     0],
 [  339,   508,   509 ...     0,     0,     0],
 [15496, 15497,  2049 ...     0,     0,     0],
 ...
 [  128,    15, 17940 ...     0,     0,     0],
 [ 5970, 13894,  3470 ...     0,     0,     0],
 [ 2031,   160,    90 ...     0,     0,     0]]), 'label': Tensor(shape=[64], dtype=Int32, value= [1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 
 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 
 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1])}
[ 339  508  509 2169  152    4   14   32   27 2170 1865 2171  359   11
 2172 2173 2054  253 2174  180   32   82    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0]


In [6]:
# Per-step learning-rate list made of three segments concatenated in the
# order warm-up -> constant 1e-3 -> shrink.
warm_epochs = math.floor(cfg.epoch_size / 5)
shrink_epochs = math.floor(cfg.epoch_size * 3 / 5)
normal_epochs = cfg.epoch_size - warm_epochs - math.floor(cfg.epoch_size * 2 / 5)

# NOTE(review): batch_num is the outer loop, so within warm_up and shrink the
# rates cycle through the per-epoch values on consecutive steps instead of
# staying constant for a whole epoch - confirm this is intended.
# NOTE(review): shrink uses 3/5 of the epochs while normal_run subtracts 2/5,
# so the three segments can add up to more than epoch_size epochs - confirm.
warm_up = [1e-3 / warm_epochs * (i + 1)
           for _ in range(batch_num)
           for i in range(warm_epochs)]
shrink = [1e-3 / (16 * (i + 1))
          for _ in range(batch_num)
          for i in range(shrink_epochs)]
normal_run = [1e-3 for _ in range(batch_num * normal_epochs)]

learning_rate = warm_up + normal_run + shrink