# Data Prediction by ALBERT

## Import packages

In [14]:
import re
import numpy as np
import pandas as pd

import pipline

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras

import sentencepiece as spm
import tokenization

In [2]:
import os

# Keep the TF-Hub module cache in a `tfhub_cache` folder resolved against the
# current working directory instead of a machine-specific absolute path, so the
# notebook is runnable from any checkout. The SentencePiece model is loaded
# further down from this same `tfhub_cache` folder via a relative path, so the
# two locations stay in agreement.
os.environ["TFHUB_CACHE_DIR"] = os.path.abspath("tfhub_cache")

In [3]:
# Folder (relative to the working directory) that holds the cleaned CSV files.
src = "data/cleaned_data"

## Define Constants

In [4]:
# Passed as `seq_len` when the fitting data is built.
# NOTE(review): ALBERT checkpoints are usually limited to 512 positions -
# confirm 1600 is intended if these sequences are fed to the model.
INPUT_SEQUENCE_LEN = 1_600
# Number of training epochs (presumably consumed by a later fit call; not used
# in the cells shown here).
EPOCHS_COUNT = 20
# Passed as `max_vocab` when the fitting data is built.
MAX_VOCAB = 62_000

## Read data

In [5]:
# Load the cleaned train and test review tables from `src`.
# NOTE(review): the train file is tagged 'sol2415' while the test file is
# tagged 'sol241' - confirm both file names are the intended ones.
train, test = (
    pd.read_csv(f'{src}/train_sol2415.csv'),
    pd.read_csv(f'{src}/test_sol241.csv'),
)

In [6]:
# Cast every review to `str` in both tables so that the string operations
# applied afterwards (e.g. `.split()`) work on every row.
train['review'], test['review'] = (
    train['review'].astype(str),
    test['review'].astype(str),
)

## Data processing

In [7]:
# Map star ratings onto three classes:
#   rating 4 -> label 1, rating 5 -> label 2, every other rating -> label 0.
train['label'] = 0
train.loc[train['rating'] == 4, 'label'] = 1
train.loc[train['rating'] == 5, 'label'] = 2
train

Unnamed: 0,review_id,review,rating,label
0,0,ga disappointed neat products meletot hilsnyaa...,1,0
1,1,rdtanya replace broken glass broken chargernya,1,0
2,2,nyesel bngt dsni shopping antecedent photo mes...,1,0
3,3,sent a light blue suit goods ga want a refund,1,0
4,4,pendants came with dents and scratches on its ...,1,0
...,...,...,...,...
146806,146806,excellent product quality delivery speed is ve...,5,2
146807,146807,thanks gan,5,2
146808,146808,awesome awesome quality merchandise value cp v...,5,2
146809,146809,nice packing boxes made effective price fast s...,5,2


In [8]:
# Normalise whitespace in the training reviews: strip leading/trailing blanks
# and collapse every run of whitespace into a single space.
train['review'] = [' '.join(review.split()) for review in train['review']]

In [None]:
# Hand the labelled training frame to the project's FittingData helper and
# unpack the four collections it returns (presumably a train / hold-out split
# of inputs and labels - confirm against `pipline`), then report the sizes of
# the two input sets.
trainfit = pipline.FittingData(
    train,
    training_col='review',
    label_col='label',
    max_vocab=MAX_VOCAB,
    seq_len=INPUT_SEQUENCE_LEN,
)
train_x, test_x, train_y, test_y = trainfit.get_fitting_data()
print(*map(len, (train_x, test_x)))

## Build ALBERT Pre-trained Model

In [9]:
# Wrap the TF-Hub ALBERT base (v2) module as a Keras layer using its 'tokens'
# signature; `trainable=False` keeps the pre-trained weights frozen.
module = hub.KerasLayer(
    'https://tfhub.dev/google/albert_base/2',
    signature='tokens',
    trainable=False,
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [15]:
sp = spm.SentencePieceProcessor()
# Location of the SentencePiece model inside the local TF-Hub cache folder
# (the hashed directory is presumably the cached ALBERT module - confirm).
# The path is assembled with os.path.join so that the separator matches the
# host OS: on Windows this is the same string as before, and on Linux/macOS it
# now resolves instead of being treated as one file name full of backslashes.
MODEL_PATH = os.path.join(
    'tfhub_cache',
    '098d91f064a4f53dffc7633d00c3d8e87f3a4716',
    'assets',
    '30k-clean.model',
)
# Kept as the last expression so the cell still displays the load result.
sp.load(MODEL_PATH)

True

In [16]:
# Enumerate the whole SentencePiece vocabulary as [piece, id] pairs, in id
# order, and display it.
vocabs = [
    [sp.id_to_piece(piece_id), piece_id]
    for piece_id in range(sp.get_piece_size())
]
vocabs

[['<pad>', 0],
 ['<unk>', 1],
 ['[CLS]', 2],
 ['[SEP]', 3],
 ['[MASK]', 4],
 ['(', 5],
 [')', 6],
 ['"', 7],
 ['-', 8],
 ['.', 9],
 ['–', 10],
 ['£', 11],
 ['€', 12],
 ['▁', 13],
 ['▁the', 14],
 [',', 15],
 ['▁of', 16],
 ['▁and', 17],
 ['s', 18],
 ['▁in', 19],
 ['▁to', 20],
 ['▁a', 21],
 ["'", 22],
 ['▁was', 23],
 ['▁he', 24],
 ['▁is', 25],
 ['▁for', 26],
 ['▁on', 27],
 ['▁as', 28],
 ['▁with', 29],
 ['▁that', 30],
 ['▁i', 31],
 ['▁it', 32],
 ['▁his', 33],
 ['▁by', 34],
 ['▁at', 35],
 ['▁her', 36],
 ['▁from', 37],
 ['t', 38],
 ['▁she', 39],
 ['▁an', 40],
 ['▁had', 41],
 ['▁you', 42],
 ['d', 43],
 ['▁be', 44],
 [':', 45],
 ['▁were', 46],
 ['▁but', 47],
 ['▁this', 48],
 ['i', 49],
 ['▁are', 50],
 ['▁my', 51],
 ['▁not', 52],
 ['▁one', 53],
 ['▁or', 54],
 ['▁me', 55],
 ['▁which', 56],
 ['▁have', 57],
 ['a', 58],
 ['▁they', 59],
 ['?', 60],
 ['▁him', 61],
 ['e', 62],
 ['▁has', 63],
 ['▁first', 64],
 ['▁all', 65],
 ['▁their', 66],
 ['▁also', 67],
 ['ing', 68],
 ['ed', 69],
 ['▁out', 70],
 ['▁