In [9]:
import tensorflow as tf
import pandas as pd
import os
import numpy as np
from typing import Dict,List,Tuple
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding
from gensim.models import Word2Vec
import re as regex

# Shorthand names used in the type annotations of the helper functions below.
# NOTE(review): np.array is the array-construction function, while the array
# class itself is np.ndarray -- confirm which one was intended for annotations.
array = np.array
tensor_array = tf.Tensor

In [10]:
# Word-replacement lookup built from the CSV: column "t_baku" supplies the keys
# and column "baku" the values (presumably non-standard -> standard spellings;
# verify against the CSV).
BAKU = (
    pd.read_csv("utils_info/kamus_alay.csv")
    .set_index("t_baku")["baku"]
    .to_dict()
)
def replace_at_to_User(text: str) -> str:
    """Replace every ``@handle`` mention in ``text`` with the token ``user``.

    A handle is ``@`` followed by one or more ASCII letters, digits or
    underscores. Underscores are included so that a mention such as
    ``@budi_123`` collapses to a single ``user`` token instead of leaving
    ``_123`` glued to it.

    Args:
        text: Input string.

    Returns:
        ``text`` with each mention substituted by ``user``.
    """
    return regex.sub(r"@[a-zA-Z0-9_]+", "user", text)

def remove_tanda(text : str) -> str:
    """Delete punctuation characters from ``text``.

    Every run of characters matched by the character class below is replaced
    by the empty string; nothing is inserted in its place.

    Args:
        text: Input string.

    Returns:
        ``text`` without the matched punctuation.
    """
    # NOTE(review): the backslash inside the class only escapes the closing
    # bracket, so backslash characters themselves are left in the text.
    punctuation_run = "[!\"#%$&\'@()*+,-./:;<=>?[\\]^_`{|}~]+"
    return regex.sub(punctuation_run, "", text)

def remove_links(text : str) -> str:
    """Delete colon-containing tokens (such as URLs) from ``text``.

    The pattern matches zero or more non-whitespace characters, a colon, and
    one or more non-whitespace characters. It therefore removes URLs like
    ``https://example.com`` but also any other token shaped like ``a:b``
    (for example ``10:30``). A colon followed by whitespace is not matched.
    Whitespace around a removed token is left in place.

    Args:
        text: Input string.

    Returns:
        ``text`` with every match removed.
    """
    # Raw string: the previous plain literal relied on an invalid escape
    # sequence, which Python warns about.
    return regex.sub(r"\S*:\S+", "", text)

def modified_has_tag(text : str)-> str:
    """Spell out hashtag markers as words.

    Each ``#`` character is replaced by ``"kata kunci "`` and trailing
    whitespace is then stripped from the result.

    Args:
        text: Input string.

    Returns:
        The rewritten string.
    """
    expanded = text.replace("#", "kata kunci ")
    return expanded.rstrip()

def stemming_data(text : str) -> str:
    """Stem ``text`` using a freshly constructed ``MPStemmer``.

    NOTE(review): ``MPStemmer`` is not imported in the visible part of this
    notebook; unless it is brought into scope elsewhere, calling this
    function raises ``NameError``.

    Args:
        text: Sentence to stem.

    Returns:
        Whatever ``MPStemmer().stem_kalimat(text)`` returns.
    """
    stemmer = MPStemmer()
    return stemmer.stem_kalimat(text)

def map_to_baku(text, baku):
    """Replace whitespace-separated words of ``text`` using the ``baku`` mapping.

    ``text`` is split on whitespace; each word that is a key of ``baku`` is
    swapped for its mapped value, other words are kept. The words are joined
    back with single spaces, so runs of whitespace collapse.

    Args:
        text: Input string.
        baku: Mapping from word to replacement.

    Returns:
        The re-joined string.
    """
    return " ".join(baku.get(word, word) for word in text.split())

#%%
def preprocessing_data(text):
    """Run the text-cleaning pipeline on one string.

    Steps, in order: lowercase, word replacement through ``BAKU``
    (``map_to_baku``), ``remove_links``, ``replace_at_to_User``,
    ``modified_has_tag``, ``remove_tanda``. Stemming (``stemming_data``) is
    not applied here.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.
    """
    cleaned = map_to_baku(text.lower(), BAKU)
    for step in (remove_links, replace_at_to_User, modified_has_tag, remove_tanda):
        cleaned = step(cleaned)
    return cleaned

In [11]:
def prepare_data(X : array , Y : array, max_features : int ,
                 max_length : int, oov_token : str="<OOV>",
                 padding_type : str = "post",
                 num_classes : int = 2)->Tuple[object,array,array ]:
    """Fit a tokenizer on ``X`` and convert texts and labels to model inputs.

    Args:
        X: Texts; the tokenizer is fitted on them and they are then converted
            to integer sequences.
        Y: Labels; cast to ``int`` before one-hot encoding.
        max_features: Forwarded to ``Tokenizer(num_words=...)``.
        max_length: Forwarded to ``pad_sequences(maxlen=...)``.
        oov_token: Forwarded to ``Tokenizer(oov_token=...)``.
        padding_type: Forwarded to ``pad_sequences(padding=...)``.
        num_classes: Width of the one-hot label encoding. Defaults to 2, the
            value that used to be hard-coded.

    Returns:
        ``(tokenizer, x_padded, y_one_hot)``: the fitted tokenizer, the padded
        integer sequences, and the one-hot encoded labels.
    """
    # np.asarray lets plain lists through; arrays are passed on unchanged.
    labels = np.asarray(Y).astype(int)
    tokenizer = Tokenizer(num_words=max_features, oov_token=oov_token)
    tokenizer.fit_on_texts(X)
    x_sequences = tokenizer.texts_to_sequences(X)
    x_padded = pad_sequences(x_sequences, maxlen=max_length, padding=padding_type)
    y_one_hot = to_categorical(labels, num_classes=num_classes)
    return tokenizer, x_padded, y_one_hot

def prepare_word_embeddings(raw_x : array, min_count : int, vector_size : int, window:int, sg:int,seed:int):
    """Train a Word2Vec model on whitespace-tokenised texts and save it.

    Each element of ``raw_x`` is split on whitespace to form one training
    sentence. The trained model is saved to
    ``utils_info/gensim<vector_size>.model`` and then returned.

    Args:
        raw_x: Iterable of text strings.
        min_count, vector_size, window, sg, seed: Forwarded unchanged to
            ``Word2Vec``.

    Returns:
        The trained ``Word2Vec`` model.
    """
    sentences = [text.split() for text in raw_x]
    model = Word2Vec(
        sentences,
        min_count=min_count,
        vector_size=vector_size,
        window=window,
        sg=sg,
        seed=seed,
    )
    model.save(f"utils_info/gensim{vector_size}.model")
    return model
    

def createWordEmbeddings(w2v : object):
    """Return the model's word vectors as a DataFrame indexed by word.

    Uses the gensim 4 ``KeyedVectors`` attributes ``vectors`` and
    ``index_to_key``. The previous implementation relied on ``wv.vocab`` and
    on indexing the model object directly, neither of which exists in
    gensim 4 -- the version implied by the ``vector_size=`` keyword used when
    the model is trained in this notebook.

    Args:
        w2v: Trained model exposing ``wv.vectors`` (one row per word) and
            ``wv.index_to_key`` (the words, in the same row order).

    Returns:
        ``pandas.DataFrame`` with one row per word and one column per
        embedding dimension.
    """
    keyed_vectors = w2v.wv
    return pd.DataFrame(keyed_vectors.vectors, index=list(keyed_vectors.index_to_key))

def createEmbeddingMatrix(tokenizer : object,w2v : object, max_features : int):
    """Build an embedding weight matrix aligned with the tokenizer's indices.

    Row ``i`` receives the vector of the word whose tokenizer index is ``i``.
    Rows that no word maps to, and rows of words the embedding model does not
    know, stay all-zero.

    Args:
        tokenizer: Object exposing ``word_index`` (word -> integer index).
        w2v: Object exposing ``wv.get_vector(word)``.
        max_features: Number of columns of the matrix, i.e. the length of
            each word vector (despite the name, not a vocabulary size).

    Returns:
        NumPy array of shape ``(len(tokenizer.word_index) + 1, max_features)``.

    Raises:
        ValueError: If a word vector's length does not match ``max_features``.
            The previous bare ``except`` swallowed this and silently left the
            row at zero.
    """
    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, int(max_features)))
    for word, i in tokenizer.word_index.items():
        try:
            embedding_vector = w2v.wv.get_vector(word)
        except KeyError:
            # Word is not in the embedding vocabulary: keep the zero row.
            continue
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

def load_word_embeddings_model(path:str):
    """Load a saved Word2Vec model.

    Args:
        path: Location of the saved model file.

    Returns:
        The object returned by ``Word2Vec.load(path)``.
    """
    model = Word2Vec.load(path)
    return model

def prepare_embedding_layer(tokenizer : object, max_features:int, embedding_matrix : array, train : bool =False):
    """Create a Keras ``Embedding`` layer initialised from ``embedding_matrix``.

    Args:
        tokenizer: Object exposing ``word_index``; the layer's ``input_dim``
            is ``len(tokenizer.word_index) + 1``.
        max_features: Used as the layer's ``output_dim``.
        embedding_matrix: Initial weights, passed as ``weights=[...]``.
        train: Value for the layer's ``trainable`` flag.

    Returns:
        The constructed ``Embedding`` layer.
    """
    vocab_rows = len(tokenizer.word_index) + 1
    return Embedding(
        input_dim=vocab_rows,
        output_dim=max_features,
        weights=[embedding_matrix],
        trainable=train,
    )

In [12]:


#* just for naming types convention
array = np.array
#%%
train_data = pd.read_csv("./PreprocessedDataset/indolem/train.csv")
test_data = pd.read_csv("PreprocessedDataset/indolem/test.csv")
validation_data = pd.read_csv("PreprocessedDataset/indolem/validation.csv")

train_data_cleaned = train_data["sentence"].apply(preprocessing_data)
validation_data_cleaned = validation_data["sentence"].apply(preprocessing_data)

X : array = train_data_cleaned.values
Y : array = train_data["sentiment"].values
max_features :int= 10000
max_length :int= 30

#* Params For WordEmbeddings
min_count : int = 3
vector_size : int = max_length
window : int = 5
sg : int = 1
seed : int = 0

#* Preparing WordEmbeddings For Models
# Reuse a saved model when one exists, otherwise train and save a new one.
# The path matches the one prepare_word_embeddings saves to. The old check
# passed a glob pattern to os.path.exists (which does not expand patterns) and
# pointed at "utils.info" instead of "utils_info", so it always retrained.
# The file name depends only on vector_size: delete the file to force
# retraining after changing the data or the other hyper-parameters.
w2v_model_path : str = f"utils_info/gensim{vector_size}.model"
if os.path.exists(w2v_model_path):
    w2v_model : object = load_word_embeddings_model(w2v_model_path)
else:
    w2v_model : object = prepare_word_embeddings(X, min_count, vector_size, window, sg, seed)


In [16]:
# Preview the cleaned training sentences (the Series repr is truncated).
train_data_cleaned

0        kangen nabil user user raga user user satu aca...
1        doa untuk orang yang memberi makan iya allah b...
2        setiap kali handphone saya bunyi saya selalu b...
3        belum pernah sedekat ini wawancara dengan afga...
4        dulu masa first pergi award show amatlah malas...
                               ...                        
18192    kamar 310 pintu kamar mandi nya tidak bisa di ...
18193    tas hermes rp15 miliar dipakai belanja sayur o...
18194                  deposit terlalu besar dan kamar bau
18195    tanpa baju ruang angkasa darah seorang astrono...
18196    cemburu itu tidak enak cemburu itu makan hati ...
Name: sentence, Length: 18197, dtype: object

In [17]:
# Preview the cleaned validation sentences (the Series repr is truncated).
validation_data_cleaned

0             pagi user user jumat berkah buat kita semua
1       saya janji tidak akan pernah pergi dari dia se...
2       kata kunci golkar kata kunci arb banyak member...
3       tolak hamas langkah pemerintah dipuji partai d...
4       sudirman masih calon kuat ketua dewan pimpinan...
                              ...                        
1990    ac tidak dingin lantai ktor pmesanan sebelum n...
1991    seprei ada bekas noda televisi tidak berfungsi...
1992    layanannya kurang pelayannya kurang cepat tanggap
1993    ac nya kurang dingin di laman traveloka gambar...
1994    tidak mampu beli xuser dua kantor pemerintah n...
Name: sentence, Length: 1995, dtype: object

In [13]:
# Fit the tokenizer on the training texts, then pad the sequences and one-hot
# encode the labels.
tokenizer, train_data_model, train_label_model = prepare_data(
    X=X, Y=Y, max_features=max_features, max_length=max_length
)
train_data_model

array([[ 819, 2733,   16, ...,    0,    0,    0],
       [1658,   17,   80, ...,    0,    0,    0],
       [ 273,  130,   92, ...,    0,    0,    0],
       ...,
       [ 663,  143,  174, ...,    0,    0,    0],
       [ 196,   79,  446, ...,    0,    0,    0],
       [1434,   40,    2, ...,    0,    0,    0]])

In [14]:
# The third argument is the width of each word vector, so pass vector_size
# (the Word2Vec dimension) rather than max_length (the padded sequence length).
# The two only happen to be equal in this notebook.
embedding_matrix : array = createEmbeddingMatrix(tokenizer, w2v_model, vector_size)
embedding_matrix


array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.77532756, -0.39504328, -0.03570122, ..., -0.11676707,
         0.46571836,  0.44276375],
       ...,
       [-0.38438079,  0.05596161,  0.01910686, ..., -0.0320237 ,
        -0.19091289,  0.32373103],
       [-0.4099164 ,  0.10508002,  0.13727538, ...,  0.12667809,
        -0.17868733,  0.43066597],
       [-0.39311603,  0.06210523,  0.08185337, ...,  0.0710192 ,
        -0.33428365,  0.46476793]])

In [15]:
# output_dim must equal the number of columns of the weight matrix, so read it
# from the matrix instead of reusing max_length (the padded sequence length).
embedding_layer = prepare_embedding_layer(tokenizer, embedding_matrix.shape[1], embedding_matrix)
embedding_layer

<tensorflow.python.keras.layers.embeddings.Embedding at 0x1f557dceb38>