# Text Classification with Non-Contextual Word Embeddings (Word2Vec) and a Deep Learning Algorithm

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import gensim
from tensorflow import keras

In [2]:
# Load the train/test splits and encode the string labels as integers
# ('no' -> 0, 'yes' -> 1).
label_to_id = {'no': 0, 'yes': 1}

train_data = pd.read_csv("./data_worthcheck/train.csv")
train_data["label"] = train_data["label"].map(label_to_id)

test_data = pd.read_csv("./data_worthcheck/test.csv")
test_data["label"] = test_data["label"].map(label_to_id)

In [3]:
# Class balance of the training split: rows per label and the overall total.
len_train = len(train_data)
len_train_0 = int((train_data["label"] == 0).sum())
len_train_1 = int((train_data["label"] == 1).sum())

print("TRAIN DATA")
print("NO: ", len_train_0)
print("YES: ", len_train_1)
# The sum should equal the total when every row carries a 0/1 label.
print("NO + YES = ", len_train_0 + len_train_1)
print("TOTAL: ", len_train)

TRAIN DATA
NO:  15512
YES:  6089
NO + YES =  21601
TOTAL:  21601


In [4]:
# Class balance of the test split: rows per label and the overall total.
len_test = len(test_data)
len_test_0 = int((test_data["label"] == 0).sum())
len_test_1 = int((test_data["label"] == 1).sum())

print("TEST DATA")
print("NO: ", len_test_0)
print("YES: ", len_test_1)
# The sum should equal the total when every row carries a 0/1 label.
print("NO + YES = ", len_test_0 + len_test_1)
print("TOTAL: ", len_test)

TEST DATA
NO:  2093
YES:  707
NO + YES =  2800
TOTAL:  2800


## Data Preprocessing

In [5]:
# Indonesian stopword list taken from NLTK's `stopwords` corpus reader.
# NOTE(review): presumably requires the corpus to be available locally
# (e.g. via nltk.download('stopwords')) — confirm for fresh environments.
indonesian_stopwords = stopwords.words('indonesian')

In [6]:
# Removing Stopwords and Tokenizing


def _tokenize_without_stopwords(sentences, stop_words):
    """Split each sentence on whitespace and drop stopword tokens.

    Args:
        sentences: Iterable of strings (e.g. a pandas Series of texts).
        stop_words: Iterable of tokens to discard.

    Returns:
        A list with one list of remaining tokens per input sentence.
    """
    # Build the lookup set once: membership in a list is a linear scan,
    # which is paid for every token of every sentence.
    stop_words = frozenset(stop_words)
    # str.split() with no argument collapses runs of whitespace, so repeated
    # spaces no longer produce empty-string tokens (which would otherwise
    # end up in the embedding vocabulary).
    return [
        [token for token in sentence.split() if token not in stop_words]
        for sentence in sentences
    ]


# Train
train_stop_removed = _tokenize_without_stopwords(
    train_data["text_a"], indonesian_stopwords
)

# Test
test_stop_removed = _tokenize_without_stopwords(
    test_data["text_a"], indonesian_stopwords
)


## Model

In [7]:
# Word2Vec: train skip-gram (sg=1) embeddings on the stopword-filtered
# training tokens. Words seen fewer than `min_count` times are left out of
# the vocabulary.
word2vec = gensim.models.Word2Vec(
    train_stop_removed,
    # Spelled out (same value as the library default) because the embedding
    # width must match the width expected by the downstream padding/model cells.
    vector_size=100,
    window=8,
    min_count=3,
    workers=3,
    sg=1
)


In [8]:
# vectorizer
def vectorize(tokenized, word2vec):
    """Replace every token with its Word2Vec embedding.

    Args:
        tokenized: Iterable of sentences, each an iterable of string tokens.
        word2vec: Trained model exposing ``wv`` (keyed vectors) with
            ``key_to_index``, ``vector_size`` and item lookup by token.

    Returns:
        A list with one list of 1-D vectors per sentence. Tokens missing from
        the model vocabulary are represented by an all-zero vector.
    """
    keyed_vectors = word2vec.wv
    vocabulary = keyed_vectors.key_to_index
    # Take the width from the model instead of assuming 100, so the
    # out-of-vocabulary vector always matches the real embedding size.
    dim = keyed_vectors.vector_size
    return [
        [keyed_vectors[w] if w in vocabulary else np.zeros(dim) for w in sentence]
        for sentence in tokenized
    ]

# padding
def padder(vectorized, max_length, vector_size=None):
    """Pad or truncate every sentence to exactly ``max_length`` vectors.

    Args:
        vectorized: Sequence of sentences, each a sequence of 1-D vectors
            of equal width.
        max_length: Number of time steps each output matrix must have.
        vector_size: Width of the vectors. When omitted it is inferred from
            the first non-empty sentence, falling back to 100 if every
            sentence is empty.

    Returns:
        A list of ``(max_length, vector_size)`` arrays. Short sentences are
        zero-padded at the end; long ones are cut after ``max_length``
        vectors; empty sentences become all-zero matrices.
    """
    if vector_size is None:
        # len() rather than truthiness: a sentence may be an ndarray, whose
        # truth value is ambiguous.
        vector_size = next(
            (len(sent[0]) for sent in vectorized if len(sent) > 0), 100
        )

    padded = []
    for sent_vec in vectorized:
        # Start from zeros and copy the kept vectors in. Unlike np.append,
        # this also works for a sentence with no vectors, and every element
        # of the result has the same type and shape.
        matrix = np.zeros((max_length, vector_size))
        kept = sent_vec[:max_length]
        if len(kept) > 0:
            matrix[:len(kept)] = kept
        padded.append(matrix)
    return padded

In [9]:
# Embed the training sentences, force each to `max_length` time steps, and
# stack the result into a single array for the network.
max_length = 50
X_train = np.array(
    padder(vectorize(train_stop_removed, word2vec), max_length=max_length)
)

In [10]:
max_length = 50

# MODEL
# Three stacked LSTM layers (200 -> 100 -> 50 units) with dropout after the
# second and third, followed by a single sigmoid unit for the binary label.
model = keras.Sequential()
# Input shape is (time steps, embedding width), read from the training array.
model.add(keras.layers.LSTM(
    200,
    input_shape=(X_train.shape[1], X_train.shape[2]),
    return_sequences=True,
))
model.add(keras.layers.LSTM(100, activation='sigmoid', return_sequences=True))
model.add(keras.layers.Dropout(0.5))
# The last recurrent layer returns only its final output (no return_sequences).
model.add(keras.layers.LSTM(50, activation='sigmoid'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [11]:
# compile
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# tracked as the reported metric.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

In [12]:
# Train the compiled model on the padded training array and the 0/1 labels.
# No epochs, batch size or validation arguments are passed, so the Keras
# defaults apply.
# NOTE(review): `test` stores the return value of fit() (presumably a Keras
# History object), not test-set results — confirm before renaming.
test = model.fit(X_train, train_data["label"])

