# Text Classification with Non-Contextual Word Embedding (Word2Vec) Deep Learning Algorithm

In [171]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import gensim
from sklearn.metrics import classification_report, accuracy_score
from tensorflow import keras

import warnings
warnings.filterwarnings("ignore")

### Read Data

In [172]:
# Load both splits and encode the string labels as integers ('no' -> 0, 'yes' -> 1).
train_data = pd.read_csv("./data_worthcheck/train.csv").assign(
    label=lambda frame: frame["label"].map({'no': 0, 'yes': 1})
)
test_data = pd.read_csv("./data_worthcheck/test.csv").assign(
    label=lambda frame: frame["label"].map({'no': 0, 'yes': 1})
)

## Data Preprocessing

In [173]:
# Class balance of the training split, plus a comparison of the summed
# class counts against the total number of rows.
print("TRAIN DATA")
len_train = len(train_data)
len_train_0 = int((train_data["label"] == 0).sum())
len_train_1 = int((train_data["label"] == 1).sum())
print("NO: ", len_train_0)
print("YES: ", len_train_1)
print("NO + YES = ", len_train_0 + len_train_1)
print("TOTAL: ", len_train)

TRAIN DATA
NO:  15512
YES:  6089
NO + YES =  21601
TOTAL:  21601


In [174]:
# Class balance of the test split, plus a comparison of the summed
# class counts against the total number of rows.
print("TEST DATA")
len_test = len(test_data)
len_test_0 = int((test_data["label"] == 0).sum())
len_test_1 = int((test_data["label"] == 1).sum())
print("NO: ", len_test_0)
print("YES: ", len_test_1)
print("NO + YES = ", len_test_0 + len_test_1)
print("TOTAL: ", len_test)

TEST DATA
NO:  2093
YES:  707
NO + YES =  2800
TOTAL:  2800


In [175]:
# Get Indonesian stopwords from NLTK.
# The stopwords corpus is not bundled with the nltk package: on a fresh
# environment the first lookup raises LookupError, so download it once and
# retry instead of failing the whole "Restart & Run All".
try:
    indonesian_stopwords = stopwords.words('indonesian')
except LookupError:
    nltk.download('stopwords')
    indonesian_stopwords = stopwords.words('indonesian')

In [176]:
# Removing Stopwords and Tokenizing

# A set gives constant-time membership checks; testing against the NLTK list
# would scan the whole list for every single token.
stopword_lookup = set(indonesian_stopwords)


def remove_stopwords(sentences, stopword_lookup):
    """Tokenize each sentence on single spaces and drop stopword tokens.

    Args:
        sentences: iterable of strings (one sentence per item).
        stopword_lookup: container of tokens to discard (a set is recommended).

    Returns:
        A list with one token list per input sentence, in input order.
    """
    return [
        [word for word in sentence.split(" ") if word not in stopword_lookup]
        for sentence in sentences
    ]


# Same tokenization for both splits so train and test are processed identically.
train_stop_removed = remove_stopwords(train_data["text_a"], stopword_lookup)
test_stop_removed = remove_stopwords(test_data["text_a"], stopword_lookup)


## Model

### Word2Vec Model

In [177]:
# Word 2 Vector
# Skip-gram (sg=1) embeddings trained on the stopword-filtered training tokens.
word2vec = gensim.models.Word2Vec(
    train_stop_removed,
    # Explicit embedding width (same value as gensim's default) so the
    # dimensionality the rest of the notebook relies on is visible here.
    vector_size=100,
    window=8,
    min_count=3,
    workers=3,
    sg=1
)


### Defining Functions for Vectorizing and Padding

In [178]:
# vectorizer
from cmath import exp


def vectorize(tokenized, word2vec):
    """Replace every token with its Word2Vec embedding.

    Tokens that are not in the model's vocabulary are represented by an
    all-zero vector whose length is taken from the model itself, so the
    function works for any embedding size, not only 100.

    Args:
        tokenized: iterable of token lists (one list per sentence).
        word2vec: trained model exposing ``wv`` with ``key_to_index``,
            ``vector_size`` and item lookup by token.

    Returns:
        A list with one list of 1-D vectors per input sentence.
    """
    keyed_vectors = word2vec.wv
    dim = keyed_vectors.vector_size
    vectorized = []
    for sentence in tokenized:
        sent_vec = [
            keyed_vectors[w] if w in keyed_vectors.key_to_index else np.zeros(dim)
            for w in sentence
        ]
        vectorized.append(sent_vec)
    return vectorized

# padding
def padder(vectorized, max_length, vector_size=None):
    """Truncate or zero-pad every sentence to exactly ``max_length`` vectors.

    Args:
        vectorized: list of sentences, each a sequence of equal-length vectors.
        max_length: number of time steps every output matrix must have.
        vector_size: width of a single vector. When omitted it is inferred
            from the first non-empty sentence, falling back to 100 if every
            sentence is empty.

    Returns:
        A list of ``(max_length, vector_size)`` NumPy arrays, one per sentence.
        Padding rows are appended after the real vectors.
    """
    if vector_size is None:
        vector_size = next(
            (len(sentence[0]) for sentence in vectorized if len(sentence) > 0),
            100,
        )

    padded = []
    for sentence in vectorized:
        # Start from an all-zero matrix so empty sentences need no special case
        # (concatenating an empty list with the padding block would fail).
        matrix = np.zeros((max_length, vector_size))
        kept = min(len(sentence), max_length)
        if kept:
            matrix[:kept] = np.asarray(sentence[:max_length])
        padded.append(matrix)
    return padded

# result
def class_assigner(x, threshold=0.25):
    """Turn a predicted score into a hard class label.

    Args:
        x: predicted score for the positive class.
        threshold: decision boundary; scores strictly greater than it map to 1.
            Defaults to 0.25, the cut-off this notebook uses.

    Returns:
        1 if ``x > threshold``, otherwise 0.
    """
    return 1 if x > threshold else 0
    

### Vectorizing and Padding Train and Test Data

In [179]:
# Experiment 1 inputs: every sentence is cut or zero-padded to max_length
# embedding vectors, then stacked into a single array per split.
max_length = 50
X_train = np.array(padder(vectorize(train_stop_removed, word2vec), max_length=max_length))
X_test = np.array(padder(vectorize(test_stop_removed, word2vec), max_length=max_length))

# Ground-truth test labels as a NumPy array.
y_test = np.array(test_data["label"])

## Experiment 1

### Deep Learning Agent

In [180]:
# MODEL
# Three stacked LSTM layers (200 -> 100 -> 50 units) with dropout before the
# last one, followed by a single sigmoid unit for the binary decision.
model = keras.Sequential()
model.add(keras.layers.LSTM(200, input_shape=X_train.shape[1:], return_sequences=True))
model.add(keras.layers.LSTM(100, activation='sigmoid', return_sequences=True))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.LSTM(50, activation='sigmoid'))
model.add(keras.layers.Dense(1, activation=keras.activations.sigmoid))

In [181]:
# compile
# Binary cross-entropy loss with the Adam optimizer; accuracy is reported as the metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [182]:
# Print the layer-by-layer architecture and parameter counts of the Experiment 1 model.
model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_36 (LSTM)              (None, 50, 200)           240800    
                                                                 
 lstm_37 (LSTM)              (None, 50, 100)           120400    
                                                                 
 dropout_18 (Dropout)        (None, 50, 100)           0         
                                                                 
 lstm_38 (LSTM)              (None, 50)                30200     
                                                                 
 dense_12 (Dense)            (None, 1)                 51        
                                                                 
Total params: 391,451
Trainable params: 391,451
Non-trainable params: 0
_________________________________________________________________


### Training

In [183]:
# Train on the padded training tensors; no epochs/batch_size are passed,
# so Keras' defaults for `fit` apply.
model.fit(x=X_train, y=train_data["label"])



<keras.callbacks.History at 0x1f04c6dfb80>

In [184]:
# Predict scores for the test set, then overwrite each score in place with
# the hard 0/1 label produced by class_assigner.
y_pred = model.predict(X_test)
y_pred[:, 0] = [class_assigner(score) for score in y_pred[:, 0]]




In [185]:
# Show the thresholded predictions: one row per test sample, each now 0. or 1.
print(y_pred)

[[0.]
 [0.]
 [0.]
 ...
 [1.]
 [0.]
 [1.]]


In [186]:
# Per-class precision / recall / F1 of the Experiment 1 predictions on the test set.
print(
    "Classification report: ",
    classification_report(y_test, y_pred, target_names=['No', 'Yes']),
    sep="\n",
)

Classification report: 
              precision    recall  f1-score   support

          No       0.81      0.95      0.87      2093
         Yes       0.70      0.33      0.45       707

    accuracy                           0.80      2800
   macro avg       0.76      0.64      0.66      2800
weighted avg       0.78      0.80      0.77      2800



## Experiment 2

In [187]:
# Word 2 Vector
# Experiment 2 embeddings: skip-gram (sg=1) with a wider context window and a
# higher minimum token frequency than Experiment 1.
word2vec = gensim.models.Word2Vec(
    train_stop_removed,
    # Explicit embedding width (same value as gensim's default) so the
    # dimensionality the rest of the notebook relies on is visible here.
    vector_size=100,
    window=10,
    min_count=5,
    workers=3,
    sg=1
)

### Vectorizing and Padding Train and Test Data

In [188]:
# Experiment 2 inputs: re-vectorize both splits with the new embeddings and
# cut or zero-pad every sentence to the longer max_length.
max_length = 100
X_train = np.array(padder(vectorize(train_stop_removed, word2vec), max_length=max_length))
X_test = np.array(padder(vectorize(test_stop_removed, word2vec), max_length=max_length))

### Training

In [189]:

# MODEL
# Same layer stack as Experiment 1 (LSTM 200 -> LSTM 100 -> Dropout ->
# LSTM 50 -> sigmoid unit), sized for the Experiment 2 input tensors.
model2 = keras.Sequential()
model2.add(keras.layers.LSTM(200, input_shape=X_train.shape[1:], return_sequences=True))
model2.add(keras.layers.LSTM(100, activation='sigmoid', return_sequences=True))
model2.add(keras.layers.Dropout(0.5))
model2.add(keras.layers.LSTM(50, activation='sigmoid'))
model2.add(keras.layers.Dense(1, activation=keras.activations.sigmoid))

# compile
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the architecture and parameter counts.
model2.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_39 (LSTM)              (None, 100, 200)          240800    
                                                                 
 lstm_40 (LSTM)              (None, 100, 100)          120400    
                                                                 
 dropout_19 (Dropout)        (None, 100, 100)          0         
                                                                 
 lstm_41 (LSTM)              (None, 50)                30200     
                                                                 
 dense_13 (Dense)            (None, 1)                 51        
                                                                 
Total params: 391,451
Trainable params: 391,451
Non-trainable params: 0
_________________________________________________________________


In [190]:
# Train the Experiment 2 model; no epochs/batch_size are passed,
# so Keras' defaults for `fit` apply.
model2.fit(x=X_train, y=train_data["label"])



<keras.callbacks.History at 0x1f0c0867fd0>

In [191]:
# Predict scores with the Experiment 2 model, then overwrite each score in
# place with the hard 0/1 label produced by class_assigner.
y_pred = model2.predict(X_test)
y_pred[:, 0] = [int(class_assigner(score)) for score in y_pred[:, 0]]




In [192]:
# Per-class precision / recall / F1 of the Experiment 2 predictions on the test set.
print(
    "Classification report: ",
    classification_report(y_test, y_pred, target_names=['No', 'Yes']),
    sep="\n",
)

Classification report: 
              precision    recall  f1-score   support

          No       0.00      0.00      0.00      2093
         Yes       0.25      1.00      0.40       707

    accuracy                           0.25      2800
   macro avg       0.13      0.50      0.20      2800
weighted avg       0.06      0.25      0.10      2800

