In [18]:
#Import Library
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import requests

In [21]:
#global variable
NUM_WORDS = 1000
OOV_TOKEN = "<OOV>"
PADDING = 'post'
MAXLEN = 120
EMBEDDING_DIM = 16

In [3]:
#Retrieve DS from Github
dataset_link = 'https://raw.githubusercontent.com/Willie29/capstone-C23-PS056/main/ML/Dataset_Combined.csv'
response = requests.get(dataset_link)

In [4]:
data = pd.read_csv('https://raw.githubusercontent.com/Willie29/capstone-C23-PS056/main/ML/Dataset_Combined.csv')
# Identify Unnamed columns
unnamed_columns = [col for col in data.columns if 'Unnamed' in col]

# Drop Unnamed columns
data = data.drop(unnamed_columns, axis=1)

print(data.head())
print("\n")
print(data.tail())

    Label                                              Tweet
0  Non_HS  Fadli Zon Minta Mendagri Segera Menonaktifkan ...
1  Non_HS  Mereka terus melukai aksi dalam rangka memenja...
2  Non_HS  bagaimana gurbernur melakukan kekerasan peremp...
3  Non_HS  Ahmad Dhani Tak Puas Debat Pilkada, Masalah Ja...
4  Non_HS                Waspada KTP palsu.....kawal PILKADA


     Label                                       Tweet
2703    HS                          Dasar murahan kamu
2704    HS     kuliah aja tinggi, tapi otak di dengkul
2705    HS                    semoga anda masuk neraka
2706    HS                               Matilo anjing
2707    HS  Orang timur kurang pintar dari orang barat


In [5]:
data['Label'].value_counts()

Label
Non_HS    1354
HS        1354
Name: count, dtype: int64

In [6]:
def parse_data(data):
    labels = []
    tweets = []
    
    for _, row in data.iterrows():
        cond = (0 if row['Label'] == "HS" else 1)
        labels.append(cond)
        tweets.append(row['Tweet'])
            
    return labels, tweets

# Assuming 'data' is your DataFrame
labels, tweets = parse_data(data)

In [7]:
print(f"Example number in dataset is {len(tweets)} examples\n")

print(f"2nd example:\n{tweets[1]}\n")
print(f"Last example:\n{tweets[-1]}")

Example number in dataset is 2708 examples

2nd example:
Mereka terus melukai aksi dalam rangka memenjarakan Ahok atau Ahok gagal dalam Pilkada.

Last example:
Orang timur kurang pintar dari orang barat


In [8]:
#90-10 ratio train-test
def train_test_split(labels,tweets):
    train_size = int(len(tweets) * 0.9)

    train_labels = labels[:train_size]
    train_tweets = tweets[:train_size]

    test_labels = labels[train_size:]
    test_tweets = tweets[train_size:]
    
    return train_labels, train_tweets, test_labels, test_tweets

In [9]:
train_labels, train_tweets, test_labels, test_tweets = train_test_split(labels, tweets)

print(f" {len(train_labels)} sentences for training.")
print(f" {len(train_tweets)} labels for training.")
print(f" {len(test_labels)} sentences for validation.")
print(f" {len(test_tweets)} labels for validation.")

 2437 sentences for training.
 2437 labels for training.
 271 sentences for validation.
 271 labels for validation.


In [10]:
print(train_tweets[:3])
print(train_labels[:3])
print(test_tweets[:3])
print(test_labels[:3])

['Fadli Zon Minta Mendagri Segera Menonaktifkan Ahok Jadi Gubernur DKI', 'Mereka terus melukai aksi dalam rangka memenjarakan Ahok atau Ahok gagal dalam Pilkada.', 'bagaimana gurbernur melakukan kekerasan perempuan? Buktinya banyak ibu2 mau foto bareng #DebatFinalPilkadaJKT']
[1, 1, 1]
['gara gara ini negara kita diketawain negara sebelah aduuuh emang iq yang buat tinggi2 yaa ngga kuat gue', 'kasian para bapak bangsa yang merumuskan pancasila mereka pasti sedih klo tau ternyata generasi pancasila nya spt itu', 'mereka gak kaya seperti lu yang gak pernah susah dapat makan']
[0, 0, 0]


In [11]:
#tokenizer function
def fit_tokenizer(train_sentences, num_words, oov_token):
    tokenizer = Tokenizer(num_words = num_words, oov_token = oov_token)
    tokenizer.fit_on_texts(train_sentences)
    return tokenizer


In [12]:
#buat test doang
def lowercase(list_sentence):
    lower_sentence = list_sentence
    for i in range(len(list_sentence)):
        lower_sentence[i] = lower_sentence[i].lower()
    return lower_sentence

In [15]:
#tokenize sentence
#test_tweets1 = test_tweets[0].lower()
#test_tweets1 = test_tweets1.lower()
lower_train_tweets = lowercase(train_tweets)
tokenizer = fit_tokenizer(lower_train_tweets, NUM_WORDS, OOV_TOKEN)
word_index = tokenizer.word_index
print(f"Vocabulary contains {len(word_index)} words\n")
print("<OOV> token included in vocabulary" if "<OOV>" in word_index else "<OOV> token NOT included in vocabulary")

Vocabulary contains 7765 words

<OOV> token included in vocabulary


In [14]:
#seq and padding function
def seq_and_pad(sentences, tokenizer, padding, maxlen):
    sequences = tokenizer.texts_to_sequences(sentences)
    padded_sequences = pad_sequences(sequences, maxlen = maxlen, padding = padding)
    return padded_sequences

In [19]:
#seq and padding sentence
lower_val_tweets = lowercase(test_tweets)
train_padded_seq = seq_and_pad(lower_train_tweets, tokenizer, PADDING, MAXLEN)
val_padded_seq = seq_and_pad(lower_val_tweets, tokenizer, PADDING, MAXLEN)
print(f"Padded training sequences have shape: {train_padded_seq.shape}\n")
print(f"Padded validation sequences have shape: {val_padded_seq.shape}")

Padded training sequences have shape: (2437, 120)

Padded validation sequences have shape: (271, 120)


In [23]:
#Model Structure
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(NUM_WORDS, EMBEDDING_DIM, input_length = MAXLEN),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(units = 32, activation= 'relu'),
    tf.keras.layers.Dense(units = 2, activation='softmax')
])

In [26]:
print(type(test_labels))

<class 'list'>


In [25]:
#Compile Model
model.compile(loss='binary_crossentropy',optimizer=tf.keras.optimizers.Adam() ,metrics=['accuracy'])

history = model.fit(train_padded_seq, train_labels, epochs=30, validation_data=(val_padded_seq, test_labels))

ValueError: Failed to find data adapter that can handle input: <class 'numpy.ndarray'>, (<class 'list'> containing values of types {"<class 'int'>"})

In [None]:
#gak jalan gara gara labelnya list bukan numpy array, jadi antara di tokenize jadi otomatis keubah atau diubah ke np array secara manual