In [None]:
# Standard library
import re
import string

# Third-party
import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from cycler import cycler
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model
from tensorflow.keras.utils import to_categorical

# Global line defaults for every matplotlib plot in this notebook
mpl.rcParams['lines.linewidth'] = 2
mpl.rcParams['lines.linestyle'] = '--'
# NOTE(review): no plot has been drawn at this point in the notebook, so this
# despine() call has no axes with data to act on -- confirm it is needed.
sns.despine()
# Global plot styling: matplotlib style sheet first, then the seaborn grid style.
# NOTE(review): a style sheet may override rcParams set earlier in this cell -- verify.
plt.style.use("fivethirtyeight")
sns.set_style("darkgrid")

**1. Data Exploration & Preprocessing**

In [None]:
!pip install openpyxl

In [None]:
# Read the data set and show only the first rows (avoids dumping the whole frame)
df = pd.read_excel(r'../input/cyberbullying-bahasa-indonesia/DATASET CYBERBULLYING INSTAGRAM - FINAL.xlsx')
df.head()

In [None]:
# Drop the columns that are not used by the rest of the notebook.
# errors="ignore" keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
df = df.drop(
    columns=['No.', 'Nama Instagram', 'Tanggal Posting', 'Unnamed: 6', 'Unnamed: 7'],
    errors="ignore",
)
df.head()

In [None]:
# Print a summary of the DataFrame via DataFrame.info()
df.info()

In [None]:
# True if at least one cell anywhere in the frame is missing
df.isna().to_numpy().any()

In [None]:
# Split the data set by label.
# The frame is `df` (there is no `analysis_df` in this notebook), and each
# variable now holds the class its name says: cb = bullying, noncb = non-bullying.
df_cb = df[df["Kategori"] == "Bullying"]
df_noncb = df[df["Kategori"] == "Non-bullying"]

In [None]:
# Count how many rows belong to each category
df['Kategori'].value_counts()

In [None]:
# Distribution of the target label.
# The column is passed by keyword: recent seaborn versions no longer accept the
# data vector as the first positional argument of countplot.
plt.figure(figsize=(3,5))
sns.countplot(x="Kategori", data=df, palette="mako")

In [None]:
# Descriptive statistics and histogram of comment length (in words) for the
# non-bullying class.
# .str.len() propagates missing comments as NaN instead of raising like apply(len).
df["Length"] = df.Komentar.str.split().str.len()
non_bullying_length = df.loc[df["Kategori"]=="Non-bullying", "Length"]
plt.figure(figsize=(5,5))
sns.histplot(non_bullying_length,color="g")
plt.title("Distribution of comment length for not_cyberbullying")
display(non_bullying_length.describe())

In [None]:
# Descriptive statistics and histogram of comment length (in words) for the
# bullying class.
# .str.len() propagates missing comments as NaN instead of raising like apply(len).
df["Length"] = df.Komentar.str.split().str.len()
bullying_length = df.loc[df["Kategori"]=="Bullying", "Length"]
plt.figure(figsize=(5,5))
sns.histplot(bullying_length,color="r")
plt.title("Distribution of comment length for bullying")
display(bullying_length.describe())

In [None]:
# Series holding the comment text; every preprocessing step below rebinds it
comments = df["Komentar"]
print(comments.head())

In [None]:
# Case folding
# Lowercase every comment with the pandas Series.str.lower() accessor
comments = comments.str.lower()

print(comments.head())

In [None]:
def remove_special_signs(text):
    """Strip escape residue, non-ASCII characters, mentions, hashtags and URLs.

    Args:
        text: a single comment as a ``str``.

    Returns:
        The cleaned string with runs of whitespace collapsed to single spaces
        (a trailing bare ``http://`` / ``https://`` is replaced by a space).
    """
    # Remove *literal* backslash sequences (the two characters "\" + "t", etc.)
    # and any remaining backslash; real tabs/newlines are collapsed by split() below.
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    # Replace every non-ASCII character (emoji, CJK, ...) with "?"
    text = text.encode('ascii', 'replace').decode('ascii')
    # Remove mentions, hashtags and complete URLs, then collapse whitespace.
    # Raw string: "\w", "\/" and "\S" are invalid escapes in a normal string literal.
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # Remove incomplete URLs (a scheme with nothing after it)
    return text.replace("http://", " ").replace("https://", " ")
                
# Clean every comment element-wise and preview the result
comments = comments.map(remove_special_signs)
print(comments.head())

In [None]:
# Remove numbers
def remove_number(text):
    """Return ``text`` with every run of digits deleted."""
    digit_runs = re.compile(r"\d+")
    return digit_runs.sub("", text)

# Strip digits from every comment and preview the first five rows
comments = comments.map(remove_number)
comments.head()

In [None]:
# Remove punctuation
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a character to None in a translation table deletes it
    drop_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(drop_table)

# Strip punctuation from every comment and preview the first five rows
comments = comments.map(remove_punctuation)
comments.head()

In [None]:
# Remove leading and trailing whitespace
def remove_whitespace_LT(text):
    """Return ``text`` without leading or trailing whitespace."""
    without_leading = text.lstrip()
    return without_leading.rstrip()

# Trim every comment and preview the result
comments = comments.map(remove_whitespace_LT)
print(comments.head())

In [None]:
# Collapse runs of whitespace into a single space
def remove_whitespace_multiple(text):
    """Return ``text`` with every run of whitespace replaced by one space.

    Leading/trailing whitespace runs are also reduced to a single space,
    not removed.
    """
    # Raw string: "\s" is an invalid escape sequence in a normal string literal
    return re.sub(r'\s+',' ',text)

# Collapse repeated whitespace in every comment and preview the result
comments = comments.map(remove_whitespace_multiple)
print(comments.head())

In [None]:
# Remove single-letter words
def remove_singl_char(text):
    """Return ``text`` with every stand-alone ASCII letter deleted.

    Only the letter itself is removed; the whitespace around it is kept.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

# Drop single-letter words from every comment and preview the result
comments = comments.map(remove_singl_char)
print(comments.head())

In [None]:
# Filtering - stopword removal

# Get the Indonesian stopwords shipped with NLTK; the corpus is a separate
# download, so fetch it only when it is not installed yet.
try:
    list_stopwords = stopwords.words('indonesian')
except LookupError:
    nltk.download('stopwords')
    list_stopwords = stopwords.words('indonesian')

# Append additional (informal / slang) stopwords
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 
                       'nyg', 'hehe', 'pen', 'nan', 'loh',
                       '&amp', 'yah'])

# Read the extra stopword file.
# Extending a list with a DataFrame only adds its column labels, so the cell
# values are flattened explicitly instead.
# NOTE(review): the file layout is not visible here; it is read without a header
# so that no word is swallowed as a column name -- confirm against the file.
csv_stopword = pd.read_csv(
    "../input/cyberbullying-bahasa-indonesia/stopwordsID.csv",
    header=None,
    dtype=str,
    keep_default_na=False,
)
list_stopwords.extend(
    word.strip() for word in csv_stopword.to_numpy().ravel() if word.strip()
)

# Convert the list to a set for fast membership tests
list_stopwords = set(list_stopwords)

def stopwords_removal(words):
    """Return the tokens of ``words`` that are not stopwords.

    Args:
        words: either a raw comment string (split on whitespace first) or an
            already tokenized list of words.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    # The comments are still plain strings at this point; iterating a string
    # would yield single characters, so tokenize it first.
    if isinstance(words, str):
        words = words.split()
    return [word for word in words if word not in list_stopwords]

comments = comments.apply(stopwords_removal) 


print(comments.head())

In [None]:
# Normalization: replace abbreviations/slang with their standard form
# NOTE(review): read with pandas defaults (comma separator, first row as header);
# confirm this matches the layout of kamus_singkatan.csv.
normalized_word = pd.read_csv("../input/cyberbullying-bahasa-indonesia/kamus_singkatan.csv")

# First column -> second column; the first occurrence of a term wins.
# .iloc selects by position explicitly (row[0] on a label-indexed row is
# deprecated positional fallback in pandas 2.x).
normalized_word_dict = {}

for short_form, standard_form in zip(normalized_word.iloc[:, 0], normalized_word.iloc[:, 1]):
    normalized_word_dict.setdefault(short_form, standard_form)

def normalized_term(document):
    """Map each term of ``document`` to its normalized form.

    Args:
        document: an iterable of terms.

    Returns:
        A list of the same length; terms missing from the dictionary are
        kept unchanged.
    """
    return [normalized_word_dict.get(term, term) for term in document]

comments = comments.apply(normalized_term)

comments.head(10)

In [None]:
!pip install Sastrawi
!pip install Swifter

In [None]:
# Stemming with Sastrawi -- this entire cell is currently disabled (commented out)
# import Sastrawi package
# from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# import swifter


# create stemmer
# factory = StemmerFactory()
# stemmer = factory.create_stemmer()

# stemmed
#def stemmed_wrapper(term):
#    return stemmer.stem(term)

#term_dict = {}

#for document in comments:
#    for term in document:
#        if term not in term_dict:
#            term_dict[term] = ' '
            
#print(len(term_dict))

#for term in term_dict:
#    term_dict[term] = stemmed_wrapper(term)
#    print(term,":" ,term_dict[term])
    
#print(term_dict)

# apply stemmed term to dataframe
#def get_stemmed_term(document):
#    return [term_dict[term] for term in document]

#comments = comments.swifter.apply(get_stemmed_term)
#print(comments)

In [None]:
# Turn the categorical target into numeric form:
# LabelEncoder maps each category to an integer, to_categorical one-hot encodes it.
label_encoder = LabelEncoder()
y = to_categorical(label_encoder.fit_transform(df["Kategori"]))
print(y)

In [None]:
# Count the number of rows per category
df["Kategori"].value_counts()

In [None]:
# Tokenization: build the vocabulary from the preprocessed comments
tokenizer = Tokenizer()
tokenizer.fit_on_texts(comments)

In [None]:
# Show a sample of the index -> word mapping (first 20 entries only, instead of
# printing the entire vocabulary)
print(dict(list(tokenizer.index_word.items())[:20]))

In [None]:
# Word -> integer index mapping built by the tokenizer
word2vec=tokenizer.word_index
# V = number of entries in that mapping
V=len(word2vec)
print('Dataset has %s number of independent tokens' %V)

In [None]:
# fit_on_texts() built the mapping between words and their assigned integers;
# that mapping is stored in the tokenizer.word_index dictionary.
# Here every word is replaced by its assigned integer.
encoded_comments = tokenizer.texts_to_sequences(comments)
# Preview the first few sequences only instead of printing all of them
print(encoded_comments[:5])

In [None]:
# Comments differ in length, so pad at the end (padding="post") up to maxlen=33
X = padded_sequence = pad_sequences(encoded_comments, maxlen=33, padding="post")

**2. Membangun Text Classifier**

In [None]:
# Report the array shapes of the features X and the targets y
print('Shape of X is ', X.shape)
print('Shape of y is', y.shape)

In [None]:
# Split into two halves (test_size=0.50), stratified on the label, with a fixed
# seed; train_test_split is imported in the notebook's import cell.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state = 10, test_size = 0.50, stratify = y)

In [None]:
# The inputs are padded sequences of integer token ids that the Embedding layer
# uses as lookup indices. They must stay integers: dividing by 255 (an image
# pixel normalization) turns them into fractions and destroys the word indices.
# A plain cast is also idempotent, unlike the previous in-place division.
x_train = x_train.astype('int32')
x_test = x_test.astype('int32')

In [None]:
# Build the CNN + LSTM text classifier
# NOTE(review): the embedding width is set to the vocabulary size V -- confirm
# this is intended rather than a small fixed width.
embedding_vector_length = V
# +1 because Keras Tokenizer word indices start at 1 and 0 is the padding value
vocab_size = len(tokenizer.word_index) + 1
# The model input length must match the padded sequences in X (the previous
# hard-coded input_length=200 contradicted the maxlen used for padding).
sequence_length = X.shape[1]

model = tf.keras.models.Sequential()
model.add(tf.keras.Input(shape=(sequence_length,)))
# Embedding layer: turns each word index into a dense vector
model.add(Embedding(vocab_size, embedding_vector_length))
# 1-D convolution: 32 filters, kernel size 2
model.add(tf.keras.layers.Conv1D(32,2,activation="relu"))
model.add(tf.keras.layers.MaxPooling1D(5))
# NOTE(review): a second pooling layer directly follows the first, and a third
# one follows the Dense layer below -- confirm this architecture is intended.
model.add(MaxPooling1D(2,2))
# Dropout is used to reduce overfitting
model.add(Dropout(0.2))
model.add(Dense(64, activation = "relu"))
model.add(Dropout(0.5))
model.add(MaxPooling1D(2,2))
# LSTM over the pooled feature sequence
model.add(LSTM(100, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
# Final classification layer: two classes, softmax
model.add(Dense(2, activation='softmax')) 
# Compile the model.
# The targets are one-hot encoded and the output is a 2-unit softmax, so the
# loss is categorical cross-entropy (as the original notes of this cell state).
# Optimizers previously tried here: sgd, rmsprop, adadelta, ftrl.
# Adam was chosen because it is popular in deep learning and reaches good
# results quickly; it has also been applied to IMDB sentiment analysis.
# Source: https://medium.com/@saritilawah9/adam-optimizer-80cc267522af
model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])

In [None]:
# Render a diagram of the model architecture
plot_model(model)

In [None]:
# Train the model once.
# The previous ten consecutive fit() cells (10, 20, ..., 100 epochs) each
# continued training the *same* model, so a top-to-bottom run accumulated all of
# those epochs and only the last history survived in `hist`. A single run with
# an explicit epoch count keeps the result reproducible under Run All.
# NOTE(review): the test split is also used as validation data here.
EPOCHS = 100
hist = model.fit(x_train, y_train, epochs=EPOCHS, validation_data=(x_test, y_test), shuffle=True, verbose = 2)

In [None]:
# Plot training/validation accuracy and loss from the history stored in `hist`
acc = hist.history['accuracy']
val_acc = hist.history['val_accuracy']
loss = hist.history['loss']
val_loss = hist.history['val_loss']
epochs = range(1, len(acc) + 1)

# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in Matplotlib 3.6;
# use whichever name this Matplotlib installation provides.
whitegrid_style = (
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
plt.style.use([whitegrid_style])
mpl.rcParams['axes.prop_cycle'] = cycler(color=['r', 'b'])

# Accuracy curves
plt.figure()
plt.plot(epochs, acc, 'r', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and Validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

# Loss curves
plt.figure()
plt.plot(epochs, loss, 'r', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Evaluate on the test split.
# model.evaluate returns the loss followed by the compiled metric, so `accuracy`
# holds [loss, accuracy] (the report cell reads accuracy[0] and accuracy[1]).
accuracy = model.evaluate(x_test, y_test)
print("Loss and accuracy of the model are: ", accuracy)

In [None]:
# Evaluation summary: accuracy as a percentage, loss as a plain value
# (the loss is not a percentage, so it is printed without a '%' sign)
print("Model Performance of CNN (Test Accuracy)")
print('Accuracy: {:0.2f}%\nLoss: {:0.2f}\n'.format(accuracy[1]*100, accuracy[0]))