In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load the dataset :

In [2]:
import pandas as pd

file_path = "/content/drive/MyDrive/DistilBERT-7-Mei/combined_dataset.csv"
df = pd.read_csv(file_path)
print(df.head())

   Unnamed: 0         Label  \
0           0  Non-bullying   
1           1  Non-bullying   
2           2      Bullying   
3           3  Non-bullying   
4           4  Non-bullying   

                                          clean_text  \
0       kaka tidur yaa sudah pagi tidak boleh capek2   
1                    makan nasi padang saja badannya   
2                         suka cukur jembut manggung   
3  hai kak isyana ngefans sekali kak isyana suka ...   
4             manusia bidadari sih herann deh cantik   

                                              String  encoded_label  
0        "Kaka tidur yaa, udah pagi, gaboleh capek2"            1.0  
1            "makan nasi padang aja begini badannya"            1.0  
2  "yang aku suka dari dia adalah selalu cukur je...            0.0  
3  "Hai kak Isyana aku ngefans banget sama kak Is...            1.0  
4  "Manusia apa bidadari sih herann deh cantik te...            1.0  


In [3]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,Label,clean_text,String,encoded_label
0,0,Non-bullying,kaka tidur yaa sudah pagi tidak boleh capek2,"""Kaka tidur yaa, udah pagi, gaboleh capek2""",1.0
1,1,Non-bullying,makan nasi padang saja badannya,"""makan nasi padang aja begini badannya""",1.0
2,2,Bullying,suka cukur jembut manggung,"""yang aku suka dari dia adalah selalu cukur je...",0.0
3,3,Non-bullying,hai kak isyana ngefans sekali kak isyana suka ...,"""Hai kak Isyana aku ngefans banget sama kak Is...",1.0
4,4,Non-bullying,manusia bidadari sih herann deh cantik,"""Manusia apa bidadari sih herann deh cantik te...",1.0
5,5,Bullying,ayu kinantii isyan sekarang berubah ya baju ny...,"""@ayu.kinantii isyan skrg berubah ya:( baju ny...",0.0
6,6,Non-bullying,gemesnya isyan mirip tango berlapis lapis ciaaaa,"""Gemesnya isyan kayak tango, berlapis lapis ci...",1.0
7,7,Bullying,jelek saja anaknya ayahnya cakep2,"""Makin jelek aja anaknya, padahal ibu ayahnya ...",0.0
8,8,Bullying,anaknya mirip sudah tua begitu ya mukanya kart...,"""Kok anaknya kayak udah tua gitu ya mukanya kk...",0.0
9,9,Bullying,muka anak nya ko tua sekali yaa tidak ngegemes...,"""Muka anak nya ko tua banget yaa.. GK ngegemes...",0.0


# Persiapan Lingkungan

In [4]:
!pip install gensim tensorflow numpy pandas scikit-learn




# Import Library

In [5]:
import pandas as pd
import numpy as np
import gensim
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report


# Split Data Train-Test

In [6]:
from sklearn.preprocessing import LabelEncoder

X = df['clean_text'].astype(str)
y = df['encoded_label']

# Initialize and fit LabelEncoder to get class names from original labels
le = LabelEncoder()
le.fit(df['Label'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenisasi & Padding

In [7]:
max_words = 20000   # jumlah kata unik maksimal
max_len = 50        # panjang maksimum sequence (padding)

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)


# Download & Load Pre-trained FastText Bahasa Indonesia

FastText menyediakan pre-trained embeddings untuk >150 bahasa.
Untuk Bahasa Indonesia: cc.id.300.vec.gz

In [8]:
!wget -c https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.vec.gz
!gunzip cc.id.300.vec.gz


--2025-09-17 02:54:52--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.163.189.108, 3.163.189.96, 3.163.189.14, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.163.189.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1227018698 (1.1G) [binary/octet-stream]
Saving to: ‘cc.id.300.vec.gz’


2025-09-17 02:55:13 (56.5 MB/s) - ‘cc.id.300.vec.gz’ saved [1227018698/1227018698]



# Load ke Gensim:

In [9]:
from gensim.models import KeyedVectors

fasttext_model = KeyedVectors.load_word2vec_format('cc.id.300.vec', binary=False)
embedding_dim = 300  # ukuran embedding dari fastText


# Buat Embedding Matrix

In [10]:
word_index = tokenizer.word_index
num_words = min(max_words, len(word_index) + 1)

embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words:
        continue
    if word in fasttext_model:
        embedding_matrix[i] = fasttext_model[word]


# Bangun Model Bi-GRU

In [11]:
model = Sequential()
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    input_length=max_len,
                    trainable=False))  # pre-trained tidak di-train ulang
model.add(Bidirectional(GRU(128, return_sequences=False)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # binary classification

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




# Training Model

In [12]:
history = model.fit(X_train_pad, y_train,
                    validation_split=0.2,
                    epochs=10,
                    batch_size=32,
                    verbose=1)


Epoch 1/10
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 22ms/step - accuracy: 0.6116 - loss: 0.6563 - val_accuracy: 0.8036 - val_loss: 0.4400
Epoch 2/10
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8211 - loss: 0.4185 - val_accuracy: 0.7976 - val_loss: 0.4144
Epoch 3/10
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.8623 - loss: 0.3258 - val_accuracy: 0.8399 - val_loss: 0.3797
Epoch 4/10
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.8798 - loss: 0.3015 - val_accuracy: 0.8278 - val_loss: 0.3979
Epoch 5/10
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.8860 - loss: 0.2822 - val_accuracy: 0.8369 - val_loss: 0.3935
Epoch 6/10
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9095 - loss: 0.2378 - val_accuracy: 0.8489 - val_loss: 0.3884
Epoch 7/10
[1m42/42[0m [32m━━━━

# Evaluasi Model

In [13]:
from sklearn.metrics import classification_report

y_pred = (model.predict(X_test_pad) > 0.5).astype("int32")

# Get the class names for labels 0 and 1
class_names = le.inverse_transform([0, 1])

print(classification_report(y_test, y_pred, target_names=class_names))

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step
              precision    recall  f1-score   support

    Bullying       0.83      0.92      0.87       219
Non-bullying       0.90      0.79      0.84       195

    accuracy                           0.86       414
   macro avg       0.87      0.86      0.86       414
weighted avg       0.86      0.86      0.86       414



# Menyimpan Model dan Objek Pendukung (Tokenizer)

In [14]:
import pickle
from tensorflow.keras.models import save_model

# 1. Save model Bi-GRU
model.save("bi_gru_cyberbullying.h5")  # format HDF5

# 2. Save tokenizer atau word_to_index
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)  # jika pakai tokenizer




In [34]:
import re
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import load_model

# ==== 1. Function Preprocessing Same as Training Data ====
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove punctuation & numbers
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    return text

# Reusing the preprocess_text function from cell pNmyNKxR6Nnu
def preprocess_text(text, tokenizer, max_len):
    text = clean_text(text)
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')
    return padded


# ==== 2. Load Tokenizer & Trained Model ====
import pickle
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Load the model with the correct filename
model = load_model('bi_gru_cyberbullying.h5')

# Ensure maxlen is the same as during training
MAXLEN = 50 # Corrected from 100 to 50

# Reusing the predict_text function from cell OO3dpisz6U7F
def predict_text(model, tokenizer, text, max_len):
    processed = preprocess_text(text, tokenizer, max_len)
    # Assuming the model outputs probabilities for binary classification
    # and the output shape is (batch_size, 1) with sigmoid activation
    prob = model.predict(processed)
    # The prediction should be based on the probability threshold (e.g., 0.5)
    pred = (prob > 0.5).astype(int)[0][0]
    return pred, prob.flatten()[0] # Return single probability for binary case


# ==== 3. Manual Testing with Full Pipeline ====
# Using the predict_text function with the loaded model, tokenizer, and correct max_len
test_text1 = "astaghfirullah seram ya bun ular jelmaan makhluk halus ya bun"
pred1, prob1 = predict_text(model, tokenizer, test_text1, MAXLEN)

print(f"Teks asli: {test_text1}")
print(f"Teks bersih: {clean_text(test_text1)}")
print(f"Prediksi: {pred1} (0=non-cyberbully, 1=cyberbully)")
print(f"Probabilitas: {prob1}")

print("-" * 20)

test_text2 = "Tolol banget lu anjing"
pred2, prob2 = predict_text(model, tokenizer, test_text2, MAXLEN)

print(f"Teks asli: {test_text2}")
print(f"Teks bersih: {clean_text(test_text2)}")
print(f"Prediksi: {pred2} (0=non-cyberbully, 1=cyberbully)")
print(f"Probabilitas: {prob2}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step
Teks asli: astaghfirullah seram ya bun ular jelmaan makhluk halus ya bun
Teks bersih: astaghfirullah seram ya bun ular jelmaan makhluk halus ya bun
Prediksi: 1 (0=non-cyberbully, 1=cyberbully)
Probabilitas: 0.5164521932601929
--------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Teks asli: Tolol banget lu anjing
Teks bersih: tolol banget lu anjing
Prediksi: 1 (0=non-cyberbully, 1=cyberbully)
Probabilitas: 0.5312594771385193


# Load Model dan Tokenizer untuk Prediksi Manual

In [16]:
import re
import pickle
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ===== Load Model dan Tokenizer =====
model = load_model("bi_gru_cyberbullying.h5")

with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

MAX_LEN = 50  # Harus sama seperti saat training

# ===== Preprocessing =====
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # hanya huruf
    text = re.sub(r"\s+", " ", text).strip()  # hapus spasi ekstra
    return text

def preprocess_text(text, tokenizer, max_len):
    text = clean_text(text)
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_len, padding='pre', truncating='pre')  # samakan dengan training
    return padded

# ===== Predict Function =====
def predict_text(model, tokenizer, text, max_len):
    processed = preprocess_text(text, tokenizer, max_len)
    prob = model.predict(processed)[0][0]  # output sigmoid, shape=(1,)
    pred = 1 if prob >= 0.5 else 0        # threshold 0.5
    return pred, prob

# ===== Contoh Prediksi =====
test_texts = [
    "jelek saja anaknya ayahnya cakep2",
    "kamu bodoh banget",
    "muka anak nya ko tua sekali yaa tidak ngegemes.."
]

for t in test_texts:
    pred, prob = predict_text(model, tokenizer, t, MAX_LEN)
    print(f"Teks asli: {t}")
    print(f"Teks bersih: {clean_text(t)}")
    print(f"Prediksi: {pred} (0=non-cyberbully, 1=cyberbully)")
    print(f"Probabilitas: {prob:.4f}")
    print("-"*30)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 281ms/step
Teks asli: jelek saja anaknya ayahnya cakep2
Teks bersih: jelek saja anaknya ayahnya cakep
Prediksi: 0 (0=non-cyberbully, 1=cyberbully)
Probabilitas: 0.0311
------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
Teks asli: kamu bodoh banget
Teks bersih: kamu bodoh banget
Prediksi: 0 (0=non-cyberbully, 1=cyberbully)
Probabilitas: 0.0119
------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
Teks asli: muka anak nya ko tua sekali yaa tidak ngegemes..
Teks bersih: muka anak nya ko tua sekali yaa tidak ngegemes
Prediksi: 0 (0=non-cyberbully, 1=cyberbully)
Probabilitas: 0.0228
------------------------------


In [17]:
from sklearn.metrics import confusion_matrix
y_pred_train = (model.predict(X_train_pad) > 0.5).astype(int)
print(confusion_matrix(y_train, y_pred_train))



[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
[[892  25]
 [ 81 654]]


In [18]:
# # 1. Set konfigurasi Git global
# !git config --global user.email "immanuel.leonsalomo@gmail.com"
# !git config --global user.name "LeonsMetanoia"

# # 2. Clone repo teman
# !git clone https://github.com/WilliamAxelC/Indonesian-Cyberbullying-Detection-with-Distilbert.git

# # 3. Copy notebook .ipynb kamu ke dalam folder repo
# import shutil
# import os

# # Get the path of the current notebook
# # In Colab, __file__ might not work as expected. A common way is to use the notebook path from the environment.
# # However, directly getting the current notebook's path in a robust way within a script cell is tricky.
# # Assuming the notebook is in the default content directory or you know its name:
# notebook_name = "Bi-GRU-Updated August.ipynb" # Replace with your actual notebook name if different
# notebook_path = f"/content/{notebook_name}"

# # Check if the notebook exists before copying
# if os.path.exists(notebook_path):
#     shutil.copy(notebook_path, f"/content/Indonesian-Cyberbullying-Detection-with-Distilbert/{notebook_name}")
#     print(f"Copied {notebook_name} to the repository folder.")
# else:
#     # If the common path doesn't work, you might need to manually specify or find the path.
#     # For now, we'll print an error and stop.
#     print(f"Error: Notebook file not found at {notebook_path}. Please check the notebook name and path.")
#     # Exit or handle the error appropriately if the file is critical.
#     # For this example, we'll continue to the next steps, but the copy will have failed.


# # 4. Commit dan push perubahan
# # Make sure the repository was cloned successfully before changing directory
# repo_dir = "/content/Indonesian-Cyberbullying-Detection-with-Distilbert"
# if os.path.exists(repo_dir):
#     %cd {repo_dir}

#     # Add and commit if the notebook was successfully copied or other changes exist
#     # Check if there are changes to add before adding and committing
#     git_status_output = !git status --porcelain
#     if git_status_output:
#         !git add .
#         !git commit -m "Add notebook from Colab"
#         print("Changes committed.")
#     else:
#         print("No changes to commit.")


#     # 5. Push to GitHub with token authentication
#     from google.colab import userdata
#     try:
#         github_token = userdata.get('GITHUB_TOKEN')
#         if github_token:
#              # Use the token in the push URL
#             !git push https://LeonsMetanoia:{github_token}@github.com/WilliamAxelC/Indonesian-Cyberbullying-Detection-with-Distilbert.git main
#             print("Push successful!")
#         else:
#             print("Error: GITHUB_TOKEN not found in Colab secrets.")
#             print("Please add your GitHub Personal Access Token to Colab secrets with the name 'GITHUB_TOKEN'.")
#     except Exception as e:
#         print(f"An error occurred during the push: {e}")

# else:
#     print(f"Error: Repository directory not found at {repo_dir}. Cloning might have failed.")

In [35]:
!git clone https://github.com/WilliamAxelC/Indonesian-Cyberbullying-Detection-with-Distilbert.git


Cloning into 'Indonesian-Cyberbullying-Detection-with-Distilbert'...
remote: Enumerating objects: 80, done.[K
remote: Counting objects: 100% (80/80), done.[K
remote: Compressing objects: 100% (52/52), done.[K
remote: Total 80 (delta 38), reused 69 (delta 27), pack-reused 0 (from 0)[K
Receiving objects: 100% (80/80), 11.16 MiB | 12.61 MiB/s, done.
Resolving deltas: 100% (38/38), done.


In [41]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
# # Simpan notebook aktif ke file .ipynb
# from google.colab import drive
# import IPython

# # notebook_name = "Bi-GRU-Updated August.ipynb"
# save_path = f"/content/{notebook_name}"

# # Simpan manual notebook ke file .ipynb
# IPython.notebook.export_notebook(save_path)
# print(f"✅ Notebook disimpan ke {save_path}")


In [20]:
# # =========================================
# # 1. Set konfigurasi Git global
# # =========================================
# !git config --global user.email "immanuel.leonsalomo@gmail.com"
# !git config --global user.name "LeonsMetanoia"

# # =========================================
# # 2. Clone repo teman
# # =========================================
# import os

# repo_dir_name = "Indonesian-Cyberbullying-Detection-with-Distilbert"
# repo_dir_path = f"/content/{repo_dir_name}"

# if not os.path.exists(repo_dir_path):
#     !git clone https://github.com/WilliamAxelC/Indonesian-Cyberbullying-Detection-with-Distilbert.git
#     print(f"✅ Cloned repository to {repo_dir_path}")
# else:
#     print(f"⚠️ Repository already exists at {repo_dir_path}. Skipping clone.")

# # =========================================
# # 3. Copy notebook ke dalam repo
# # =========================================
# import shutil

# notebook_name = "Bi-GRU-Updated August.ipynb"  # nama notebook di Colab
# src_path = f"/content/{notebook_name}"

# # Biar aman untuk GitHub → ganti spasi dengan "-"
# dst_name = notebook_name.replace(" ", "-")
# dst_path = f"{repo_dir_path}/{dst_name}"

# if os.path.exists(src_path):
#     shutil.copy(src_path, dst_path)
#     print(f"✅ Copied {notebook_name} → {dst_name} in repo folder.")
# else:
#     raise FileNotFoundError(f"❌ Notebook tidak ditemukan di {src_path}. Pastikan nama file benar.")

# # =========================================
# # 4. Commit perubahan
# # =========================================
# %cd {repo_dir_path}

# git_status_output = !git status --porcelain
# if git_status_output:
#     !git add .
#     !git commit -m "Add {dst_name}"  # commit message pakai nama file baru
#     print("✅ Changes committed.")
# else:
#     print("⚠️ Tidak ada perubahan untuk di-commit.")

# # =========================================
# # 5. Push ke GitHub pakai token dari Colab Secrets
# # =========================================
# from google.colab import userdata

# github_token = userdata.get('GITHUB_TOKEN')
# if github_token:
#     push_url = f"https://LeonsMetanoia:{github_token}@github.com/WilliamAxelC/Indonesian-Cyberbullying-Detection-with-Distilbert.git"
#     !git push {push_url} main
#     print("✅ Push berhasil!")
# else:
#     print("❌ Error: GITHUB_TOKEN tidak ditemukan di Colab secrets.")
#     print("Tambahkan token kamu ke Colab (Menu: Runtime → RunTime settings → Secrets).")
