In [1]:
from ipywidgets import widgets, Layout, Label
from IPython.display import display, clear_output
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re
import string
import unicodedata
import nltk

from termcolor import colored

In [2]:
def remove_url(str):
    """Delete every URL-like span from the text.

    The pattern (case-insensitive) recognises links that start with
    ``http://`` / ``https://``, with ``www`` (optionally followed by up to
    three digits) and a dot, or with a bare ``domain.tld/`` prefix.
    Matches are removed outright; nothing is inserted in their place.
    """
    url_pattern = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'
    return re.sub(url_pattern, '', str)
    
def remove_digit(str):
    """Replace each digit-terminated run with a single space.

    A match ends on a digit and also swallows any directly preceding
    characters that are not lowercase ``a-z`` or a space (dots, other
    digits, punctuation, uppercase letters, ...).
    """
    digit_run = re.compile(r'[^a-z ]*([.0-9])*\d')
    return digit_run.sub(' ', str)

def remove_non_ascii(str):
    """Return the text reduced to ASCII characters only.

    The text is NFKD-normalized first, so accented letters are split into
    a base letter plus combining marks; everything that cannot be encoded
    as ASCII is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', str)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def remove_twitter_char(str):
    """Replace Twitter-specific tokens with a single space.

    Handles, in order:
      * mentions  - ``@`` followed by word characters;
      * hashtags  - one or more ``#`` followed by word characters, optionally
        joined by apostrophes/hyphens (single-character tags such as ``#a``
        are covered too);
      * the retweet marker ``RT`` - only as a standalone, upper-case word, so
        that ordinary words containing "RT" (e.g. "SMART") are left intact.

    NOTE(review): if the training corpus was cleaned with the earlier
    substring-based "RT" removal, re-clean it with this version so the
    tokenizer vocabulary stays consistent - confirm.

    Args:
        str: raw tweet text.

    Returns:
        The text with each mention, hashtag and retweet marker replaced by
        one space (surrounding whitespace is not collapsed here).
    """
    # mention
    str = re.sub(r'@\w+', ' ', str)
    # hashtag: must start and end on a word character
    str = re.sub(r"\#+\w+(?:['\-]+\w+)*", ' ', str)
    # retweet marker; word boundaries keep "RT" inside other words untouched
    str = re.sub(r'\bRT\b', ' ', str)

    return str

def remove_punctuation(str):
    """Swap every character that is neither whitespace nor a word character
    (letter, digit, underscore) for a single space."""
    non_word_non_space = re.compile(r'[^\s\w]')
    return non_word_non_space.sub(' ', str)

def remove_multi_space(str):
    """Collapse every run of whitespace into a single space.

    Tabs and newlines count as whitespace. Leading/trailing runs are
    collapsed to one space, not stripped.

    Args:
        str: input text.

    Returns:
        The text with each whitespace run replaced by exactly one space.
    """
    # Raw string: a plain '\s' literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    return re.sub(r'\s+', ' ', str)

def casefolding(str):
    """Lower-case the text and normalise its spacing.

    Whitespace runs become single spaces and leading/trailing whitespace
    is removed.
    """
    lowered_words = str.lower().split()
    return ' '.join(lowered_words)

def remove_repeated_character(str):
    """Shrink any character repeated three or more times in a row down to a
    single occurrence; doubled characters are left alone."""
    triple_or_more = re.compile(r'(.)\1{2,}')
    return triple_or_more.sub(r'\1', str)

# Slang dictionaries already loaded in this session, keyed by CSV path.
_SLANG_DICT_CACHE = {}


def _load_slang_dict(path):
    """Return the slang -> replacement mapping stored in the CSV at ``path``.

    The CSV is read without a header; the first column is the slang word and
    the second column its replacement. The parsed mapping is cached per
    path, so later edits to the file are not picked up until the cache entry
    is removed (or the kernel restarted).
    """
    if path not in _SLANG_DICT_CACHE:
        raw = pd.read_csv(path, sep=',', header=None)
        _SLANG_DICT_CACHE[path] = dict(zip(raw.iloc[:, 0], raw.iloc[:, 1]))
    return _SLANG_DICT_CACHE[path]


def normalize_slang_word(str, slang_path='data/add/slang_word_list.csv'):
    """Replace slang tokens with the replacement listed in the slang CSV.

    The text is split on single spaces and every token is looked up exactly
    once, so a replacement is never itself re-mapped by another dictionary
    entry. Tokens without an entry are kept unchanged.

    Args:
        str: text to normalise; tokens are separated by single spaces.
        slang_path: path of the two-column, header-less slang CSV.

    Returns:
        The tokens, after substitution, joined back with single spaces.
    """
    slang_word_dict = _load_slang_dict(slang_path)
    return ' '.join(slang_word_dict.get(token, token) for token in str.split(' '))

def remove_laugh(str):
    """Replace stand-alone laughter tokens with a single space.

    Recognised forms (lower-case only):
      * "ha"/"he"/"hue" repetitions, optionally preceded by h/a/e characters
        and followed by one "h" (e.g. "haha", "hehe", "ahaha");
      * "lol" variants such as "lool" or "lolol";
      * "wk" repetitions with optional surrounding w/k (e.g. "wkwkwk").

    All alternatives sit inside one group bounded by word boundaries on both
    sides, so only whole tokens are removed; words that merely begin with a
    laugh-like syllable ("hello", "harga") are left untouched.

    NOTE(review): if the training corpus was cleaned with the earlier,
    unanchored pattern, re-clean it with this version so the tokenizer
    vocabulary stays consistent - confirm.

    Args:
        str: input text.

    Returns:
        The text with each laughter token replaced by one space.
    """
    laugh_pattern = r"\b(?:[hae]*(?:ha|he|hue)+h?|(?:l+o+)+l+|[wk]*(?:wk)+[wk]?)\b"
    return re.sub(laugh_pattern, ' ', str)

In [3]:
def preprocessing(str):
    """Run the full text-cleaning pipeline on one piece of text.

    The cleaning steps are applied in the order listed below; each step
    receives the output of the previous one.
    """
    pipeline = (
        remove_url,
        remove_twitter_char,
        remove_digit,
        remove_non_ascii,
        remove_punctuation,
        remove_laugh,
        remove_multi_space,
        remove_repeated_character,
        casefolding,
        normalize_slang_word,
    )

    cleaned = str
    for step in pipeline:
        cleaned = step(cleaned)

    return cleaned

In [4]:
# Load the labelled dataset; empty strings are turned into NaN so that
# dropna() removes every row containing an empty cell.
train = (
    pd.read_excel("data/label/combineSSL1.xlsx")
    .replace('', np.nan)
    .dropna()  # make sure no empty values remain
)

In [5]:
# Build the vocabulary from the tweets in the labelled dataset.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['tweet'].values)
# Length, in whitespace-separated words, of the longest tweet.
maxlen = max(len(tweet.split()) for tweet in train['tweet'].values)

In [8]:
# Load the saved Keras model used for prediction in the widget cell.
model = load_model(filepath='model/M1-SkenarioBiLSTM07.h5')

In [9]:
print("Deteksi Konten")
button = widgets.ToggleButton(description="Check")
text = widgets.Text(layout=Layout(width='60%'))
output = widgets.Output()

def on_button_clicked(b):
    """Classify the text currently in the input box and show the verdict.

    The text is cleaned with ``preprocessing``, converted to padded token
    sequences and passed to ``model.predict``; the index of the highest
    score selects the message that is printed into ``output``.

    Args:
        b: change notification supplied by ``button.observe`` (unused).
    """
    output.clear_output()
    with output:
        kalimat = preprocessing(text.value)
        if not kalimat:
            # Nothing is left after cleaning: sent_tokenize would return an
            # empty list and an empty batch would be sent to the model.
            print("Teks kosong, tidak ada yang diperiksa.")
            return

        kalimat = sent_tokenize(kalimat)
        kalimat = tokenizer.texts_to_sequences(kalimat)
        kalimat = pad_sequences(kalimat, maxlen=maxlen)
        pred = model.predict(kalimat)
        # Evaluate the winning index once instead of once per branch.
        label = np.argmax(pred)
        print("Status Konten: ")
        if label == 0:
            non_adult = colored('Bukan Konten Dewasa', color='grey', on_color='on_blue', attrs=['bold'])
            print(non_adult)
        elif label == 1:
            adult = colored('Konten Dewasa', color='grey', on_color='on_red', attrs=['bold'])
            print(adult)

display(text)
display(button, output)
# Subscribe to the toggle's `value` trait only; observing every trait could
# run the prediction more than once per click.
button.observe(on_button_clicked, names='value')

Deteksi Konten


Text(value='', layout=Layout(width='60%'))

ToggleButton(value=False, description='Check')

Output()