In [8]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences, to_categorical
from sklearn.preprocessing import LabelEncoder
from keras import Sequential, regularizers
from keras.layers import LSTM, Embedding, Dropout, Dense
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from apiclient import discovery
tokenizer = None
model = None
FILEPATH = 'Dataset Twitter Fix - Indonesian Sentiment Twitter Dataset Labeled.csv'
SLANGPATH = 'colloquial-indonesian-lexicon.csv'
slang_dictionary = None
sentiment_amount = 3
sentiments = ["neutral", "positive", "negative"]

In [9]:
# get dataset + preprocessing teks
def get_dataset():
    dataset = pd.read_csv(FILEPATH)

    all_data = []
    dataset = dataset[dataset['sentimen'].isin([0, 1, 2])]
    smallest = list(dataset['sentimen']).count(0)
    for i in range(sentiment_amount):
        smallest = min(smallest, list(dataset['sentimen']).count(i))
    for i in range(sentiment_amount):
        data = dataset[dataset['sentimen'] == i]
        pos = np.random.choice(data.index, len(data) - smallest, replace=False)
        data = data.drop(pos)
        all_data.append(data)

    dataset = pd.concat(all_data)

    for index, data in dataset.iterrows():
        text = data['Tweet']
        text = str(text).lower()
        word_list = word_tokenize(text)
        temp = []
        if slang_dictionary == None:
            initialize_slang_dictionary()
        for word in word_list:
            if slang_dictionary.get(word):
                temp.append(slang_dictionary[word])
            else:
                temp.append(word)
        word_list = temp
        id_stopwords = stopwords.words('indonesian')
        word_list = [word for word in word_list if word not in id_stopwords]
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        word_list = [stemmer.stem(word) for word in word_list]
        text = ' '.join(word_list)
        dataset.at[index, 'Tweet'] = text

    dataset = dataset.sample(frac = 1)

    return dataset

def initialize_slang_dictionary():
    global slang_dictionary
    slang_dictionary = {}
    dataset = pd.read_csv(SLANGPATH)
    for _, data in dataset.iterrows():
        key = data['slang']
        value = data['formal']
        slang_dictionary[key] = value

# training model
def train_model():
    dataset = get_dataset()
    x, y = preprocess_data(dataset['Tweet'], dataset['sentimen'])
    global model
    initialize_model()
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=1, mode='auto')
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)
    model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_test, y_test), callbacks=[early_stopping])
    score = model.evaluate(x_test, y_test, verbose = 1)
    y_pred = model.predict(x_test)
    y_pred = np.argmax(y_pred, axis=1)
    y_test_argmax = np.argmax(y_test, axis=1)

    print('Score: ', score)
    print('Accuracy: ', accuracy_score(y_test_argmax, y_pred))
    print('Precision:', precision_score(y_test_argmax, y_pred, average='weighted'))
    print('Recall:', recall_score(y_test_argmax, y_pred, average='weighted'))
    print('F1 score:', f1_score(y_test_argmax, y_pred, average='weighted'))
    print('Confusion Matrix: \n', confusion_matrix(y_test_argmax, y_pred))
    model.save('LSTM.keras')

def preprocess_data(text_list, sentiments):
    global tokenizer
    if tokenizer == None:
        initialize_tokenizer()

    sequences = tokenizer.texts_to_sequences(text_list)

    x = pad_sequences(sequences)

    le = LabelEncoder()
    y = le.fit_transform(sentiments)
    y = to_categorical(y)

    return x, y

def initialize_tokenizer():
    dataset = get_dataset()
    global tokenizer
    tokenizer = Tokenizer(num_words=4000)
    tokenizer.fit_on_texts(dataset['Tweet'])

def initialize_model():
    global model
    word_count = get_word_count()
    model = Sequential([
        Embedding(input_dim=word_count + 1, output_dim=200),
        LSTM(128, return_sequences=True),
        Dropout(0.5),
        LSTM(128),
        Dense(64, activation='relu', 
            kernel_regularizer=regularizers.l2(0.01), 
            activity_regularizer=regularizers.l1(0.01)),
        Dense(sentiment_amount, activation='softmax')
    ])

def get_word_count():
    global tokenizer
    if tokenizer == None:
        initialize_tokenizer()
    return len(tokenizer.word_index)

# testing model
def load_model():
    global model
    if model == None:
        initialize_model()
    model.load_weights('LSTM.keras')

def predict_comment(text):
    global model
    global tokenizer
    if tokenizer == None:
        initialize_tokenizer()
    if model == None:
        initialize_model()
    sequences = tokenizer.texts_to_sequences(text)
    x = pad_sequences(sequences)
    y_pred = model.predict(x)
    y_pred = np.argmax(y_pred, axis=1)

    for index, comment in enumerate(text):
        print(f"{comment}: {sentiments[y_pred[index]]}")

In [10]:
# Inisialisasi Youtube API
api_key = ''
youtube = discovery.build('youtube', 'v3', developerKey=api_key)

def get_comments(video_id):
    comments = []
    response = youtube.commentThreads().list(
        part = 'snippet',
        videoId = video_id,
        maxResults = 600,     # menyesuaikan jumlah ini
        textFormat = 'plainText'
    ).execute()
    while response:
        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)

            if len(comments) >= 10000:     # Batas jumlah komentar
                return comments

        if 'nextPageToken' in response:
            response = youtube.commentThreads().list(
            part = 'snippet',
            videoId = video_id,
            pageToken = response['nextPageToken'],
            maxResults = 100,     # menyesuaikan jumlah ini
            textFormat = 'plainText'
        ).execute()
        else:
            break
    return comments

In [11]:
train_model()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping
Score:  [1.1589776277542114, 0.5942720770835876]
Accuracy:  0.594272076372315
Precision: 0.598995906917254
Recall: 0.594272076372315
F1 score: 0.5957466732415336
Confusion Matrix: 
 [[321 133 100]
 [173 330  66]
 [121  87 345]]


In [14]:
comments = get_comments('RRgLZ66NCmE')
predict_comment(comments)

TimeStamp Film:

00:00:00 Opening
00:12:35 Peta Suara Sumatera Pilpres 2014 dan 2019
00:13:47 Pemecahan Papua menjadi 6 Provinisi
00:16:46 Penunjukan 20 PJ Gubernur dan 82 PJ Walikota/Bupati dipilih Presiden
00:15:10 Kewenangan yang dimiliki Jenderal Tito Karnavian
00:18:01 Pelanggaran keputusan MK dalam penunjukan kepala daerah
00:19:45 Relasi PJ Gubernur dengan presiden
00:21:40 Kecenderungan tidak netral PJ Gubernur
00:22:55 Kecenderungan tidak netral PJ Bupati
00:25:17 Kewenangan PJ Kepala Daerah melarang kampanye
00:25:33 Kontroversi deklarasi desa bersatu untuk paslon tertentu
00:28:41 Politisasi kasus penyelewengan dana desa
00:30:42 Tekanan kepada kepala desa untuk mendukung paslon tertentu
00:35:20 Fakta bansos jelang pemilu
00:38:28 Gaji ASN, Polri, dan pensiunan PNS naik di 2024
00:40:00 Pengabaian data kemensos dalam penyaluran bansos
00:42:00 Bansos digunakan sebagai alat politik
00:53:00 Presiden tidak netral
00:55:50 Presiden dan menteri-menteri di pihak koalisi 02
00:57