In [None]:
!apt update
!apt -q install firefox-geckodriver python3-selenium
!pip install transformers

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
[33m0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [1 In[0m[33m0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Conn[0m                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
[33m0% [Connecting to archive.ubuntu.com (91.189.91.39)] [Connecting to security.ub[0m                                                                               Get:3 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
[33m0% [Waiting for headers] [3 InRelease 14.2 kB/114 kB 12%] [Waiting for headers][0m                                                                               Hit:4 http://archive.ubuntu.com/ubuntu focal InRelease
Get:5 http://archive.ubuntu.com/ubuntu focal-updates InRele

In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver import Firefox, FirefoxOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from transformers import TFBertModel, BertTokenizer
from tensorflow.keras.models import load_model
from transformers import DistilBertTokenizer, TFDistilBertModel
from tensorflow.nn import softmax

In [None]:
class ReviewScraper:
    """Collect reviews from a Google search result page with headless Firefox.

    The scraper opens the page, clicks the element it treats as the reviews
    button, scrolls the review dialog so more entries load, and parses the
    resulting HTML with BeautifulSoup.

    The CSS class names used as selectors are copied from the page markup and
    must be updated whenever that markup changes.

    The instance owns a browser process: call ``close()`` (or use the instance
    as a context manager) when finished so the browser does not keep running.
    """

    def __init__(self):
        # Set up Firefox options
        options = FirefoxOptions()
        options.add_argument('-headless')
        # Request Indonesian content from the site.
        options.set_preference('intl.accept_languages', 'id-ID')

        # Initialize WebDriver; explicit waits give up after 10 seconds.
        self.driver = Firefox(options=options)
        self.waiter = WebDriverWait(self.driver, 10)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never suppress exceptions raised inside the with-block

    def close(self):
        """Quit the browser if it is still running. Safe to call repeatedly."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()
            self.driver = None

    def scrape(self, url, scroll_rounds=3):
        """Load ``url`` and return the reviews found in its review dialog.

        Args:
            url: Address of the search result page to open.
            scroll_rounds: How many times the review list is scrolled to the
                bottom to trigger loading of further reviews.

        Returns:
            dict mapping reviewer name to a dict with the keys ``'isi'``
            (review text), ``'rating'`` (float or None), ``'gambar'`` (value of
            the avatar element's ``src`` attribute, or None) and ``'like'``
            (text of the like-count element, or None). When a reviewer name
            occurs more than once only the first entry is kept. The same dict
            is stored on ``self.articles``.

        Raises:
            selenium.common.exceptions.TimeoutException: when one of the
                explicit waits does not succeed in time.
        """
        # Imported locally so this block does not depend on the import cell.
        from selenium.common.exceptions import WebDriverException

        self.articles = {}

        self.driver.get(url)

        review_btn = self.waiter.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'hqzQac')))

        # Close cookie popup. Best effort: the popup is not always shown, so a
        # failed lookup/click is ignored, but only Selenium errors are caught.
        try:
            self.driver.find_element(By.CLASS_NAME, 'L2AGLb').click()
        except WebDriverException:
            pass

        review_btn.click()

        scrollable_div = self.waiter.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'review-dialog-list')))

        for _ in range(scroll_rounds):
            self.driver.execute_script(
                "arguments[0].scrollTop = arguments[0].scrollHeight", scrollable_div)
            # Wait until the activity indicator is no longer visible before
            # scrolling again.
            self.waiter.until(
                EC.invisibility_of_element_located((By.CLASS_NAME, 'jfk-activityIndicator')))

        soup = BeautifulSoup(self.driver.page_source, "lxml")

        for item in soup.select('.WMbnJf'):
            parsed = self._parse_review(item)
            if parsed is None:
                continue
            nama, record = parsed
            if nama not in self.articles:
                self.articles[nama] = record

        return self.articles

    @staticmethod
    def _parse_review(item):
        """Turn one review element into ``(name, record)``; ``None`` if it has no name or text."""
        name_el = item.select_one('.TSUbDb')
        # Prefer the full-text element; fall back to the alternative selector.
        body_el = item.select_one('.review-full-text')
        if body_el is None:
            body_el = item.select_one('.Jtu6Td span span')
        if name_el is None or body_el is None:
            return None

        avatar_el = item.select_one('.lDY1rd')
        like_el = item.select_one('.QWOdjf')
        rating_el = item.select_one('.lTi8oc')

        rating = None
        if rating_el is not None:
            rating = ReviewScraper.extract_first_number(rating_el.get('aria-label'))

        record = {
            'isi': body_el.text,
            'rating': rating,
            'gambar': avatar_el.get('src') if avatar_el is not None else None,
            'like': like_el.text if like_el is not None else None,
        }
        return name_el.text, record

    @staticmethod
    def extract_first_number(sentence):
        """Return the first number in ``sentence`` as a float, or ``None``.

        Both ``.`` and ``,`` are accepted as the decimal separator, because the
        browser is configured for the id-ID locale, where decimals are written
        with a comma. ``None`` or an empty string yields ``None``.
        """
        if not sentence:
            return None
        match = re.search(r'\d+(?:[.,]\d+)?', sentence)
        if match is None:
            return None
        return float(match.group(0).replace(',', '.'))

class SentimentAnalyzer:
    """Classify review text with a saved Keras model built on TFBertModel.

    Args:
        model_path: Path of the saved Keras model to load.
        max_len: Fixed token length every review is padded/truncated to.
        tokenizer_name: Name or path of the BERT tokenizer to load. Defaults
            to the IndoBERT checkpoint used throughout this notebook.
    """

    def __init__(self, model_path, max_len=200,
                 tokenizer_name='indobenchmark/indobert-base-p1'):
        # Load tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name)

        # Load the trained model; the custom layer class must be supplied so
        # Keras can rebuild it from the saved file.
        self.model = load_model(model_path, custom_objects={'TFBertModel': TFBertModel})

        self.max_len = max_len

    def predict_sentiment(self, text):
        """Return the index of the highest-scoring class for one review string."""
        encoded_text = self.encode_reviews(self.tokenizer, [text], self.max_len)
        prediction = self.model.predict(encoded_text)
        # A single review was passed in, so take the first (only) row.
        return np.argmax(prediction, axis=-1)[0]

    @staticmethod
    def encode_reviews(tokenizer, reviews, max_length):
        """Tokenize ``reviews`` into fixed-length id and mask arrays.

        Args:
            tokenizer: Object with an ``encode(text, max_length=..., truncation=...,
                padding=...)`` method returning ``max_length`` token ids.
            reviews: Sized iterable of review strings.
            max_length: Length every sequence is padded/truncated to.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'``, both int32
            arrays of shape ``(len(reviews), max_length)``. The mask is 1 where
            the token id differs from the padding id and 0 elsewhere.
        """
        token_ids = np.zeros(shape=(len(reviews), max_length), dtype=np.int32)
        for i, review in enumerate(reviews):
            token_ids[i] = tokenizer.encode(
                review, max_length=max_length, truncation=True, padding='max_length')

        # Use the tokenizer's own padding id; fall back to 0 when the
        # tokenizer does not expose one.
        pad_id = getattr(tokenizer, 'pad_token_id', None)
        if pad_id is None:
            pad_id = 0
        attention_mask = (token_ids != pad_id).astype(np.int32)
        return {'input_ids': token_ids, 'attention_mask': attention_mask}


class DistilledSentimentAnalyzer:
    """Classify review text with a saved Keras model built on TFDistilBertModel.

    Args:
        model_path: Path of the saved Keras model to load.
        max_len: Fixed token length every review is padded/truncated to.
        tokenizer_name: Name or path of the BERT tokenizer to load. Defaults
            to the IndoBERT checkpoint used throughout this notebook.
    """

    def __init__(self, model_path, max_len=200,
                 tokenizer_name='indobenchmark/indobert-base-p1'):
        # Load tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name)

        # Load the trained model; the custom layer class must be supplied so
        # Keras can rebuild it from the saved file.
        self.model = load_model(model_path, custom_objects={'TFDistilBertModel': TFDistilBertModel})

        self.max_len = max_len

    def predict_sentiment(self, text):
        """Return the index of the highest-scoring class for one review string."""
        inputs = self.encode_review(self.tokenizer, text, self.max_len)
        prediction = self.model.predict([inputs['input_ids'], inputs['attention_mask']])
        # argmax is taken on the model output directly: applying softmax first
        # cannot change which class has the highest score.
        return np.argmax(prediction, axis=-1)[0]

    @staticmethod
    def encode_review(tokenizer, review, max_length):
        """Encode one review to exactly ``max_length`` tokens.

        Args:
            tokenizer: Object exposing ``encode_plus``.
            review: Review text.
            max_length: Length the sequence is padded or truncated to.

        Returns:
            Whatever ``tokenizer.encode_plus`` returns; the caller reads its
            ``'input_ids'`` and ``'attention_mask'`` entries.
        """
        return tokenizer.encode_plus(
            review,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=max_length,
            padding='max_length',  # Pad short reviews up to max_length
            # Without explicit truncation, reviews longer than max_length are
            # returned at full length and no longer fit the model input.
            truncation=True,
            return_attention_mask=True,  # Construct attention masks
            return_tensors='tf',  # Return tf tensors
        )

# Predict

In [None]:
# Sentiment classifier: the distilled student checkpoint stored on Drive.
analyzer = DistilledSentimentAnalyzer(
    model_path='drive/MyDrive/Traversee/my_student_model_continued.h5',
)

# Headless-browser scraper that will fetch the reviews to classify.
scraper = ReviewScraper()



In [None]:
# Search result page whose review dialog is scraped.
search_url = "https://www.google.com/search?client=firefox-b-d&q=pantai+alam+indah"
reviews = scraper.scrape(search_url)

# Attach the predicted class index to every scraped review, in place.
for name, review in reviews.items():
    review.update(sentimen=analyzer.predict_sentiment(review['isi']))

reviews



{'Rimarsya Caca': {'isi': 'htm 12k pas work day bisa naik 2x lipat pas weekend, saya selalu betah kalo main disini tempatnya adem, sejuk juga banyak jajanannya, suasananya cukup menenangkan, walaupun ada banyak sampah yang masih belum bisa di kelola dengan baik oleh pengunjung maupun pihak PAI sendiri',
  'rating': 5.0,
  'gambar': 'https://lh3.googleusercontent.com/a/AAcHTtepI4qgratc7VaZ0811QUCFWqlQUHRPFPdu-7p_=s40-c-c0x00000000-cc-rp-mo-ba3-br100',
  'like': '0',
  'sentimen': 2},
 'indrawati': {'isi': 'Bener-bener indah bgt sesuai namanya 😍 destinasi wisata terwajib bagi yg ingin berkunjung ke tegal. Alami bgt pantainya, gak begitu kotor, pemandangannya luar biasa MasyaAllah, recommended bgt kalo kesini sore-malam.',
  'rating': 5.0,
  'gambar': 'https://lh3.googleusercontent.com/a-/AD_cMMQKbE3vkIOTB1M_bbE_88UrXat7x_QEC0hPUuNN9w=s40-c-c0x00000000-cc-rp-mo-ba4-br100',
  'like': '10',
  'sentimen': 2},
 'muhajir saja': {'isi': 'View pantai khas pantai utara Jawa. Pantai dengan ombak t

# Prepare

In [None]:
# Data provenance: the four CSV files read below can be re-downloaded from
# these spreadsheet exports (left disabled; the files must already be on disk).
#!curl -sLo menara_pandang.csv "https://docs.google.com/spreadsheets/d/11EP2bGzUS8bPGpN2xxPCoDgt5xlbrbRZ/export?format=csv&gid=618304663"
#!curl -sLo merapi.csv "https://docs.google.com/spreadsheets/d/1fd1MHYmYZMw222KiWIaa-YfWoIItLbay/export?format=csv&gid=324924873"
#!curl -sLo pantai_alam_indah.csv "https://docs.google.com/spreadsheets/d/1wJFKdHxTuAjNdpoW8zQJRduxLMKdYBYs/export?format=csv&gid=1364670710"
#!curl -sLo prambanan.csv "https://docs.google.com/spreadsheets/d/1zBnRygWcuJUiml0XJodHUxA0wXtGfPZZ/export?format=csv&gid=1163771719"

# One frame per review CSV.
df1, df2, df3, df4 = (
    pd.read_csv(csv_name)
    for csv_name in (
        'menara_pandang.csv',
        'merapi.csv',
        'pantai_alam_indah.csv',
        'prambanan.csv',
    )
)

# NOTE(review): df4 (prambanan) is loaded but is not part of the combined
# frame below -- confirm whether it is held out on purpose.
df = pd.concat((df1, df2, df3), axis=0, ignore_index=True)

# Remap the labels -1 / 0 / 1 to the class ids 0 / 1 / 2.
df['label'] = df['label'].replace(to_replace=[-1, 0, 1], value=[0, 1, 2])

# Token length every review is padded/truncated to.
max_len = 200

# Tokenize the 'review' column into the input_ids / attention_mask arrays
# that the training cells feed to the models.
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
encoded_inputs = SentimentAnalyzer.encode_reviews(
    tokenizer=tokenizer,
    reviews=df['review'],
    max_length=max_len,
)

# Train

In [None]:
# Teacher model: pretrained IndoBERT encoder with a small dense head on top.
indobert_model = TFBertModel.from_pretrained('indobenchmark/indobert-base-p1')

# Freeze the encoder so only the layers added below are trained.
for encoder_layer in indobert_model.layers:
    encoder_layer.trainable = False

# The input names match the keys of `encoded_inputs`.
input_ids = Input(shape=(max_len,), dtype=np.int32, name='input_ids')
attention_mask = Input(shape=(max_len,), dtype=np.int32, name='attention_mask')
outputs = indobert_model({'input_ids': input_ids, 'attention_mask': attention_mask})

# Classification head, applied in order to the second element of the encoder
# output (presumably the pooled representation -- confirm against the
# TFBertModel output documentation). Ends in a 3-way softmax.
head_layers = (
    Dropout(0.1),
    Dense(32, activation='relu'),
    Dropout(0.1),
    Dense(3, activation='softmax'),
)
x = outputs[1]
for head_layer in head_layers:
    x = head_layer(x)
model = Model(inputs=[input_ids, attention_mask], outputs=[x])

# Integer class ids as targets -> sparse categorical cross-entropy.
# Adam runs with its default settings (an explicit learning_rate=0.0005 was
# left commented out in an earlier version of this cell).
model.compile(
    optimizer=Adam(),
    loss=SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)
model.fit(x=encoded_inputs, y=np.array(df['label']), epochs=10, batch_size=16)

# Save the whole model (architecture + weights).
model.save('drive/MyDrive/Traversee/my_model.h5')

# Distill

In [None]:
# Teacher loading is disabled: the soft labels are read from a cached .npy
# file further down instead of being recomputed.
#teacher_model = load_model('drive/MyDrive/Traversee/my_model.h5',
#                           custom_objects={'TFBertModel': TFBertModel})

# Student encoder: a DistilBERT checkpoint converted from PyTorch weights.
# The tokenizer is the same IndoBERT tokenizer used for the teacher.
student_tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
student_model = TFDistilBertModel.from_pretrained(
    'afbudiman/indobert-distilled-optimized-for-classification',
    from_pt=True,
)

# Freeze the encoder so only the layers added below are trained.
for encoder_layer in student_model.layers:
    encoder_layer.trainable = False

# Student inputs; the names match the keys of `encoded_inputs`.
input_ids = Input(shape=(max_len,), dtype=np.int32, name='input_ids')
attention_mask = Input(shape=(max_len,), dtype=np.int32, name='attention_mask')
outputs = student_model({'input_ids': input_ids, 'attention_mask': attention_mask})

# Last hidden state of the encoder (its final output, not the first layer).
hidden_state = outputs.last_hidden_state

# Keep only position 0 of every sequence, i.e. the [CLS] token.
cls_token = hidden_state[:, 0, :]

# Same head shape as the teacher: dropout -> dense(32) -> dropout -> softmax(3).
head_layers = (
    Dropout(0.1),
    Dense(32, activation='relu'),
    Dropout(0.1),
    Dense(3, activation='softmax'),
)
x = cls_token
for head_layer in head_layers:
    x = head_layer(x)
student = Model(inputs=[input_ids, attention_mask], outputs=[x])

# The targets are probability vectors, hence (non-sparse) categorical
# cross-entropy.
student.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Soft labels from the teacher. The cached file is presumably the result of
# the disabled predict call below; its rows must line up with
# `encoded_inputs` -- TODO confirm.
#teacher_predictions = teacher_model.predict(encoded_inputs)
teacher_predictions = np.load('drive/MyDrive/Traversee/teacher_predictions.npy')

# Fit the student on the same inputs, using the teacher's outputs as targets.
student.fit(x=encoded_inputs, y=teacher_predictions, epochs=10)

# Save the student model
student.save('drive/MyDrive/Traversee/my_student_model.h5')


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Eksperimen

In [None]:
# Same IndoBERT tokenizer as in the other cells.
student_tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')

# Resume from a previously saved student checkpoint.
# NOTE(review): no cell in this notebook writes this file -- confirm where
# 'my_student_model_continued.h5' comes from.
student = load_model(
    'drive/MyDrive/Traversee/my_student_model_continued.h5',
    custom_objects={'TFDistilBertModel': TFDistilBertModel},
)

# Everything except the last five layers is frozen ...
for frozen_layer in student.layers[:-5]:
    frozen_layer.trainable = False

# ... and the last five layers are trainable.
for open_layer in student.layers[-5:]:
    open_layer.trainable = True

# Recompile so the trainable flags above take effect for training.
student.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Cached teacher soft labels; `encoded_inputs` must be the same inputs the
# predictions were generated for.
teacher_predictions = np.load('drive/MyDrive/Traversee/teacher_predictions.npy')

# Continue training the model for another 5 epochs
student.fit(x=encoded_inputs, y=teacher_predictions, epochs=5)

# Save the model
student.save('drive/MyDrive/Traversee/my_student_model_continued_aggresive.h5')



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
