# Code Preparation

In [None]:
!pip install torch torchvision
!pip install transformers
!pip install indonlu
!pip install nltk
!pip install tqdm
!git clone https://github.com/indobenchmark/indonlu.git

In [None]:
import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer
from torch.utils.data import DataLoader, Dataset

from indonlu.utils.forward_fn import forward_sequence_classification
from indonlu.utils.metrics import document_sentiment_metrics_fn
from indonlu.utils.data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader

In [None]:
# Choose the compute device once; report what was found.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

In [None]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random generators.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device rather than only the current one, so multi-GPU
    # runs are covered as well.
    torch.cuda.manual_seed_all(seed)

def count_param(module, trainable=False):
    """Return the number of scalar parameters in ``module``.

    Args:
        module: Object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
        trainable: When true, count only parameters with ``requires_grad`` set.

    Returns:
        Sum of ``numel()`` over the selected parameters.
    """
    selected = module.parameters()
    if trainable:
        selected = (param for param in selected if param.requires_grad)
    return sum(param.numel() for param in selected)

def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter groups.
    """
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']

def metrics_to_string(metric_dict):
    """Format a metric mapping as space-separated ``name:value`` pairs.

    Each value is rendered with two decimal places, in the mapping's
    iteration order.
    """
    return ' '.join(f'{name}:{score:.2f}' for name, score in metric_dict.items())

In [None]:
# Seed every RNG up front so the cells below start from a fixed random state.
set_seed(19072021)

**LOAD MODEL**

In [None]:
# One checkpoint name shared by the tokenizer, the config and the model weights
pretrained_name = 'indobenchmark/indobert-base-p1'

tokenizer = BertTokenizer.from_pretrained(pretrained_name)

# Take the number of output labels from the dataset class
config = BertConfig.from_pretrained(pretrained_name)
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Build the sequence-classification model from the checkpoint and the adjusted config
model = BertForSequenceClassification.from_pretrained(pretrained_name, config=config)

In [None]:
# Total number of parameters in the loaded model (trainable and frozen alike).
count_param(model)

**PREPARE DATASET**

In [None]:
# Label <-> index lookup tables provided by the dataset class
w2i = DocumentSentimentDataset.LABEL2INDEX
i2w = DocumentSentimentDataset.INDEX2LABEL
print(w2i)
print(i2w)

In [None]:
# Move the model to the device selected earlier (GPU when available, otherwise CPU)
# instead of calling .cuda() unconditionally, which fails on a CPU-only machine.
model = model.to(device)

# Build the optimizer after the move so it is created over the parameters as they
# live on the target device.
optimizer = optim.Adam(model.parameters(), lr=3e-5)

# Main Code


TOKENIZER NLTK




In [None]:
import json
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases look for it
# under 'punkt_tab' while older ones use 'punkt', so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load CSV data
df = pd.read_csv('/content/MRT-TJ-fix.csv')  # replace with the path to your CSV file

# Inspect a few sample rows
print(df[['full_text', 'sentiment']].head())

# Initial tokenization with NLTK: lowercase each text and split it into tokens;
# rows with a missing text get an empty token list.
df['tokenized_text'] = df['full_text'].apply(lambda x: word_tokenize(str(x).lower()) if pd.notnull(x) else [])

# Show a few results of the initial tokenization
print(df[['full_text', 'tokenized_text']].head())

CONVERSION & REJOIN

In [None]:
# Read the slang-to-standard-word mapping, stored as JSON text in slangWordGbgFix.txt.
# The encoding is passed explicitly so decoding does not depend on the platform default.
with open('/content/slangWordGbgFix.txt', 'r', encoding='utf-8') as file:
    data_json = file.read()

# Parse the JSON text into a Python object
data_obj = json.loads(data_json)

# Replace slang tokens with their standard form
def convert_slang_loop(token_list, slang_dict):
    """Map each token through ``slang_dict``, leaving unknown tokens unchanged.

    Args:
        token_list: Iterable of tokens.
        slang_dict: Mapping from slang token to its replacement.

    Returns:
        A new list with one entry per input token, in the same order.
    """
    return [
        slang_dict[tok] if tok in slang_dict else tok
        for tok in token_list
    ]

# Normalize every token list and re-join it into a single space-separated string
normalized_text = [
    " ".join(convert_slang_loop(tokens, data_obj))
    for tokens in tqdm(df['tokenized_text'], desc="Converting slang words")
]

# Store the normalized strings on the DataFrame
df['normalized_text'] = normalized_text

# Show a few conversion results
print(df[['normalized_text']].head())



In [None]:
# Peek at the NLTK token lists again, for comparison with the normalized text printed above.
print(df[['tokenized_text']].head())

In [None]:
# Fetch NLTK's stopword corpus.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Indonesian stopword list, held in a set because remove_stopwords tests membership per word.
stop_words = set(stopwords.words('indonesian'))

# Drop stopwords from a whitespace-separated string
def remove_stopwords(text):
    """Return ``text`` with every word found in ``stop_words`` removed.

    Words are split on whitespace and compared case-insensitively; the
    surviving words are re-joined with single spaces.
    """
    kept = []
    for word in text.split():
        if word.lower() in stop_words:
            continue
        kept.append(word)
    return ' '.join(kept)

# Strip stopwords from every normalized text and keep the result in a new column
df['cleaned_text'] = df['normalized_text'].map(remove_stopwords)

# Before/after view of the stopword removal
print(df[['normalized_text', 'cleaned_text']])


TOKENIZER BERT

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the (text, label) rows; the remaining 90% is the training set.
train_df, test_df = train_test_split(
    df[['cleaned_text', 'sentiment']],
    test_size=0.1,
    random_state=42,
)

In [None]:
# Split the hold-out in half: one half stays as test_df, the other becomes val_df.
# NOTE(review): test_df is overwritten with a subset of itself, so re-running this
# cell without re-running the previous split shrinks both sets again.
test_df, val_df = train_test_split(
    test_df[['cleaned_text', 'sentiment']],
    test_size=0.5,
    random_state=42,
)

In [None]:
# Two-column (text, label) selection of the full dataset, displayed for a quick check.
# NOTE(review): main_df is not referenced again in the visible cells — confirm it is still needed.
main_df = df[['cleaned_text', 'sentiment']]
main_df.head()

In [None]:
# Write each split as a headerless, index-free TSV file in the working directory.
_tsv_options = dict(sep='\t', header=False, index=False)
train_df.to_csv('train_df.tsv', **_tsv_options)
test_df.to_csv('test_df.tsv', **_tsv_options)
val_df.to_csv('val_df.tsv', **_tsv_options)

In [None]:
# Training split: dataset wrapper plus a shuffling batch loader.
train_dataset_path = '/content/train_df.tsv'
train_dataset = DocumentSentimentDataset(train_dataset_path, tokenizer, lowercase=True)
train_loader = DocumentSentimentDataLoader(
    dataset=train_dataset,
    max_seq_len=512,
    batch_size=32,
    num_workers=16,
    shuffle=True,
)

In [None]:
# Test split: same dataset/loader configuration as the training split.
test_dataset_path = '/content/test_df.tsv'
test_dataset = DocumentSentimentDataset(test_dataset_path, tokenizer, lowercase=True)
test_loader = DocumentSentimentDataLoader(
    dataset=test_dataset,
    max_seq_len=512,
    batch_size=32,
    num_workers=16,
    shuffle=True,
)

# Model

In [None]:
# Fine-tune the model for a fixed number of epochs over train_loader.
n_epochs = 15
for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)

    # Running loss plus all predictions/labels seen in this epoch
    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward pass. The last element of each batch is left out of the model inputs.
        # device.type ('cuda' or 'cpu') follows the device chosen at the top of the
        # notebook instead of hard-coding 'cuda'.
        loss, batch_hyp, batch_label = forward_sequence_classification(
            model, batch_data[:-1], i2w=i2w, device=device.type)

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Accumulate predictions and labels for the epoch-level metrics
        list_hyp += batch_hyp
        list_label += batch_label

        # Progress bar shows the mean loss over the batches processed so far
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Epoch summary: mean loss and metrics over every training batch
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

In [None]:
    # Evaluate the trained model on test_loader.
    # NOTE(review): this cell keeps the indentation of a loop body but sits in its own
    # cell, so it runs once after training and only if the notebook front end accepts
    # the leading indentation — confirm. `epoch` below is the value left behind by the
    # training loop, and the data comes from test_loader even though the final message
    # is labelled "VALID".
    model.eval()
    torch.set_grad_enabled(False)

    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label = [], []

    pbar = tqdm(test_loader, leave=True, total=len(test_loader))
    for i, batch_data in enumerate(pbar):
        batch_seq = batch_data[-1]
        # device.type ('cuda' or 'cpu') follows the device chosen at the top of the
        # notebook instead of hard-coding 'cuda'.
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device=device.type)

        # Calculate total loss
        valid_loss = loss.item()
        total_loss = total_loss + valid_loss

        # Metrics are recomputed over everything seen so far, for the progress bar
        list_hyp += batch_hyp
        list_label += batch_label
        metrics = document_sentiment_metrics_fn(list_hyp, list_label)

        pbar.set_description("TEST LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))

    # Final metrics over the whole loader
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

In [None]:
# Classify one hand-written sentence and report the predicted label with its probability.
text = 'naik transjakarta selalu kena macet'
subwords = torch.LongTensor(tokenizer.encode(text)).view(1, -1).to(model.device)

logits = model(subwords)[0]
_, top_index = torch.topk(logits, k=1, dim=-1)
label = top_index.squeeze().item()

# Softmax probability of the predicted class, as a percentage
confidence = F.softmax(logits, dim=-1).squeeze()[label] * 100
print(f'Text: {text} | Label : {i2w[label]} ({confidence:.3f}%)')

In [None]:
val_dataset_path = '/content/val_df.tsv'

val_dataset = DocumentSentimentDataset(val_dataset_path, tokenizer, lowercase=True)

# shuffle=False: the predictions made from this loader are saved by position, so batches
# must come out in file order for row i of the results to match row i of val_df.tsv.
val_loader = DocumentSentimentDataLoader(dataset=val_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=False)

In [None]:
# Predict a label for every example in val_loader and save the predictions to CSV.
model.eval()
torch.set_grad_enabled(False)


list_hyp, list_label = [], []

# The progress-bar total is the length of the loader actually being iterated.
pbar = tqdm(val_loader, leave=True, total=len(val_loader))
for i, batch_data in enumerate(pbar):
    # Only the predictions are kept. device.type ('cuda' or 'cpu') follows the device
    # chosen at the top of the notebook instead of hard-coding 'cuda'.
    _, batch_hyp, _ = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device=device.type)
    list_hyp += batch_hyp

# Save prediction: one row per example, with its position in the 'index' column
results_df = pd.DataFrame({'label':list_hyp}).reset_index()
results_df.to_csv('results.csv', index=False)

# Show the predictions that were just written, not the unrelated input DataFrame
print(results_df)

In [None]:
# Write the model's state_dict (its weights) to disk.
file_path = 'Sentiment_Analysis.pth'
model_weights = model.state_dict()
torch.save(model_weights, file_path)

# Result Visualization

In [None]:
# NOTE(review): this prints the whole DataFrame; df.head() is probably enough here — confirm.
print(df)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Bar chart: number of rows per sentiment label
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='sentiment', ax=ax)
ax.set_title('Amount of Data for Each Sentiment')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()
print()

# Draw one word cloud for a block of text
def generate_wordcloud(text, sentiment):
    """Render and show a word cloud built from ``text``.

    Args:
        text: Single string whose words are drawn in the cloud.
        sentiment: Label used only in the figure title.
    """
    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud = cloud.generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'WordCloud for {sentiment} Sentiment')
    plt.show()

# One word cloud per distinct sentiment label, built from that label's normalized texts
for sentiment in df['sentiment'].unique():
    label_rows = df[df['sentiment'] == sentiment]
    text = ' '.join(label_rows['normalized_text'])
    generate_wordcloud(text, sentiment)
    print()