In [None]:
!pip install transformers



In [None]:
from google.colab import drive
import pandas as pd

# Attach Google Drive to the Colab filesystem; the dataset and model paths
# used further down all live underneath this mount point.
DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive/


# Testing 1

## Model 1

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the dataset (the code below reads its 'Content' and 'Sentiment' columns)
path = '/content/drive/MyDrive/Colab Notebooks/Resha Ananda Rahman (ABSA)/Data Gojek/label_sentiment_vader_gojek.csv'
vader = pd.read_csv(path)

# Define hyperparameters for fine-tuning
learning_rate = 1e-5
batch_size = 32
max_seq_length = 128
num_train_epochs = 3

# Initialize DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

# Define the device to use (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Upper bound on tokens per review (special tokens included)
max_length = max_seq_length

# Keep only rows that can actually be trained on: non-missing text and a
# sentiment that has a class id. Any other value (e.g. NaN or a third class)
# would otherwise turn into NaN labels and a float label tensor.
# NOTE(review): ids presumably line up with the checkpoint's id2label
# (0 = NEGATIVE, 1 = POSITIVE) - confirm against model.config.id2label.
label_map = {'Positive': 1, 'Negative': 0}
vader_clean = vader.dropna(subset=['Content', 'Sentiment'])
vader_clean = vader_clean[vader_clean['Sentiment'].isin(list(label_map))]

dropped_rows = len(vader) - len(vader_clean)
if dropped_rows:
    print(f"Dropped {dropped_rows} rows with missing text or a label outside {sorted(label_map)}")
if vader_clean.empty:
    raise ValueError("No usable rows left after filtering 'Content' / 'Sentiment'")

# Tokenize all reviews in one call. The tokenizer truncates to max_length,
# pads every sequence to the longest one with its own pad token, and returns
# the matching attention mask (1 = real token, 0 = padding), so no manual
# padding, mask reconstruction or trimming is needed.
encodings = tokenizer(
    vader_clean['Content'].astype(str).tolist(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True,
    return_tensors="pt",
)
padded_texts = encodings['input_ids']
attention_masks = encodings['attention_mask']

# Split the dataset into training and validation sets (85% / 15%, fixed seed)
labels = torch.tensor(vader_clean['Sentiment'].map(label_map).to_numpy(), dtype=torch.long)
train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    padded_texts, attention_masks, labels, test_size=0.15, random_state=42
)

# Create DataLoader for training and validation sets
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Validation batches keep their order so predictions stay aligned with labels
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)


In [None]:
# Move model to the appropriate device
model.to(device)

# Fine-tune the model: AdamW over all parameters, one pass over
# train_dataloader per epoch, reporting the mean batch loss of each epoch.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for epoch in range(num_train_epochs):
    model.train()  # enable dropout for training
    total_loss = 0
    for batch in train_dataloader:
        # Batch-specific names so the notebook-level `labels` tensor (all
        # labels, built in the data-preparation cell) is not overwritten.
        batch_input_ids, batch_attention_mask, batch_labels = (t.to(device) for t in batch)

        optimizer.zero_grad()

        # Passing `labels` makes the model compute the classification loss itself
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_train_epochs}, Average Loss: {avg_loss}")


Epoch 1/3, Average Loss: 0.2694565971848872
Epoch 2/3, Average Loss: 0.15588589655809684
Epoch 3/3, Average Loss: 0.10219826291118007


In [None]:
# Collects one dict of metrics per evaluation run (initialised before the loop)
vader_score = []

# Evaluate on the validation set
model.eval()  # disable dropout so predictions are deterministic
val_loss = 0
predictions, true_labels = [], []

with torch.no_grad():  # no gradients needed for evaluation
    for batch in val_dataloader:
        # Batch-specific names so the notebook-level `labels` tensor is not overwritten
        batch_input_ids, batch_attention_mask, batch_labels = (t.to(device) for t in batch)

        outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
        val_loss += outputs.loss.item()

        # Predicted class = index of the largest logit in each row
        predictions.extend(torch.argmax(outputs.logits, dim=1).tolist())
        true_labels.extend(batch_labels.tolist())

avg_val_loss = val_loss / len(val_dataloader)

# Compute the evaluation metrics (class 1 is treated as the positive class)
accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)

# Show the evaluation results
print(f"Validation Loss: {avg_val_loss:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Append the results to the score list
vader_score.append({'Accuracy': accuracy,
                   'Precision': precision,
                   'Recall': recall,
                   'F1-score': f1})

# Show the collected results as a table. The header names the model that was
# actually evaluated here (DistilBERT; the previous text said "SVM").
print("\nEvaluation for DistilBERT Data Gojek-Vader:")
vader_score_df = pd.DataFrame(vader_score)
print(vader_score_df)


Validation Loss: 0.13
Accuracy: 0.95
Precision: 0.98
Recall: 0.96
F1 Score: 0.97

Evaluation for SVM Data Gojek-Vader:
   Accuracy  Precision    Recall  F1-score
0  0.952517   0.976027  0.956376  0.966102


## Predict 1

In [None]:
import copy
import os
import pickle

# Save the model.
# A CPU copy in eval mode is pickled instead of the live model: a pickle of
# CUDA-resident weights cannot be loaded on a CPU-only runtime, and pickle
# also stores the module's train/eval flag together with the weights.
# The live `model` is left untouched on its current device.
# NOTE: Transformers' save_pretrained()/from_pretrained() is the recommended
# way to persist models; pickle is kept because the prediction cell reads
# these .pkl files.
model_path = '/content/drive/MyDrive/Colab Notebooks/Resha Ananda Rahman (ABSA)/Model/distilbert_model.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)  # open() does not create folders
model_for_export = copy.deepcopy(model).to("cpu")
model_for_export.eval()
with open(model_path, 'wb') as f:
    pickle.dump(model_for_export, f)

# Save the tokenizer
tokenizer_path = '/content/drive/MyDrive/Colab Notebooks/Resha Ananda Rahman (ABSA)/Model/distilbert_tokenizer.pkl'
os.makedirs(os.path.dirname(tokenizer_path), exist_ok=True)
with open(tokenizer_path, 'wb') as f:
    pickle.dump(tokenizer, f)


In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import pickle

# Load model
# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load pickles that were written by this notebook.
model_path = '/content/drive/MyDrive/Colab Notebooks/Resha Ananda Rahman (ABSA)/Model/distilbert_model.pkl'
with open(model_path, 'rb') as f:
    model = pickle.load(f)
model.eval()  # make sure dropout is off, whatever mode the model was saved in

# Load tokenizer
tokenizer_path = '/content/drive/MyDrive/Colab Notebooks/Resha Ananda Rahman (ABSA)/Model/distilbert_tokenizer.pkl'
with open(tokenizer_path, 'rb') as f:
    tokenizer = pickle.load(f)

# The unpickled weights may live on CPU or GPU; inputs must be on the same device
model_device = next(model.parameters()).device

# Read the new text from the user
new_text = input("\nMasukkan teks baru: ")

# Tokenize the new text; truncate so it never exceeds the number of
# positions the model has embeddings for.
inputs = tokenizer(
    new_text,
    return_tensors="pt",
    truncation=True,
    max_length=model.config.max_position_embeddings,
).to(model_device)

# Predict with the model (no gradients needed for inference)
with torch.no_grad():
    logits = model(**inputs).logits

# Take the argmax over the class dimension of the single-row batch
predicted_class_id = logits.argmax(dim=-1).item()
predicted_label = model.config.id2label[predicted_class_id]

print("Predicted Sentiment:", predicted_label)



Masukkan teks baru: this place so disgusting
Predicted Sentiment: NEGATIVE
