In [1]:
!pip install ktrain datasets accelerate transformers -U

Collecting ktrain
  Downloading ktrain-0.41.3.tar.gz (25.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.3/25.3 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
Collecting langdetect (from ktrain)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [2]:
from google.colab import drive
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import ktrain
from ktrain import text
import joblib
from imblearn.over_sampling import RandomOverSampler, ADASYN, SMOTE
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score
from sklearn.utils import resample
from imblearn.under_sampling import NearMiss
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, pipeline
from datasets import Dataset, DatasetDict

**LOAD DATA**

In [3]:
# Mount Google Drive at /content/drive so files stored there are reachable from this runtime.
# NOTE(review): the saved output of this cell shows `ValueError: mount failed`;
# the cells below cannot run until the mount succeeds.
drive.mount('/content/drive')
# Path of the input CSV on the mounted drive; read by the following cell.
file_path = '/content/drive/My Drive/dataset/dataset_final.csv'

ValueError: mount failed

In [None]:
# Load the CSV at `file_path` into a DataFrame.
df = pd.read_csv(file_path)
# Preview the first rows.
df.head()

In [None]:
# Drop the 'Unnamed: 0' column (presumably a stray index column from an earlier
# `to_csv` — verify). errors='ignore' keeps this cell idempotent: without it,
# re-running the cell after the column is already gone raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
# Per-column count of missing values.
df.isna().sum()

In [None]:
# Number of rows per `label_num` value in the loaded frame (before any resampling).
label_counts = df['label_num'].value_counts()

print(label_counts)

In [None]:
# Balance the data by random oversampling: every non-majority class is sampled
# with replacement until it has as many rows as the majority class.
# The majority label and the set of classes are read from the data rather than
# hard-coded, so an absent class no longer crashes `resample` and labels outside
# a fixed range are no longer silently dropped.
majority_label = df['label_num'].value_counts().idxmax()
df_majority = df[df['label_num'] == majority_label]

# groupby yields the classes in sorted label order, so the concatenation order
# is: majority class first, then the remaining classes by ascending label.
upsampled_minority_classes = [
    resample(df_minority,
             replace=True,
             n_samples=len(df_majority),
             random_state=42)
    for label_num, df_minority in df.groupby('label_num')
    if label_num != majority_label
]

df_upsampled = pd.concat([df_majority] + upsampled_minority_classes)

In [None]:
# Number of rows per `label_num` value in the oversampled frame.
label_counts = df_upsampled['label_num'].value_counts()

print(label_counts)

In [None]:
# Split the raw text first, then fit TF-IDF on the training part only.
# Fitting on every row before the split lets validation text shape the
# vocabulary and IDF weights, which leaks information into the features.
# With the same random_state and row count the split indices are unchanged,
# so Y_train / Y_val are the same rows as before.
Y = df_upsampled['label_num']
text_train, text_val, Y_train, Y_val = train_test_split(
    df_upsampled['text'], Y, test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_val = vectorizer.transform(text_val)
# Full feature matrix in the train-fitted vocabulary, kept under its original
# name for any later cell that still refers to `X`.
X = vectorizer.transform(df_upsampled['text'])

# NOTE(review): `df_upsampled` was oversampled with replacement before this
# split, so copies of one row can sit on both sides; validation scores from
# this split are therefore optimistic.

# Persist the fitted vectorizer so inference code can reuse the same vocabulary.
joblib.dump(vectorizer, '/content/drive/My Drive/dataset/tfidf_vectorizer.pkl')

# RoBERTa

In [None]:
# Pretrained Indonesian RoBERTa checkpoint used as the starting point.
model_name = "w11wo/indonesian-roberta-base-sentiment-classifier"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The checkpoint is already fine-tuned for sentiment classification, so its
# classification head has a different number of outputs than the 8 classes
# trained here (3 per its model card — verify). ignore_mismatched_sizes=True
# discards that head and initializes a fresh 8-way one instead of raising a
# size-mismatch error while loading the weights.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=8,
    ignore_mismatched_sizes=True,
)

In [None]:
# Hold out the test rows BEFORE oversampling. Splitting an already-oversampled
# frame puts copies of the same row in both train and test, which inflates
# every evaluation metric. The split is seeded so it is reproducible.
labels_all = df['label_num']
# scikit-learn can only stratify when every class has at least two rows.
stratify_on = labels_all if labels_all.value_counts().min() >= 2 else None
train_df, test_df = train_test_split(
    df[['text', 'label_num']],
    test_size=0.2,
    random_state=42,
    stratify=stratify_on,
)

# Oversample the training rows only: each class is sampled with replacement up
# to the size of the largest training class; the largest class is kept as is.
target_size = train_df['label_num'].value_counts().max()
train_balanced = pd.concat([
    group if len(group) == target_size
    else resample(group, replace=True, n_samples=target_size, random_state=42)
    for _, group in train_df.groupby('label_num')
])

# Same "train"/"test" layout that `Dataset.train_test_split` produced, with the
# label column renamed to the `label` name the Trainer expects.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
                       .rename_column('label_num', 'label')
    for split_name, frame in (('train', train_balanced), ('test', test_df))
})

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of rows for the model.

    Args:
        examples: batch mapping passed by `Dataset.map(batched=True)`; only its
            'text' entry is read.

    Returns:
        The tokenizer output for the batch, truncated to at most 128 tokens.
        No padding is applied here: the `DataCollatorWithPadding` given to the
        Trainer pads each batch to its own longest sequence, which avoids
        padding every row to 128 tokens up front.
    """
    return tokenizer(examples['text'], truncation=True, max_length=128)

# Apply the tokenizer to every split in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Padding collator built from the tokenizer; passed to the Trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Fine-tuning hyperparameters; checkpoints and logs are written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate on the eval dataset at the end of every epoch. `eval_strategy`
    # is the current name of this option; the older `evaluation_strategy`
    # spelling is deprecated.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Wire the model, data splits, tokenizer and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
# Run fine-tuning with the configuration held by `trainer`.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval dataset and show the returned metrics.
results = trainer.evaluate()
print(results)

In [None]:

# Predict on the held-out split and take the highest-scoring class per row.
predictions = trainer.predict(tokenized_datasets['test'])

predicted_labels = np.argmax(predictions.predictions, axis=1)
true_labels = tokenized_datasets['test']['label']

# Use one explicit label list for both the matrix and the tick labels.
# confusion_matrix orders rows/columns by the union of true and predicted
# labels, so ticks taken from the true labels alone can be misaligned when a
# class is predicted but never occurs in the test labels.
class_labels = np.unique(np.concatenate([np.asarray(true_labels), predicted_labels]))

conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=class_labels)

class_report = classification_report(true_labels, predicted_labels)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels,
            ax=ax)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix')
plt.show()

print("\nClassification Report:")
print(class_report)