In [None]:
!pip install transformers torch pandas scikit-learn




In [None]:
import sys
import os
import glob
import re
import pandas as pd
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification
import matplotlib.pyplot as plt


In [None]:
# --- Data Loader ---
def load_data(data_path):
    """Read every ``*.csv`` file directly inside ``data_path`` into one DataFrame.

    Files are read in sorted (lexicographic) path order so that the row order
    of the result -- and therefore any seeded sampling or splitting done on
    it later -- is the same on every run and every filesystem.

    Args:
        data_path: Directory that is searched (non-recursively) for CSV files.

    Returns:
        A single DataFrame with a fresh 0..n-1 index holding the rows of all
        readable CSV files. An empty DataFrame is returned when the path does
        not exist, contains no CSV files, or none of them could be read.
    """
    if not os.path.exists(data_path):
        print(f"Warning: Data path {data_path} does not exist.")
        return pd.DataFrame()

    # glob yields matches in arbitrary, filesystem-dependent order; sort them
    # to keep the concatenated row order deterministic.
    csv_files = sorted(glob.glob(os.path.join(data_path, '*.csv')))

    if not csv_files:
        print(f"Warning: No CSV files found in {data_path}")
        return pd.DataFrame()

    df_list = []
    for csv_file in csv_files:
        try:
            df_list.append(pd.read_csv(csv_file))
        except Exception as e:
            # Best effort: report the unreadable file and keep loading the rest.
            print(f"Error reading {csv_file}: {e}")

    if not df_list:
        return pd.DataFrame()

    return pd.concat(df_list, ignore_index=True)

def clean_initial_data(df, emotion_columns=None):
    """Drop rows with empty text or missing/non-numeric emotion labels.

    Args:
        df: Raw DataFrame. A ``text`` column is cleaned when present.
        emotion_columns: Label columns to validate. Defaults to the eight
            emotion names listed below; columns that are absent from ``df``
            are ignored.

    Returns:
        ``df`` itself when it is empty or contains none of the label columns;
        otherwise a cleaned copy in which
          * rows whose ``text`` is NaN or whitespace-only are removed, and
          * every present label column is converted with ``pd.to_numeric``
            (unparseable values become NaN) and rows with a NaN label are
            removed.
    """
    if df.empty:
        return df

    if emotion_columns is None:
        emotion_columns = ['anger', 'confusion', 'disgust', 'fear', 'joy', 'love', 'sadness', 'surprise']
    available_cols = [col for col in emotion_columns if col in df.columns]

    if not available_cols:
        return df

    df_clean = df.copy()

    # 1. Remove rows with missing or blank text.
    if 'text' in df_clean.columns:
        df_clean = df_clean.dropna(subset=['text'])
        # .copy() so the column assignments below write to an independent
        # frame instead of a filtered view (avoids chained-assignment warnings).
        df_clean = df_clean[df_clean['text'].str.strip() != ''].copy()

    # 2. Remove rows with missing labels (these caused a NaN validation loss).
    print(f"Sebelum cleaning label: {len(df_clean)} baris")

    # Make sure labels are numeric; anything unparseable becomes NaN.
    for col in available_cols:
        df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

    # Drop rows where any label is NaN.
    df_clean = df_clean.dropna(subset=available_cols)
    print(f"Setelah cleaning label: {len(df_clean)} baris")

    return df_clean


In [None]:
# --- Preprocessor ---
class TextPreprocessor:
    """Holds the BERT tokenizer and normalises emoticons in raw text.

    Known emoticons are replaced by the placeholder tokens ``{happy_face}``,
    ``{sad_face}`` and ``{flat_face}``; any remaining emoticon-like sequence
    matched by ``emoticon_pattern`` is removed.
    """

    def __init__(self, model_name='bert-base-uncased'):
        """Load the tokenizer for ``model_name`` and set up the emoticon tables."""
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.happy_emoticons = [':)', ': )', ':-)', '=)', ':]', ';)', ';-)']
        self.sad_emoticons = [':(', ': (', ': [', '= [', ':[', ':-(', '=(', '; [', ';(']
        self.flat_emoticons = [':/', ':|']
        # Left-over emoticons: eyes, optional nose, optional d/p/slash, bracket mouth.
        #  * A letter "x"/"X" only counts as eyes when it does not end a word;
        #    otherwise "six (6)" or "relax {happy_face}" would lose characters.
        #  * The mouth must not be the opening brace of one of our own
        #    placeholders, so already-inserted tokens are never damaged.
        self.emoticon_pattern = re.compile(
            r'(?:[:;=]|(?<!\w)[xX])-?[dDpP/\\]?\s*'
            r'(?!\{(?:happy|sad|flat)_face\})[()\[\]{}|]'
        )
        # ":/" face that is not the start of a URL scheme separator "://".
        self._slash_face = re.compile(r':/(?!/)')

    def replace_emoticons(self, text):
        """Return ``text`` with emoticons replaced by placeholders or removed.

        Non-string input (e.g. NaN) yields an empty string.
        """
        if not isinstance(text, str):
            return ""
        for emo in self.happy_emoticons:
            text = text.replace(emo, '{happy_face}')
        for emo in self.sad_emoticons:
            text = text.replace(emo, '{sad_face}')
        for emo in self.flat_emoticons:
            if emo == ':/':
                # Keep "http://..." intact instead of turning it into
                # "http{flat_face}/...".
                text = self._slash_face.sub('{flat_face}', text)
            else:
                text = text.replace(emo, '{flat_face}')
        # Strip whatever emoticon-like sequences were not in the lists above.
        text = self.emoticon_pattern.sub('', text)
        return text

    def preprocess(self, df, text_column='text'):
        """Return a copy of ``df`` with a ``text_processed`` column added.

        ``text_processed`` is ``replace_emoticons`` applied to ``text_column``.
        """
        df = df.copy()
        df['text_processed'] = df[text_column].apply(self.replace_emoticons)
        return df


In [None]:
# --- Model & Dataset ---
class EmotionDataset(Dataset):
    """Torch dataset pairing tokenizer encodings with multi-label targets."""

    def __init__(self, encodings, labels):
        """Keep the encodings as given and store ``labels.values`` as targets."""
        self.encodings = encodings
        self.labels = labels.values

    def __len__(self):
        """Number of label rows."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with float ``labels``."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        # Targets are stored as float tensors.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

def get_model(num_labels, model_name='bert-base-uncased'):
    """Build a BERT sequence classifier configured for multi-label output.

    Args:
        num_labels: Number of output labels (one per emotion column).
        model_name: Checkpoint name or path passed to ``from_pretrained``.
            Defaults to ``'bert-base-uncased'``, the same default used by
            ``TextPreprocessor`` for its tokenizer.

    Returns:
        The ``BertForSequenceClassification`` instance created with
        ``problem_type="multi_label_classification"``.
    """
    return BertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        problem_type="multi_label_classification"
    )


In [None]:
# --- Training Logic ---
# Emotion label columns the classifier is trained on (when present in the data).
EMOTION_COLUMNS = ['anger', 'confusion', 'fear', 'joy', 'sadness']

def train_model(data_path, output_dir, epochs=3, sample_size=None):
    """Fine-tune BERT for multi-label emotion classification on CSV data.

    Pipeline: load CSVs -> clean -> optional sampling -> emoticon
    preprocessing -> 80/20 train/validation split -> tokenization ->
    ``Trainer`` fine-tuning -> save model and tokenizer to ``output_dir``.

    Args:
        data_path: Directory containing the training CSV files.
        output_dir: Directory for checkpoints, logs and the final model.
        epochs: Number of training epochs.
        sample_size: If truthy, train on at most this many randomly sampled
            rows (seeded) for a quick test run; ``None`` uses all rows.

    Returns:
        The ``TextPreprocessor`` used for training, or ``None`` when training
        could not start (no data, no known emotion columns, no ``text``
        column, or too few usable rows).
    """
    print("Loading data...")

    df = load_data(data_path)
    if df.empty:
        print(f"No data found in {data_path}. Please upload your CSV files (goemotions_*.csv).")
        return None

    df = clean_initial_data(df)

    available_emotions = [col for col in EMOTION_COLUMNS if col in df.columns]
    if not available_emotions:
        print(f"Error: None of the expected emotion columns {EMOTION_COLUMNS} found in dataset.")
        return None

    if 'text' not in df.columns:
        print("Error: The dataset has no 'text' column.")
        return None

    if sample_size:
        print(f"Sampling {sample_size} rows for debugging...")
        df = df.sample(n=min(sample_size, len(df)), random_state=42)

    # train_test_split needs at least one row on each side of the split.
    if len(df) < 2:
        print(f"Error: Only {len(df)} usable row(s) left after cleaning; need at least 2 to train.")
        return None

    print("Preprocessing...")
    preprocessor = TextPreprocessor()
    df = preprocessor.preprocess(df)

    X = df['text_processed']
    y = df[available_emotions]

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    print("Tokenizing...")
    train_encodings = preprocessor.tokenizer(
        X_train.tolist(), truncation=True, padding=True, max_length=128
    )
    val_encodings = preprocessor.tokenizer(
        X_val.tolist(), truncation=True, padding=True, max_length=128
    )

    train_dataset = EmotionDataset(train_encodings, y_train)
    val_dataset = EmotionDataset(val_encodings, y_val)

    print("Initializing model...")
    model = get_model(len(available_emotions))

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=32,  # raised from 16; try 64 if it fits in memory
        per_device_eval_batch_size=32,
        # Mixed precision speeds up GPUs such as the T4, but requesting it
        # without CUDA fails, so enable it only when a GPU is present.
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=2,        # load batches in parallel on the CPU
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        eval_strategy="epoch",  # Updated from evaluation_strategy
        save_strategy="epoch",
        logging_dir=os.path.join(output_dir, 'logs'),
        save_total_limit=2,          # save disk space: keep at most 2 checkpoints
        # Disable experiment-tracker integrations; otherwise an installed
        # wandb stops the run with an interactive login prompt.
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    print("Starting training...")
    trainer.train()

    print("Saving model...")
    model.save_pretrained(output_dir)
    preprocessor.tokenizer.save_pretrained(output_dir)
    print("Training complete.")
    return preprocessor


# Instructions
# 1. Run the first cell to install dependencies.
# 2. Upload your 'goemotions_*.csv' files via the 'Files' tab on the left (they will appear in /content/).
# 3. Run every cell above, in order, to import the libraries and define the functions and classes.
# 4. Run the training cell below.


In [None]:
# Input and output locations
data_dir = '/content'  # Colab's default working directory (where uploads land)
output_dir = '/content/models'

# Kick off training.
# Use sample_size=100 for a quick smoke test; None trains on all rows.
preprocessor = train_model(
    data_path=data_dir,
    output_dir=output_dir,
    epochs=3,
    sample_size=None,
)


Loading data...
Preprocessing...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokenizing...
Initializing model...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose "Don't visualize my results"


Epoch,Training Loss,Validation Loss
1,0.0997,0.09864


Epoch,Training Loss,Validation Loss
1,0.0997,0.09864
2,0.0865,0.099315


In [None]:
# Zip the saved model directory and offer it as a browser download.
import shutil

if os.path.exists(output_dir):
    shutil.make_archive('emotion_model', 'zip', output_dir)
    print("Model zipped. Downloading...")
    try:
        # Imported here because google.colab only exists inside a Colab
        # runtime; elsewhere the archive is still created and the fallback
        # message below is shown instead of the cell crashing.
        from google.colab import files
        files.download('emotion_model.zip')
    except Exception as e:
        print(f"Auto-download failed ({e}). Please download 'emotion_model.zip' from the files tab.")
else:
    print(f"Nothing to zip: {output_dir} does not exist. Run the training cell first.")
