In [1]:
# 1. RE-INSTALL LIBRARIES (Required after changing runtime)
!pip install emoji transformers[sentencepiece] torch tqdm

import os
# Fix for the Protobuf error
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

import pandas as pd
import emoji
from transformers import pipeline
import torch
from tqdm import tqdm

def run_full_gpu_labeling():
    # 2. LOAD DATA
    # Ensure you have uploaded 'final_data.csv' to the Colab 'Files' folder again
    if not os.path.exists('final_data.csv'):
        print("ERROR: Please upload 'final_data.csv' to the files tab on the left.")
        return

    df = pd.read_csv('final_data.csv')
    df['text'] = df['text'].astype(str).fillna('')

    # 3. PREPROCESS
    print("Converting emojis...")
    df['clean_text'] = df['text'].apply(lambda x: emoji.demojize(x, delimiters=(" ", " ")))

    # 4. INITIALIZE MODEL ON GPU
    # device=0 tells Hugging Face to use the GPU (T4)
    print("Loading Model onto GPU...")
    model_id = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"
    classifier = pipeline("zero-shot-classification", model=model_id, device=0)

    candidate_labels = [
        "neutral or positive",
        "constructive criticism",
        "offensive language",
        "cyberbullying",
        "hate speech",
        "irony"
    ]

    # 5. FAST BATCH PROCESSING
    # On GPU, we can use a larger batch size (32 or 64) to speed things up
    batch_size = 32
    all_labels = []
    all_scores = []

    texts = df['clean_text'].tolist()
    print(f"Starting classification of {len(texts)} rows on GPU...")

    try:
        for i in tqdm(range(0, len(texts), batch_size)):
            batch = texts[i : i + batch_size]
            results = classifier(batch, candidate_labels, multi_label=False)

            for res in results:
                all_labels.append(res['labels'][0])
                all_scores.append(res['scores'][0])

        # 6. SAVE RESULTS
        df['predicted_label'] = all_labels
        df['confidence_score'] = all_scores

        df.to_csv('final_labeled_kenyan_dataset.csv', index=False)
        print("\nSUCCESS! Download your file 'final_labeled_kenyan_dataset.csv' from the files tab.")

    except Exception as e:
        print(f"\nError: {e}")

if __name__ == "__main__":
    run_full_gpu_labeling()

Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.15.0-py3-none-any.whl (608 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.15.0
Converting emojis...
Loading Model onto GPU...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]



model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/202 [00:00<?, ?it/s]

DebertaV2ForSequenceClassification LOAD REPORT from: MoritzLaurer/mDeBERTa-v3-base-mnli-xnli
Key                             | Status     |  | 
--------------------------------+------------+--+-
deberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

Starting classification of 10055 rows on GPU...


  3%|▎         | 10/315 [00:55<24:37,  4.84s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 315/315 [24:33<00:00,  4.68s/it]



SUCCESS! Download your file 'final_labeled_kenyan_dataset.csv' from the files tab.
