In [None]:
!pip install emoji transformers torch

In [None]:
import pandas as pd
import emoji
import torch
from transformers import pipeline
from tqdm import tqdm

# --- Run configuration -------------------------------------------------------
# Path of the CSV that is read, and path of the CSV that is written.
INPUT_FILE = "final_data.csv"
OUTPUT_FILE = "labeled_kenyan_comments.csv"

# How many comments are handed to the classifier per call.
# Lower this (e.g. to 8) if the GPU runs out of memory.
BATCH_SIZE = 16

# The six mutually exclusive categories the zero-shot classifier chooses from.
CANDIDATE_LABELS = [
    "neutral or positive",
    "constructive criticism",
    "offensive language",
    "cyberbullying",
    "hate speech",
    "irony",
]

def process_and_label():
    """Label every comment in INPUT_FILE with a zero-shot classifier and save it.

    Reads the CSV at ``INPUT_FILE``, replaces emojis in its ``text`` column
    with their textual names, classifies each non-blank row against
    ``CANDIDATE_LABELS`` (single-label), and writes the original columns plus
    ``predicted_label`` and ``confidence_score`` to ``OUTPUT_FILE``.

    Rows whose text is missing or blank are not sent to the model; they keep
    an empty ``predicted_label`` and a NaN ``confidence_score``.

    Raises:
        KeyError: if the input file has no ``text`` column.
    """
    # Load data
    print("Loading dataset...")
    df = pd.read_csv(INPUT_FILE)
    if 'text' not in df.columns:
        raise KeyError(f"'text' column not found in {INPUT_FILE}")

    # Pre-processing: fill missing text and convert emojis to words.
    # fillna must run BEFORE astype(str): casting first turns NaN into the
    # literal string 'nan', which would then be classified as a real comment.
    df['text'] = df['text'].fillna('').astype(str)
    print("Converting emojis to text descriptions...")
    df['processed_text'] = df['text'].apply(
        lambda x: emoji.demojize(x, delimiters=(" ", " "))
    )

    # Initialize the pipeline.
    # mDeBERTa-v3-base-mnli-xnli is a multilingual NLI model, which is what
    # the zero-shot-classification pipeline needs.
    device = 0 if torch.cuda.is_available() else -1
    print(f"Initializing model on {'GPU' if device == 0 else 'CPU'}...")

    classifier = pipeline(
        "zero-shot-classification",
        model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
        device=device
    )

    # 2. Batch processing
    texts = df['processed_text'].tolist()

    # Only rows that actually contain text are classified: a blank string has
    # nothing to label, and the pipeline may reject empty sequences outright.
    to_classify = [idx for idx, text in enumerate(texts) if text.strip()]

    # Pre-fill with "no prediction" so skipped rows stay aligned with df.
    results_labels = [None] * len(texts)
    results_scores = [float('nan')] * len(texts)

    print(f"Starting classification for {len(texts)} rows...")
    # Process in chunks so progress is visible and memory use stays bounded.
    for start in tqdm(range(0, len(to_classify), BATCH_SIZE)):
        batch_idx = to_classify[start:start + BATCH_SIZE]
        batch = [texts[idx] for idx in batch_idx]

        # multi_label=False makes the scores compete, so index 0 is the
        # single most likely category.
        # batch_size is forwarded so the model really runs batched forward
        # passes; without it the pipeline processes one input at a time.
        outputs = classifier(
            batch,
            CANDIDATE_LABELS,
            multi_label=False,
            batch_size=BATCH_SIZE,
        )

        # Some pipeline versions return a bare dict for a one-element list.
        if isinstance(outputs, dict):
            outputs = [outputs]

        # Results are sorted by score, so position 0 is the top prediction.
        for idx, out in zip(batch_idx, outputs):
            results_labels[idx] = out['labels'][0]
            results_scores[idx] = out['scores'][0]

    # 3. Save results
    df['predicted_label'] = results_labels
    df['confidence_score'] = results_scores

    # Drop the temporary processed_text column before saving
    df = df.drop(columns=['processed_text'])

    df.to_csv(OUTPUT_FILE, index=False)
    print(f"Success! Labeled data saved to {OUTPUT_FILE}")

# Run the labeling job only when this code is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    process_and_label()

RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
Descriptors cannot be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates