# Zero-Shot Classification for Data Labeling

In [1]:
# Use the %pip magic (not !pip) so the packages are installed into the
# environment of the kernel that is running this notebook.
%pip install transformers tqdm pandas --quiet

In [3]:
# import the required libraries
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

# read the unlabeled tweets into a DataFrame
safaricom_df = pd.read_csv('unlabeled_data_v2.csv')

# build the multilingual zero-shot classification pipeline on the
# joeddav/xlm-roberta-large-xnli checkpoint
classifier = pipeline(task="zero-shot-classification", model="joeddav/xlm-roberta-large-xnli")

# candidate categories offered to the classifier for every tweet
labels = [
    "Customer care complaint", "MPESA complaint",
    "Network reliability problem", "Internet or airtime bundle complaint",
    "Data protection and privacy concern",
    "Neutral", "Hate Speech",
]

# apply the classifier to each tweet
def classify_tweet(text):
    """Return the top zero-shot label for a single tweet.

    Parameters
    ----------
    text : str
        Raw tweet text (one cell of the ``Content`` column).

    Returns
    -------
    str
        The first entry of the ``labels`` list returned by ``classifier``,
        or ``"Unclassified"`` when the text is missing/blank or the
        pipeline raises.
    """
    # Missing cells arrive as NaN/None and blank strings carry nothing to
    # classify; skip them up front instead of triggering (and printing) a
    # pipeline error for every such row.
    if not isinstance(text, str) or not text.strip():
        return "Unclassified"

    try:
        result = classifier(text, labels, multi_label=False)
        return result['labels'][0]  # top predicted label
    except Exception as e:
        # Best-effort labeling: report the failure and keep going so one bad
        # tweet does not abort the whole run.
        print(f"Error processing tweet: {e}")
        return "Unclassified"

# register progress_apply on pandas objects so the run shows a progress bar
tqdm.pandas()

# label every tweet, then attach the predictions as a new column
predicted_labels = safaricom_df['Content'].progress_apply(classify_tweet)
safaricom_df['Labels'] = predicted_labels

# write the labeled dataset to disk without the index column
safaricom_df.to_csv('labeled_data_v5.csv', index=False)

config.json:   0%|          | 0.00/734 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cuda:0
  return forward_call(*args, **kwargs)
100%|██████████| 6146/6146 [12:24<00:00,  8.25it/s]


In [4]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the kernel that is running this notebook.
%pip install openai



In [5]:
import pandas as pd
from openai import OpenAI
from tqdm import tqdm
import os

# The OpenAI client reads its credentials from the OPENAI_API_KEY environment
# variable. Set it OUTSIDE the notebook so the secret never lands in the file:
#   Linux/macOS:          export OPENAI_API_KEY="your-api-key-here"
#   Windows (cmd):        set OPENAI_API_KEY=your-api-key-here
#   Windows (PowerShell): $env:OPENAI_API_KEY = "your-api-key-here"
# Do not assign the key here: a hardcoded value overwrites the real key from
# the environment and exposes the secret to anyone who can read the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before running this cell."
    )

# Initialize the OpenAI client.
# Failures are raised rather than handled with exit(): exit() ends the
# interpreter/kernel session and throws away all notebook state, whereas an
# exception stops just this cell and shows the cause.
try:
    client = OpenAI()
except Exception as e:
    raise RuntimeError(
        f"Error initializing OpenAI client. Make sure your API key is set: {e}"
    ) from e

# load the dataset
try:
    safaricom_df = pd.read_csv('unlabeled_data_v2.csv')
except FileNotFoundError as e:
    raise FileNotFoundError(
        "Error: 'unlabeled_data_v2.csv' not found. Please ensure the file is in the same directory."
    ) from e

# candidate categories the model has to choose between
labels = [
    "Customer care complaint", "MPESA complaint",
    "Network reliability problem", "Internet or airtime bundle complaint",
    "Data protection and privacy concern",
    "Neutral", "Hate Speech",
]

# wrap every category in single quotes and join them into one comma-separated list
labels_string = ", ".join(["'" + label + "'" for label in labels])

# system instructions for the zero-shot classification: list the categories
# and ask for the bare category name as the only output
system_prompt = (
    "\n"
    "You are a zero-shot classifier. Your task is to classify a given tweet "
    f"into one of the following categories: {labels_string}.\n"
    "Read the tweet and choose the single category that best describes its content.\n"
    "\n"
    "Your response should be only the name of the chosen category, nothing else. "
    "Do not provide any explanations or additional text.\n"
)

# apply the classifier to each tweet using the OpenAI API
def classify_tweet(text):
    """
    Classifies a single tweet using an OpenAI model.

    Parameters
    ----------
    text : str
        Raw tweet text (one cell of the ``Content`` column).

    Returns
    -------
    str
        One of the entries of ``labels``, spelled exactly as in that list, or
        ``"Unclassified"`` when the text is missing/blank, the API call
        fails, or the model's reply does not match any known category.
    """
    # Nothing to classify: avoid a pointless (billed) API round-trip.
    if pd.isna(text) or not str(text).strip():
        return "Unclassified"

    user_message = f"Classify the following tweet:\n\n'{text}'"

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message}
            ],
            temperature=0.0,
            max_tokens=50
        )
        # `content` may be None (e.g. a refusal); treat that as an empty reply.
        raw_answer = response.choices[0].message.content or ""

    except Exception as e:
        # Best-effort labeling: report the failure and keep going so one bad
        # request does not abort the whole run.
        print(f"Error processing tweet: {e}")
        return "Unclassified"

    # The prompt lists the categories in single quotes, so the model may echo
    # the quotes, add a trailing period, or change the casing. Normalize the
    # reply and map it back onto the canonical spelling so the output column
    # only ever contains known categories.
    answer = raw_answer.strip().strip("'\"`. ").casefold()
    for label in labels:
        if label.casefold() == answer:
            return label

    print(f"Unexpected model response, marking as Unclassified: {raw_answer!r}")
    return "Unclassified"

# label every tweet through the API while showing a named progress bar
tqdm.pandas(desc="Classifying tweets")
openai_labels = safaricom_df['Content'].progress_apply(classify_tweet)
safaricom_df['Labels'] = openai_labels

# write the labeled dataset to disk without the index column
safaricom_df.to_csv('labeled_data_openai.csv', index=False)

print("\nClassification complete. The labeled data has been saved to 'labeled_data_openai.csv'.")

Classifying tweets: 100%|██████████| 6146/6146 [1:03:30<00:00,  1.61it/s]


Classification complete. The labeled data has been saved to 'labeled_data_openai.csv'.



