In [1]:
import pandas as pd
from datasets import load_dataset
import numpy as np

# Fetch the Bitext customer-support dataset and turn its train split into a DataFrame
print("Loading Bitext Customer Support Dataset...")
dataset = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset")
df = pd.DataFrame(dataset['train'])

print(f"\nTotal dataset size: {len(df)}")
print(f"\nDataset columns: {df.columns.tolist()}")

# Number of rows per category, largest first
print("\n" + "="*80)
print("CATEGORY DISTRIBUTION")
print("="*80)
category_counts = df['category'].value_counts().sort_values(ascending=False)
print(category_counts)

# Category groups used in the project
deterministic_categories = ['CONTACT', 'INVOICE', 'SHIPPING', 'SUBSCRIPTION', 'CANCEL']
indeterministic_categories = ['ACCOUNT', 'ORDER', 'FEEDBACK']
excluded_categories = ['REFUND', 'PAYMENT', 'DELIVERY']

print("\n" + "="*80)
print("PROJECT CLASSIFICATION")
print("="*80)
# Each entry pairs the printed label (including its leading blank line, where
# one is wanted) with the category list it describes.
for _label, _group in (
    ("Deterministic categories", deterministic_categories),
    ("\nIndeterministic categories", indeterministic_categories),
    ("\nExcluded categories", excluded_categories),
):
    print(f"{_label}: {_group}")
    print(f"  Total examples: {len(df[df['category'].isin(_group)])}")

# Function to display sample queries - shows up to n examples (5 by default)
def show_samples(df, category, n=5):
    """Print a banner and up to ``n`` randomly sampled rows for one category.

    Args:
        df: DataFrame with 'category', 'intent', 'instruction' and
            'response' columns.
        category: Value of the 'category' column to select.
        n: Maximum number of rows to print; fewer are printed when the
            category has fewer rows.

    The sample is drawn with a fixed seed (random_state=42), so repeated
    calls on the same data print the same rows.
    """
    banner = '=' * 80
    in_category = df[df['category'] == category]
    total = len(in_category)

    print(f"\n{banner}")
    print(f"CATEGORY: {category} (Total: {total} examples)")
    print(f"{banner}")

    # Never ask for more rows than exist, otherwise sample() would raise.
    picked = in_category.sample(n=min(n, total), random_state=42)
    for position, (_, record) in enumerate(picked.iterrows(), start=1):
        print(f"\n--- Example {position} ---")
        print(f"Intent: {record['intent']}")
        print(f"Instruction: {record['instruction']}")
        print(f"Response: {record['response']}")
        print()

# Show sample queries for each category group, in the order:
# excluded, included deterministic, included indeterministic.
_sample_sections = [
    ("EXCLUDED CATEGORIES - SAMPLE QUERIES (5 per category)", excluded_categories),
    ("INCLUDED DETERMINISTIC CATEGORIES - SAMPLE QUERIES (5 per category)", deterministic_categories),
    ("INCLUDED INDETERMINISTIC CATEGORIES - SAMPLE QUERIES (5 per category)", indeterministic_categories),
]

for _heading, _group in _sample_sections:
    print("\n" + "="*80)
    print(_heading)
    print("="*80)

    for category in _group:
        show_samples(df, category, n=5)

# Statistical Analysis
print("\n" + "="*80)
print("STATISTICAL COMPARISON")
print("="*80)

def analyze_category_group(df, categories, group_name):
    """Print the size and average text lengths of a group of categories.

    Query lengths are measured on the 'instruction' column, which holds the
    customer's text. The 'intent' column holds an intent label (for example
    ``check_refund_policy``), so measuring it would report the label's length
    rather than the query's.

    Args:
        df: DataFrame with 'category', 'instruction' and 'response' columns.
        categories: Iterable of 'category' values that make up the group.
        group_name: Heading printed above the statistics.

    Returns:
        None. The statistics are written to stdout. If no row matches, the
        averages are printed as ``nan``.
    """
    subset = df[df['category'].isin(categories)]
    queries = subset['instruction']
    responses = subset['response']

    print(f"\n{group_name}:")
    print(f"  Total examples: {len(subset)}")
    print(f"  Avg query length (chars): {queries.str.len().mean():.1f}")
    print(f"  Avg response length (chars): {responses.str.len().mean():.1f}")
    # Word counts use whitespace tokenisation via str.split().
    print(f"  Avg query length (words): {queries.str.split().str.len().mean():.1f}")
    print(f"  Avg response length (words): {responses.str.split().str.len().mean():.1f}")

# Compare the three category groups side by side
analyze_category_group(df, deterministic_categories, "DETERMINISTIC CATEGORIES")
analyze_category_group(df, indeterministic_categories, "INDETERMINISTIC CATEGORIES")
analyze_category_group(df, excluded_categories, "EXCLUDED CATEGORIES")

# Detailed breakdown by individual category
print("\n" + "="*80)
print("PER-CATEGORY STATISTICS")
print("="*80)

for category in deterministic_categories + indeterministic_categories + excluded_categories:
    subset = df[df['category'] == category]
    print(f"\n{category}:")
    print(f"  Count: {len(subset)}")
    # The customer's query text is in 'instruction'; 'intent' is only the
    # intent label, so it must not be used to measure query length.
    print(f"  Avg query length (words): {subset['instruction'].str.split().str.len().mean():.1f}")
    print(f"  Avg response length (words): {subset['response'].str.split().str.len().mean():.1f}")

print("\n" + "="*80)
print("ANALYSIS COMPLETE")
print("="*80)

Loading Bitext Customer Support Dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

Bitext_Sample_Customer_Support_Training_(…):   0%|          | 0.00/19.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26872 [00:00<?, ? examples/s]


Total dataset size: 26872

Dataset columns: ['flags', 'instruction', 'category', 'intent', 'response']

CATEGORY DISTRIBUTION
category
ACCOUNT         5986
ORDER           3988
REFUND          2992
CONTACT         1999
INVOICE         1999
PAYMENT         1998
FEEDBACK        1997
DELIVERY        1994
SHIPPING        1970
SUBSCRIPTION     999
CANCEL           950
Name: count, dtype: int64

PROJECT CLASSIFICATION
Deterministic categories: ['CONTACT', 'INVOICE', 'SHIPPING', 'SUBSCRIPTION', 'CANCEL']
  Total examples: 7917

Indeterministic categories: ['ACCOUNT', 'ORDER', 'FEEDBACK']
  Total examples: 11971

Excluded categories: ['REFUND', 'PAYMENT', 'DELIVERY']
  Total examples: 6984

EXCLUDED CATEGORIES - SAMPLE QUERIES (5 per category)

CATEGORY: REFUND (Total: 2992 examples)

--- Example 1 ---
Intent: check_refund_policy
Instruction: I need to check in which cases can I ask for a reimbursement
Response: Of course! I completely understand your need to familiarize yourself with the cas