In [None]:
from google.colab import drive

# Mount Google Drive so the dataset archive and the checkpoint folder used by
# the later cells are reachable under this path.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import zipfile
import pandas as pd

zip_path = "/content/drive/MyDrive/dataset.zip"

# Archive members to load, in dataset order. The names must match the
# entries printed by namelist() below exactly (including any folder prefix).
csv_members = (
    "comments1.csv",
    "comments2.csv",
    "comments3.csv",
    "comments4.csv",
    "comments5.csv",
)

loaded_frames = []
with zipfile.ZipFile(zip_path, 'r') as archive:
    # Show what is inside the zip so the member names can be verified.
    print(archive.namelist())

    # Read each CSV straight from the archive without extracting it to disk.
    for member in csv_members:
        with archive.open(member) as handle:
            loaded_frames.append(pd.read_csv(handle))

comments_1, comments_2, comments_3, comments_4, comments_5 = loaded_frames


In [None]:
comment_frames = [comments_1, comments_2, comments_3, comments_4, comments_5]

# Tag every frame (in place) with the 1-based number of the file it came from
# so the origin of each row survives the concatenation.
for source_number, frame in enumerate(comment_frames, start=1):
    frame["dataset_id"] = source_number

# Stack the five frames into one with a fresh 0..n-1 index.
df_all_comments = pd.concat(comment_frames, ignore_index=True)

print(df_all_comments.head())

In [None]:
import os

import pandas as pd
import torch
from datasets import Dataset
from transformers import pipeline

# ==========================================
# 1. Initialize Faster Zero-Shot Classifier
# ==========================================
# A hard-coded device=0 always selects the first CUDA GPU and fails on a
# CPU-only runtime, so choose the device from what is actually available
# (-1 tells the pipeline to run on CPU).
classifier = pipeline(
    "zero-shot-classification",
    model="typeform/distilbert-base-uncased-mnli",  # Faster DistilBERT model
    device=0 if torch.cuda.is_available() else -1,
)

# Candidate categories scored against every comment.
labels = ["skincare", "makeup", "fragrance", "haircare"]

# ==========================================
# 2. Batch Processing with Datasets + Checkpoints
# ==========================================
def classify_comments(
    df,
    text_column="textOriginal",
    batch_size=128,
    save_every=1000,   # Save every 1000 rows for efficiency
    checkpoint_dir="/content/drive/MyDrive/checkpoints/"
):
    """Zero-shot classify the comments in ``df`` with periodic CSV checkpoints.

    Rows whose ``text_column`` is NaN or whitespace-only are dropped. The
    remaining rows are passed, ``batch_size`` rows at a time, to the
    module-level ``classifier`` with the module-level ``labels`` as candidate
    labels; the top label and its score are stored in the
    ``predicted_category`` and ``confidence`` columns.

    If ``classified_comments.csv`` already exists in ``checkpoint_dir`` the
    run resumes after the rows it contains. This assumes the checkpoint was
    produced from the same ``df`` in the same row order.

    Args:
        df: DataFrame holding the comments; it is not modified.
        text_column: Name of the column containing the comment text.
        batch_size: Number of rows classified per step.
        save_every: Write a checkpoint once at least this many newly
            classified rows have accumulated since the last write.
        checkpoint_dir: Directory for the checkpoint CSV (created if missing).

    Returns:
        DataFrame with the kept rows plus ``predicted_category`` and
        ``confidence``; the same data is written to the checkpoint file.
    """
    # Create checkpoint folder if it doesn't exist
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_file = os.path.join(checkpoint_dir, "classified_comments.csv")

    # Drop empty/NaN comments first to save runtime. The index is reset so the
    # filtered frame has a plain RangeIndex again; otherwise from_pandas would
    # store the old index as an extra "__index_level_0__" column that leaks
    # into the results and the checkpoint.
    has_text = df[text_column].notna() & (df[text_column].str.strip() != "")
    df = df[has_text].reset_index(drop=True)

    # Convert to Hugging Face Dataset
    dataset = Dataset.from_pandas(df, preserve_index=False)
    total = len(dataset)

    # Resume from checkpoint if available
    if os.path.exists(checkpoint_file):
        print(f"Resuming from checkpoint: {checkpoint_file}")
        df_partial = pd.read_csv(checkpoint_file)
        start_idx = len(df_partial)
    else:
        df_partial = pd.DataFrame(columns=list(df.columns) + ["predicted_category", "confidence"])
        start_idx = 0

    def classify_batch(batch):
        """Add the top label and its score to one batch of rows."""
        preds = classifier(batch[text_column], candidate_labels=labels, truncation=True)
        # Some transformers versions return a bare dict for a one-item list.
        if isinstance(preds, dict):
            preds = [preds]
        batch["predicted_category"] = [p["labels"][0] for p in preds]
        batch["confidence"] = [p["scores"][0] for p in preds]
        return batch

    def merge_pending(done, pending):
        """Return ``done`` with the pending batch frames appended."""
        if not pending:
            return done
        # Leave an empty seed frame out of the concat; mixing in empty frames
        # is deprecated in recent pandas and can upcast dtypes.
        frames = list(pending) if done.empty else [done, *pending]
        return pd.concat(frames, ignore_index=True)

    # Classified batches are buffered here and merged only when a checkpoint
    # is written, instead of re-copying the whole result on every batch.
    pending = []
    rows_since_save = 0

    # Process dataset in batches
    for batch_no, i in enumerate(range(start_idx, total, batch_size)):
        stop = min(i + batch_size, total)
        batch = dataset.select(range(i, stop))
        batch_result = batch.map(classify_batch, batched=True, batch_size=batch_size)

        pending.append(batch_result.to_pandas())
        rows_since_save += stop - i

        # Save checkpoint every N rows. Counting rows (rather than dividing
        # save_every by batch_size) also works when save_every < batch_size.
        if rows_since_save >= save_every:
            df_partial = merge_pending(df_partial, pending)
            pending = []
            rows_since_save = 0
            df_partial.to_csv(checkpoint_file, index=False)
            print(f"✅ Checkpoint saved at {len(df_partial)} rows")

        if batch_no % 50 == 0:
            # Report the true end of the batch so the count never exceeds total.
            print(f"Processed {stop}/{total} comments")

    # Final save
    df_partial = merge_pending(df_partial, pending)
    df_partial.to_csv(checkpoint_file, index=False)
    print(f"🎉 Final results saved to {checkpoint_file}")

    return df_partial

# ==========================================
# 3. Run Classification
# ==========================================
# Settings for this run, gathered in one place so they are easy to tweak.
run_options = {
    "text_column": "textOriginal",
    "batch_size": 128,
    "save_every": 5000,   # Save every ~5000 rows
    "checkpoint_dir": "/content/drive/MyDrive/checkpoints/",
}

result_df = classify_comments(df_all_comments, **run_options)

# Preview results
print(result_df.head(10))
