In [None]:
from tqdm.notebook import tqdm
import os
import pandas as pd
import torch
import numpy as np
import argparse
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import csv


# Label vocabulary of the classifier.
# Order is significant: predict_labels_for_long_text pairs these entries
# positionally with the per-label scores produced by the model.
all_labels = [
    'cleanliness',
    'cozy_atmosphere',
    'delicious_food',
    'dirty',
    'fresh_ingredients',
    'friendly_staff',
    'good_value',
    'low_quality_ingredients',
    'negative',
    'noisy_environment',
    'overcooked',
    'overpriced',
    'poor_taste',
    'positive',
    'professional_service',
    'rude_staff',
    'slow_service',
    'spoiled',
    'unhygienic',
    'unprofessional_service',
]

# Run the model on CUDA when it is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

if device == "cuda":
    print("GPU will be used")

print(" Verwende Gerät:", device)

# Function: multi-label prediction for arbitrarily long text via chunking.
def predict_labels_for_long_text(text, chunk_size=256, threshold=0.75):
    """Return every label predicted for at least one chunk of *text*.

    The text is tokenized without truncation, split into consecutive token
    windows, and each window is classified on its own. A label is reported
    when its sigmoid score exceeds *threshold* in any window.

    Reads the module-level ``tokenizer``, ``model``, ``device`` and
    ``all_labels``; ``tokenizer`` and ``model`` must have been loaded first.

    Args:
        text: Review text. Anything that is not a non-blank string (for
            example the NaN pandas produces for a missing cell) yields no
            labels instead of an exception.
        chunk_size: Maximum model input length per chunk, special tokens
            included.
        threshold: Score a label must exceed to count as predicted.

    Returns:
        List of label names, in ``all_labels`` order.
    """
    # Missing or blank text cannot be tokenized into any content.
    if not isinstance(text, str) or not text.strip():
        return []

    # Tokenize content only: the special tokens are added again when each
    # chunk is re-encoded below, so they must not occupy chunk slots here.
    input_ids = tokenizer(text, truncation=False, add_special_tokens=False)["input_ids"]

    # Leave room for the special tokens; otherwise re-encoding a full chunk
    # with max_length=chunk_size silently truncates its last tokens.
    content_size = max(1, chunk_size - tokenizer.num_special_tokens_to_add(pair=False))

    chunk_hits = []
    for start in range(0, len(input_ids), content_size):
        chunk_text = tokenizer.decode(
            input_ids[start:start + content_size], skip_special_tokens=True
        )

        # truncation=True stays as a safety net in case decoding and
        # re-encoding yields a few more tokens than the original window.
        inputs = tokenizer(
            chunk_text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=chunk_size,
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}  # move to the model's device

        with torch.no_grad():
            logits = model(**inputs).logits

        # Independent sigmoid per label; [0] selects the single batch entry.
        probs = torch.sigmoid(logits).cpu().numpy()[0]
        chunk_hits.append(probs > threshold)

    if not chunk_hits:
        return []

    # Aggregation: a label that fires in any chunk is assigned to the text.
    final_hits = np.any(np.array(chunk_hits), axis=0)
    return [label for label, hit in zip(all_labels, final_hits) if hit]

# Main entry point: label every review that is not yet in the output CSV.
def main():
    """Classify all pending reviews and append their labels to the output CSV.

    Loads the model and tokenizer into the module-level ``model`` and
    ``tokenizer`` names, reads the reviews CSV (columns ``review_id`` and
    ``text``), skips review ids already present in the output file, and
    writes one ``review_id,label`` row per predicted label.

    NOTE: reviews that receive no label produce no output row, so they are
    not recognised as processed and are classified again on a later resume.
    """
    input_file = "/kaggle/input/dish-dash/reviews/reviews.csv"
    output_file = "/kaggle/working/review_label.csv"

    # Load model & tokenizer into globals used by predict_labels_for_long_text.
    global tokenizer, model
    MODEL_PATH = "/kaggle/input/dish-dash/final_model_70k/final_model_70k"

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH).to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

    # Load the complete input CSV.
    df = pd.read_csv(input_file)

    # Decide whether we are resuming BEFORE opening the output file: opening
    # it creates it, after which an existence check can no longer tell a
    # fresh run (header needed) from a resumed one.
    resume = os.path.exists(output_file) and os.path.getsize(output_file) > 0

    processed_ids = set()
    if resume:
        # Read without assuming a header row so that files written without
        # one are still readable; a header row, if present, shows up as the
        # literal id 'review_id' and is discarded.
        existing_df = pd.read_csv(
            output_file, header=None, names=['review_id', 'label'], dtype=str
        )
        processed_ids = set(existing_df['review_id']) - {'review_id'}
        print(f" Already processed: {len(processed_ids)} reviews")

    total = len(df)
    # Two progress bars: overall progress and reviews that got no label.
    progress_bar = tqdm(total=total, desc="Processed")
    skipped_bar = tqdm(total=0, desc="Skipped", position=1)

    try:
        # Compare ids as strings on both sides; processed_ids holds strings,
        # so a numeric review_id column would otherwise never match.
        df = df[~df['review_id'].astype(str).isin(processed_ids)]
        # Advance by the number of rows actually filtered out.
        progress_bar.update(total - len(df))

        with open(output_file, mode='a' if resume else 'w', newline='', encoding='utf-8') as out_csv:
            writer = csv.writer(out_csv)
            if not resume:  # write the header only once
                writer.writerow(['review_id', 'label'])

            for review_id, text in zip(df['review_id'], df['text']):
                # Missing text arrives as NaN (a float); treat it as a
                # review without labels rather than aborting the whole run.
                labels = predict_labels_for_long_text(text) if isinstance(text, str) else []

                # Store every label as its own row.
                for label in labels:
                    writer.writerow([review_id, label])

                if not labels:
                    skipped_bar.total += 1
                    skipped_bar.update(1)
                    skipped_bar.refresh()

                progress_bar.update(1)
    finally:
        # Close the bars even when a prediction or write fails.
        progress_bar.close()
        skipped_bar.close()


# In a notebook cell __name__ is "__main__", so this still runs on execution
# while keeping the definitions importable without side effects.
if __name__ == "__main__":
    main()

🔁 Already processed: 3425524 reviews


Processed:   0%|          | 0/3431734 [00:00<?, ?it/s]

Skipped: 0it [00:00, ?it/s]