In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the input directory,
# in the order os.walk yields them, then print one path per line.
input_file_paths = (
    os.path.join(root, file_name)
    for root, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/win25-stat-528-kaggle-competition-1/sample.csv
/kaggle/input/win25-stat-528-kaggle-competition-1/train.csv
/kaggle/input/win25-stat-528-kaggle-competition-1/test.csv


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from transformers import ElectraTokenizer, ElectraForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import time
from sklearn.metrics import precision_score, recall_score, f1_score  # Import evaluation metrics

# **Set device**
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# **Load dataset**
train_df = pd.read_csv('/kaggle/input/win25-stat-528-kaggle-competition-1/train.csv')
test_df = pd.read_csv('/kaggle/input/win25-stat-528-kaggle-competition-1/test.csv')

# Preprocessing: combine 'description' and 'ingredients' into a single 'text'
# column (the 'name' column is not part of the model input).
# Missing values are replaced with '' before the string cast; a bare
# astype(str) would turn NaN into the literal word "nan" and feed that to the model.
for frame in (train_df, test_df):
    frame['text'] = (
        frame['description'].fillna('').astype(str)
        + ' '
        + frame['ingredients'].fillna('').astype(str)
    )

# **Load ELECTRA tokenizer**
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')

# **Dataset class for tokenization**
class RecipeDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Args:
        texts: Sequence of strings, indexed by integer position.
        labels: Optional sequence of integer class labels, one per text.
            When ``None`` the items carry no label.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding='max_length', truncation=True,
            max_length=max_length, return_tensors="pt")``; its result must
            expose ``'input_ids'`` and ``'attention_mask'`` tensors.
            Required: the ``None`` default exists only to keep the
            signature stable and is rejected at construction time.
        max_length: Length passed to the tokenizer for padding/truncation.

    Raises:
        ValueError: If ``tokenizer`` is ``None`` or if ``labels`` is given
            with a different length than ``texts``.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=256):
        # Fail here rather than with "'NoneType' object is not callable"
        # on the first item fetched during training.
        if tokenizer is None:
            raise ValueError("RecipeDataset requires a tokenizer, got None")
        # A mismatch would otherwise surface as an index error mid-epoch
        # (or go unnoticed when there are more labels than texts).
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels must have the same length "
                f"({len(texts)} != {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx``.

        Returns:
            ``(input_ids, attention_mask, label)`` when labels were supplied,
            otherwise ``(input_ids, attention_mask)``. The label is a
            ``torch.long`` tensor.
        """
        encoding = self.tokenizer(self.texts[idx], padding='max_length', truncation=True,
                                  max_length=self.max_length, return_tensors="pt")
        # return_tensors="pt" yields a leading dimension of size 1; drop it so
        # the DataLoader can stack items into a batch.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        if self.labels is not None:
            return input_ids, attention_mask, torch.tensor(self.labels[idx], dtype=torch.long)
        else:
            return input_ids, attention_mask

# **Prepare datasets**
# Target column: 1 = vegetarian, 0 = non-vegetarian
train_labels = train_df['vegetarian'].values
train_dataset = RecipeDataset(
    texts=train_df['text'].tolist(),
    labels=train_labels,
    tokenizer=tokenizer,
)
# The test split is built without labels.
test_dataset = RecipeDataset(
    texts=test_df['text'].tolist(),
    tokenizer=tokenizer,
)

# Only the training loader shuffles; the test loader keeps row order so the
# predictions line up with test_df.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

# **Define ELECTRA-based classification model**
class ElectraClassifier(nn.Module):
    """Wrapper around ElectraForSequenceClassification that returns only logits."""

    def __init__(self):
        super().__init__()
        # 'google/electra-small-discriminator' checkpoint with a 2-label head.
        self.electra = ElectraForSequenceClassification.from_pretrained(
            'google/electra-small-discriminator',
            num_labels=2,
        )

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return the ``logits`` of its output."""
        outputs = self.electra(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.logits

# **Initialize model, optimizer, loss function, and scheduler**
# Seed PyTorch's global RNG before anything random happens: the classifier head
# is newly initialised (not loaded from the checkpoint) and the training loader
# shuffles, so without a fixed seed two runs of this notebook are not comparable.
# (Some GPU kernels may still be nondeterministic.)
SEED = 42
torch.manual_seed(SEED)

model = ElectraClassifier().to(device)
optimizer = optim.AdamW(model.parameters(), lr=3e-5)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)  # Reduce LR by 10% every epoch
criterion = nn.CrossEntropyLoss()

# **Training loop**
# NOTE: accuracy / precision / recall / F1 below are computed on the training
# batches themselves, while the model is in train mode. They show how well the
# model fits the training data, not how well it generalises.
EPOCHS = 5
for epoch in range(EPOCHS):
    model.train()
    total_loss, correct = 0, 0
    all_preds, all_labels = [], []  # Predictions and labels for precision/recall calculations

    start_time = time.time()  # Track training time
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", unit="batch")

    # Count batches with enumerate instead of reading progress_bar.n: tqdm only
    # refreshes that counter when it redraws, so it can lag behind the real
    # batch index and inflate the displayed running loss.
    for batch_idx, (input_ids, attention_mask, labels) in enumerate(progress_bar, start=1):
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Predicted class per example; computed once and reused below.
        batch_preds = logits.argmax(1)
        correct += (batch_preds == labels).sum().item()

        # Store predictions and labels
        all_preds.extend(batch_preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        # Running mean of the per-batch loss over the batches seen so far.
        progress_bar.set_postfix(loss=total_loss / batch_idx)

    # Compute precision, recall, and F1-score
    precision = precision_score(all_labels, all_preds, average='binary')
    recall = recall_score(all_labels, all_preds, average='binary')
    f1 = f1_score(all_labels, all_preds, average='binary')

    scheduler.step()  # Adjust learning rate

    epoch_time = time.time() - start_time
    # The LR printed here is read after scheduler.step(), i.e. it is the rate
    # the next epoch will use, not the one this epoch trained with.
    print(f"Epoch {epoch+1} | Loss: {total_loss/len(train_loader):.4f} | Accuracy: {correct/len(train_loader.dataset):.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1-score: {f1:.4f} | Time: {epoch_time:.2f} sec | LR: {scheduler.get_last_lr()[0]:.6f}")

print("Training complete.")

# **Generate Predictions**
model.eval()
predictions = []
# No gradients are needed for inference, so the whole pass runs under no_grad.
with torch.no_grad():
    for batch_ids, batch_mask in test_loader:
        batch_logits = model(batch_ids.to(device), batch_mask.to(device))
        # Highest-scoring class index for each example in the batch.
        predictions.extend(batch_logits.argmax(1).cpu().numpy())

# **Save Submission**
test_df['vegetarian'] = predictions
submission = test_df[['id', 'vegetarian']]
submission.to_csv("submission.csv", index=False)
print("Submission file saved as submission.csv")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/5: 100%|██████████| 258/258 [01:36<00:00,  2.67batch/s, loss=0.53]


Epoch 1 | Loss: 0.5301 | Accuracy: 0.7179 | Precision: 0.6933 | Recall: 0.7819 | F1-score: 0.7349 | Time: 96.60 sec | LR: 0.000027


Epoch 2/5: 100%|██████████| 258/258 [01:35<00:00,  2.69batch/s, loss=0.346]


Epoch 2 | Loss: 0.3462 | Accuracy: 0.8482 | Precision: 0.8023 | Recall: 0.9244 | F1-score: 0.8590 | Time: 95.97 sec | LR: 0.000024


Epoch 3/5: 100%|██████████| 258/258 [01:35<00:00,  2.69batch/s, loss=0.312]


Epoch 3 | Loss: 0.3115 | Accuracy: 0.8659 | Precision: 0.8280 | Recall: 0.9238 | F1-score: 0.8733 | Time: 95.99 sec | LR: 0.000022


Epoch 4/5: 100%|██████████| 258/258 [01:35<00:00,  2.69batch/s, loss=0.291]


Epoch 4 | Loss: 0.2906 | Accuracy: 0.8778 | Precision: 0.8443 | Recall: 0.9266 | F1-score: 0.8835 | Time: 95.97 sec | LR: 0.000020


Epoch 5/5: 100%|██████████| 258/258 [01:35<00:00,  2.69batch/s, loss=0.266]


Epoch 5 | Loss: 0.2655 | Accuracy: 0.8901 | Precision: 0.8598 | Recall: 0.9323 | F1-score: 0.8945 | Time: 95.90 sec | LR: 0.000018
Training complete.
Submission file saved as submission.csv
