In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/win25-stat-528-kaggle-competition-1/sample.csv
/kaggle/input/win25-stat-528-kaggle-competition-1/train.csv
/kaggle/input/win25-stat-528-kaggle-competition-1/test.csv


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import time
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score  # Import metrics
import torch.nn.functional as F

# Enable GPU acceleration
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.manual_seed(1337)

# Hyperparameters
BATCH_SIZE = 32  
MAX_LENGTH_DESC = 128
MAX_LENGTH_INGR = 64
LEARNING_RATE = 0.00003
EPOCHS = 6
CLIP_GRAD_NORM = 1.0
USE_MIXED_PRECISION = True  

# Load Data
train_df = pd.read_csv('/kaggle/input/win25-stat-528-kaggle-competition-1/train.csv')
test_df = pd.read_csv('/kaggle/input/win25-stat-528-kaggle-competition-1/test.csv')

# Fill missing values
train_df['description'] = train_df['description'].fillna('')
train_df['ingredients'] = train_df['ingredients'].fillna('')
test_df['description'] = test_df['description'].fillna('')
test_df['ingredients'] = test_df['ingredients'].fillna('')

# Initialize BERT Tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# TF-IDF Feature Extraction for Logistic Regression Model
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(train_df['description'] + ' ' + train_df['ingredients'])
y_train = train_df['vegetarian']

# Train Logistic Regression Model
logistic_model = LogisticRegression()
logistic_model.fit(X_train_tfidf, y_train)

# Custom Dataset for Transformer Model
class RecipeDataset(Dataset):
    def __init__(self, descriptions, ingredients, labels=None, tokenizer=None):
        self.descriptions = descriptions
        self.ingredients = ingredients
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, idx):
        desc_encoding = self.tokenizer(self.descriptions[idx], padding='max_length', truncation=True,
                                       max_length=MAX_LENGTH_DESC, return_tensors="pt")
        ingr_encoding = self.tokenizer(self.ingredients[idx], padding='max_length', truncation=True,
                                       max_length=MAX_LENGTH_INGR, return_tensors="pt")

        input_ids_desc = desc_encoding['input_ids'].squeeze(0)
        attention_mask_desc = desc_encoding['attention_mask'].squeeze(0)
        input_ids_ingr = ingr_encoding['input_ids'].squeeze(0)
        attention_mask_ingr = ingr_encoding['attention_mask'].squeeze(0)

        if self.labels is not None:
            return input_ids_desc, attention_mask_desc, input_ids_ingr, attention_mask_ingr, torch.tensor(self.labels[idx], dtype=torch.long)
        else:
            return input_ids_desc, attention_mask_desc, input_ids_ingr, attention_mask_ingr

# Prepare datasets
train_labels = train_df['vegetarian'].values
train_dataset = RecipeDataset(train_df['description'].tolist(), train_df['ingredients'].tolist(), train_labels, tokenizer)
test_dataset = RecipeDataset(test_df['description'].tolist(), test_df['ingredients'].tolist(), tokenizer=tokenizer)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Define Transformer-based classifier
class BertClassifier(nn.Module):
    def __init__(self):
        super(BertClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained('bert-base-uncased')
        self.multihead_attn = nn.MultiheadAttention(embed_dim=768, num_heads=4, batch_first=True)
        self.fc = nn.Linear(768, 2)
        self.dropout = nn.Dropout(0.5)

    def forward(self, input_ids_desc, attention_mask_desc, input_ids_ingr, attention_mask_ingr):
        desc_output = self.bert(input_ids=input_ids_desc, attention_mask=attention_mask_desc).last_hidden_state[:, 0, :]
        ingr_output = self.bert(input_ids=input_ids_ingr, attention_mask=attention_mask_ingr).last_hidden_state[:, 0, :]

        attn_output, _ = self.multihead_attn(desc_output.unsqueeze(1), ingr_output.unsqueeze(1), ingr_output.unsqueeze(1))
        combined_output = attn_output.squeeze(1) + desc_output + ingr_output
        logits = self.fc(self.dropout(combined_output))
        return logits

# Initialize model and optimizer
model = BertClassifier().to(device)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()
scaler = torch.cuda.amp.GradScaler() if USE_MIXED_PRECISION else None

# Training loop
for epoch in range(EPOCHS):
    model.train()
    total_loss, correct = 0, 0
    all_preds, all_labels = [], []  
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    
    for batch in progress_bar:
        input_ids_desc, attention_mask_desc, input_ids_ingr, attention_mask_ingr, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()

        if USE_MIXED_PRECISION:
            with torch.cuda.amp.autocast():
                logits = model(input_ids_desc, attention_mask_desc, input_ids_ingr, attention_mask_ingr)
                loss = criterion(logits, labels)
            scaler.scale(loss).backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_GRAD_NORM)
            scaler.step(optimizer)
            scaler.update()
        else:
            logits = model(input_ids_desc, attention_mask_desc, input_ids_ingr, attention_mask_ingr)
            loss = criterion(logits, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_GRAD_NORM)
            optimizer.step()

        total_loss += loss.item()
        correct += (logits.argmax(1) == labels).sum().item()
        all_preds.extend(logits.argmax(1).cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        progress_bar.set_postfix(loss=f"{loss.item():.4f}")

    precision = precision_score(all_labels, all_preds, average="binary")
    recall = recall_score(all_labels, all_preds, average="binary")
    f1 = f1_score(all_labels, all_preds, average="binary")
    accuracy = correct / len(train_loader.dataset)

    print(f"Epoch {epoch+1} | Loss: {total_loss/len(train_loader):.4f} | Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1-score: {f1:.4f}")

print("Training complete.")

# Generate predictions from Transformer model
model.eval()
transformer_probs = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting with Transformer"):
        input_ids_desc, attention_mask_desc, input_ids_ingr, attention_mask_ingr = [b.to(device) for b in batch]
        logits = model(input_ids_desc, attention_mask_desc, input_ids_ingr, attention_mask_ingr)
        probs = F.softmax(logits, dim=1)
        transformer_probs.append(probs.cpu().numpy())
transformer_probs = np.concatenate(transformer_probs, axis=0)

# Get logistic regression probabilities
X_test_tfidf = vectorizer.transform(test_df['description'] + ' ' + test_df['ingredients'])
logistic_probs = logistic_model.predict_proba(X_test_tfidf)

# Ensemble predictions
alpha = 0.8  
ensemble_probs = alpha * transformer_probs + (1 - alpha) * logistic_probs
final_preds = ensemble_probs.argmax(axis=1)

# Save results
test_df['vegetarian'] = final_preds
test_df[['id', 'vegetarian']].to_csv("submission.csv", index=False)
print("Submission file saved as 'submission.csv'")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  scaler = torch.cuda.amp.GradScaler() if USE_MIXED_PRECISION else None
  with torch.cuda.amp.autocast():
Epoch 1/6: 100%|██████████| 516/516 [04:38<00:00,  1.85it/s, loss=0.2862]


Epoch 1 | Loss: 0.4390 | Accuracy: 0.7835 | Precision: 0.7598 | Recall: 0.8292 | F1-score: 0.7930


  with torch.cuda.amp.autocast():
Epoch 2/6: 100%|██████████| 516/516 [04:38<00:00,  1.85it/s, loss=0.4748]


Epoch 2 | Loss: 0.3333 | Accuracy: 0.8500 | Precision: 0.8160 | Recall: 0.9039 | F1-score: 0.8577


  with torch.cuda.amp.autocast():
Epoch 3/6: 100%|██████████| 516/516 [04:38<00:00,  1.85it/s, loss=0.5883]


Epoch 3 | Loss: 0.2941 | Accuracy: 0.8701 | Precision: 0.8410 | Recall: 0.9130 | F1-score: 0.8755


  with torch.cuda.amp.autocast():
Epoch 4/6: 100%|██████████| 516/516 [04:38<00:00,  1.85it/s, loss=0.2865]


Epoch 4 | Loss: 0.2537 | Accuracy: 0.8899 | Precision: 0.8672 | Recall: 0.9210 | F1-score: 0.8933


  with torch.cuda.amp.autocast():
Epoch 5/6: 100%|██████████| 516/516 [04:38<00:00,  1.85it/s, loss=0.0753]


Epoch 5 | Loss: 0.1766 | Accuracy: 0.9281 | Precision: 0.9151 | Recall: 0.9439 | F1-score: 0.9293


  with torch.cuda.amp.autocast():
Epoch 6/6: 100%|██████████| 516/516 [04:38<00:00,  1.85it/s, loss=0.2497]


Epoch 6 | Loss: 0.1400 | Accuracy: 0.9435 | Precision: 0.9333 | Recall: 0.9553 | F1-score: 0.9442
Training complete.


Predicting with Transformer: 100%|██████████| 172/172 [00:30<00:00,  5.56it/s]


Submission file saved as 'submission.csv'
