In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/t5-small/tensorflow2/default/1/config.json
/kaggle/input/t5-small/tensorflow2/default/1/spiece.model
/kaggle/input/t5-small/tensorflow2/default/1/training_args.bin
/kaggle/input/t5-small/tensorflow2/default/1/tokenizer_config.json
/kaggle/input/t5-small/tensorflow2/default/1/model.safetensors
/kaggle/input/t5-small/tensorflow2/default/1/special_tokens_map.json
/kaggle/input/t5-small/tensorflow2/default/1/added_tokens.json
/kaggle/input/t5-small/tensorflow2/default/1/generation_config.json
/kaggle/input/attribute-value-prediction/attribute_train.data
/kaggle/input/attribute-value-prediction/attribute_val.solution
/kaggle/input/attribute-value-prediction/attribute_val.data
/kaggle/input/attribute-value-prediction/attribute_train.solution
/kaggle/input/attribute-value-prediction/attribute_test.data


In [2]:
pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback
from datasets import Dataset, DatasetDict
import numpy as np

In [4]:
def read_jsonl(file_path, nrows=None):
    df = pd.read_json(file_path, lines=True, nrows=nrows)
    if 'parent_asin' in df.columns:  # Ensure that the 'parent_asin' column is not used
        df = df.drop(columns=['parent_asin'])
    return df

# Load Training Data
train_data = read_jsonl('/kaggle/input/attribute-value-prediction/attribute_train.data',nrows=5000 )
train_solution = read_jsonl('/kaggle/input/attribute-value-prediction/attribute_train.solution',nrows=5000)

# Load Testing Data (No solution file is provided, so we comment it out)
test_data = read_jsonl('/kaggle/input/attribute-value-prediction/attribute_test.data',)
# test_solution = read_jsonl('./data/attrebute_test.solution', nrows=200)  # Comment this line out

# Load Validation Data
val_data = read_jsonl('/kaggle/input/attribute-value-prediction/attribute_val.data',nrows=1000)
val_solution = read_jsonl('/kaggle/input/attribute-value-prediction/attribute_val.solution',nrows=1000)

In [5]:
import pandas as pd
from datasets import Dataset

# Function to preprocess the data
def preprocess_data(data, solution):
    # Merge the data with the corresponding solution on 'indoml_id'
    merged = pd.merge(data, solution, on='indoml_id')

    # Create input_text by combining title, store, and manufacturer details
    merged['input_text'] = merged.apply(lambda row: f"title: {row['title']} store: {row['store']} details_Manufacturer: {row['details_Manufacturer']}", axis=1)

    # Create target_text by combining brand and category details
    merged['target_text'] = merged.apply(lambda row: f"details_Brand: {row['details_Brand']} L0_category: {row['L0_category']} L1_category: {row['L1_category']} L2_category: {row['L2_category']} L3_category: {row['L3_category']} L4_category: {row['L4_category']}", axis=1)

    # Return the processed data with input_text and target_text columns
    return merged[['input_text', 'target_text']]

# Apply preprocessing to training and validation data
train_processed = preprocess_data(train_data, train_solution)
val_processed = preprocess_data(val_data, val_solution)

# Since there's no test_solution, we do not process the test data for now
# test_processed = preprocess_data(test_data, test_solution)

# Convert the processed pandas DataFrames into Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_processed)
val_dataset = Dataset.from_pandas(val_processed)

# Note: Test dataset is not processed as there's no test solution
# test_dataset = Dataset.from_pandas(test_processed)  # Commented out since no test_solution

In [6]:
def preprocess_data_no_solution(data):
    # Prepare input_text for test dataset, without target_text
    data['input_text'] = data.apply(lambda row: f"title: {row['title']} store: {row['store']} details_Manufacturer: {row['details_Manufacturer']}", axis=1)
    return data[['input_text']]

# Preprocess the test data
test_processed = preprocess_data_no_solution(test_data)

# Convert to Hugging Face Dataset format
test_dataset = Dataset.from_pandas(test_processed)

In [7]:
dataset_dict = DatasetDict({
    'train': train_dataset,
    #'test': test_dataset,
    'validation': val_dataset
})

In [8]:
tokenizer = T5Tokenizer.from_pretrained('/kaggle/input/t5-small/tensorflow2/default/1')
model = T5ForConditionalGeneration.from_pretrained('/kaggle/input/t5-small/tensorflow2/default/1')

In [9]:
def preprocess_function(examples):
    inputs = examples['input_text']
    targets = examples['target_text']

    # Tokenize input texts
    model_inputs = tokenizer(inputs, max_length=352, padding='max_length', truncation=True)

    # Tokenize target texts (labels)
    labels = tokenizer(targets, max_length=128, padding='max_length', truncation=True)

    # Add labels to model inputs
    model_inputs['labels'] = labels['input_ids']

    return model_inputs

# Apply preprocessing to the training and validation datasets
tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [10]:
def preprocess_test_function(examples):
    inputs = examples['input_text']
    # Tokenize input texts
    model_inputs = tokenizer(inputs, max_length=352, padding='max_length', truncation=True)
    # Provide default labels or placeholders for test data
    model_inputs['labels'] = [-100] * len(model_inputs['input_ids'])
    return model_inputs

# Apply preprocessing to the test dataset
tokenized_test_dataset = test_dataset.map(preprocess_test_function, batched=True)

Map:   0%|          | 0/95036 [00:00<?, ? examples/s]

In [11]:
combined_dataset_dict = DatasetDict({
    'train': tokenized_datasets['train'],
    'test': tokenized_test_dataset,
    'validation': tokenized_datasets['validation']
})

In [12]:
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    learning_rate=2e-3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=3,
    logging_dir='./logs',
    logging_steps=800,
    report_to='none'
)

In [13]:
class CustomCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            print(f"Step: {state.global_step}")
            for key, value in logs.items():
                print(f"{key}: {value}")
            print("\n")

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    callbacks=[CustomCallback()]
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.098507
2,0.188900,0.071038


Step: 625
eval_loss: 0.09850655496120453
eval_runtime: 263.7436
eval_samples_per_second: 3.792
eval_steps_per_second: 0.474
epoch: 1.0


Step: 800
loss: 0.1889
grad_norm: 0.34773051738739014
learning_rate: 0.0007199999999999999
epoch: 1.28


Step: 1250
eval_loss: 0.07103800028562546
eval_runtime: 265.5944
eval_samples_per_second: 3.765
eval_steps_per_second: 0.471
epoch: 2.0


Step: 1250
train_runtime: 10649.3306
train_samples_per_second: 0.939
train_steps_per_second: 0.117
total_flos: 930474885120000.0
train_loss: 0.14970188903808593
epoch: 2.0




TrainOutput(global_step=1250, training_loss=0.14970188903808593, metrics={'train_runtime': 10649.3306, 'train_samples_per_second': 0.939, 'train_steps_per_second': 0.117, 'total_flos': 930474885120000.0, 'train_loss': 0.14970188903808593, 'epoch': 2.0})

In [15]:
model.save_pretrained('./fine_tuned_t5_1000dp')
tokenizer.save_pretrained('./fine_tuned_t5_1000dp')

('./fine_tuned_t5_1000dp/tokenizer_config.json',
 './fine_tuned_t5_1000dp/special_tokens_map.json',
 './fine_tuned_t5_1000dp/spiece.model',
 './fine_tuned_t5_1000dp/added_tokens.json')

In [16]:
import re
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
from tqdm import tqdm

# Set device to GPU (cuda) if available, otherwise CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the fine-tuned model and tokenizer from the directory
model = T5ForConditionalGeneration.from_pretrained('./fine_tuned_t5_1000dp').to(device)
tokenizer = T5Tokenizer.from_pretrained('./fine_tuned_t5_1000dp')

# Set the model to evaluation mode
model.eval()

# Assume you have test_data available, which contains the 'input_text'
# If you have already defined test_dataset, we extract test_data from it
test_data = test_dataset[:]  # Test data to generate predictions

# Function to generate text from the input data
def generate_text(inputs):
    # Tokenize and prepare input tensors
    inputs = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True, truncation=True, max_length=352)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate text predictions without calculating gradients
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=128)

    # Decode the generated token IDs back into text
    generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return generated_texts

# Function to extract details from generated text using regular expressions
def extract_details(text):
    pattern = r'details_Brand: (.*?) L0_category: (.*?) L1_category: (.*?) L2_category: (.*?) L3_category: (.*?) L4_category: (.*)'
    match = re.match(pattern, text)
    if match:
        return tuple(item if item is not None else 'na' for item in match.groups())
    return 'na', 'na', 'na', 'na', 'na', 'na'

# Function to clean repeated patterns in the generated text
def clean_repeated_patterns(text):
    cleaned_data = text.split(' L4_category')[0]
    return cleaned_data

In [17]:
from tqdm import tqdm

batch_size = 128
generated_details = []

# Loop over the test data in batches
for i in tqdm(range(0, len(test_dataset), batch_size), desc="Processing test data"):
    # Select a batch of data
    batch_indices = list(range(i, min(i + batch_size, len(test_dataset))))
    batch_data = test_dataset.select(batch_indices)

    # Extract input texts from the batch
    batch_inputs = batch_data['input_text']

    # Generate predictions for the current batch
    generated_texts = generate_text(batch_inputs)

    # Extract and store details from the generated texts
    for generated_text in generated_texts:
        generated_details.append(extract_details(generated_text))

print('Generated info extracted.............')

Processing test data: 100%|██████████| 743/743 [5:31:05<00:00, 26.74s/it]  

Generated info extracted.............





In [18]:
import re
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
import numpy as np

# Load the fine-tuned model and tokenizer
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = T5ForConditionalGeneration.from_pretrained('./fine_tuned_t5_1000dp').to(device)
tokenizer = T5Tokenizer.from_pretrained('./fine_tuned_t5_1000dp')

# Set the model to evaluation mode
model.eval()

# Define a function to generate text
def generate_text(inputs):
    inputs = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True, truncation=True, max_length=352)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=128)
    generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return generated_texts

# Define a function to extract details from generated text
def extract_details(text):
    pattern = r'details_Brand: (.*?) L0_category: (.*?) L1_category: (.*?) L2_category: (.*?) L3_category: (.*?) L4_category: (.*)'
    match = re.match(pattern, text)
    if match:
        return tuple(item if item else 'na' for item in match.groups())
    return ('na', 'na', 'na', 'na', 'na', 'na')

# Extract true categories from target texts
def extract_true_categories(target_text):
    pattern = r'details_Brand: (.*?) L0_category: (.*?) L1_category: (.*?) L2_category: (.*?) L3_category: (.*?) L4_category: (.*)'
    match = re.match(pattern, target_text)
    if match:
        return tuple(item if item else 'na' for item in match.groups())
    return ('na', 'na', 'na', 'na', 'na', 'na')

# Define a function to calculate metrics for each category
def calculate_metrics(predictions, targets):
    metrics = {}
    categories = ['details_Brand', 'L0_category', 'L1_category', 'L2_category', 'L3_category', 'L4_category']
    category_indices = [0, 1, 2, 3, 4, 5]

    for idx, category in zip(category_indices, categories):
        pred_category = [pred[idx] for pred in predictions]
        true_category = [true[idx] for true in targets]

        # Print some examples for manual verification
        print(f"Category: {category}")
        print(f"Predictions: {pred_category[:10]}")
        print(f"True Labels: {true_category[:10]}")

        # Calculate metrics
        accuracy = accuracy_score(true_category, pred_category)
        precision = precision_score(true_category, pred_category, average='macro', zero_division=0)
        recall = recall_score(true_category, pred_category, average='macro', zero_division=0)
        f1 = f1_score(true_category, pred_category, average='macro', zero_division=0)

        metrics[category] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }

    return metrics

# Main evaluation function
def evaluate_model(val_dataset):
    batch_size = 128
    predictions = []
    targets = []

    for i in range(0, len(val_dataset), batch_size):
        batch_data = val_dataset[i:i + batch_size]
        batch_inputs = batch_data['input_text']
        batch_targets = batch_data['target_text']

        # Generate predictions
        generated_texts = generate_text(batch_inputs)

        # Extract details from generated texts and targets
        batch_predictions = [extract_details(text) for text in generated_texts]
        batch_targets_extracted = [extract_true_categories(text) for text in batch_targets]

        predictions.extend(batch_predictions)
        targets.extend(batch_targets_extracted)

    # Calculate metrics
    metrics = calculate_metrics(predictions, targets)

    # Print metrics in the desired format
    for category, values in metrics.items():
        print(f"Current Category: {category}")
        print(f"  accuracy: {values['accuracy']:.4f}")
        print(f"  precision: {values['precision']:.4f}")
        print(f"  recall: {values['recall']:.4f}")
        print(f"  f1: {values['f1']:.4f}\n")

# Assuming val_dataset is a list of dictionaries with 'input_text' and 'target_text'
evaluate_model(val_dataset)


Category: details_Brand
Predictions: ['Pendleton', 'JP London', 'Lawn Fawn', 'ANCHEER', 'Schecter', 'ZOTAC', 'Winning Streak', 'Coverking', 'Panther Print', 'Old World Christmas']
True Labels: ['Pendleton', 'JP London', 'Lawn Fawn', 'ANCHEER', 'Schecter', 'ZOTAC', 'Winning Streak', 'Coverking', 'Panther Print', 'Old World Christmas']
Category: L0_category
Predictions: ['Home & Kitchen', 'Tools & Home Improvement', 'Patio, Lawn & Garden', 'Sports & Outdoors', 'Musical Instruments', 'Electronics', 'Sports & Outdoors', 'Automotive', 'Home & Kitchen', 'Home & Kitchen']
True Labels: ['Home & Kitchen', 'Tools & Home Improvement', 'Arts, Crafts & Sewing', 'Sports & Outdoors', 'Musical Instruments', 'Electronics', 'Sports & Outdoors', 'Automotive', 'Home & Kitchen', 'Home & Kitchen']
Category: L1_category
Predictions: ['Bedding', 'Electrical', 'Outdoor Dcor', 'Sports', 'Guitars', 'Computers & Accessories', 'Fan Shop', 'Interior Accessories', 'Wall Art', 'Home Dcor Products']
True Labels: ['Bed

In [19]:
import json

# Categories to save in the output file
categories = ['details_Brand', 'L0_category', 'L1_category', 'L2_category', 'L3_category', 'L4_category']

# Open file for writing predictions in JSONL format
with open('attribute_validation_predictions.predict', 'w') as file:

    # Iterate through generated details and assign unique ID (indoml_id)
    for indoml_id, details in enumerate(generated_details):
        result = {"indoml_id": indoml_id}

        # Populate the result dictionary with predictions for each category
        for category, value in zip(categories, details):
            result[category] = value

        # Write each prediction as a JSON object, one per line
        file.write(json.dumps(result) + '\n')

print("Predictions saved to 'attribute_validation_predictions.predict'")

Predictions saved to 'attribute_validation_predictions.predict'
