In [1]:
import pandas as pd
import re
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

# Utility function to read JSONL files
def read_jsonl(file_path, nrows=None):
    return pd.read_json(file_path, lines=True, nrows=nrows)

# Utility function to preprocess the data
def preprocess_data(data, solution):
    merged = pd.merge(data, solution, on='indoml_id')
    merged['input_text'] = merged.apply(lambda row: f"title: {row['title']} store: {row['store']} details_Manufacturer: {row['details_Manufacturer']}", axis=1)
    merged['target_text'] = merged.apply(lambda row: f"details_Brand: {row['details_Brand']} L0_category: {row['L0_category']} L1_category: {row['L1_category']} L2_category: {row['L2_category']} L3_category: {row['L3_category']} L4_category: {row['L4_category']}", axis=1)
    return merged[['input_text', 'target_text']]

# Load the data
train_data = read_jsonl('/kaggle/input/indoml-datathon-data/attribute_train.data')
train_solution = read_jsonl('/kaggle/input/indoml-datathon-data/attribute_train.solution')
test_data = read_jsonl('/kaggle/input/indoml-datathon-data/attribute_test.data')
val_data = read_jsonl('/kaggle/input/indoml-datathon-data/attribute_val.data')
val_solution = read_jsonl('/kaggle/input/indoml-datathon-data/attribute_val.solution')

# Preprocess the data
train_processed = preprocess_data(train_data, train_solution)
val_processed = preprocess_data(val_data, val_solution)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_processed)
val_dataset = Dataset.from_pandas(val_processed)
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

In [2]:
!ls /kaggle/input/tmp-generative-op1/fine_tuned_t5_1000dp/

  pid, fd = os.forkpty()


added_tokens.json	model.safetensors	 tokenizer_config.json
config.json		special_tokens_map.json
generation_config.json	spiece.model


In [3]:
from torch import nn

# # Initialize the model and tokenizer
# tokenizer = T5Tokenizer.from_pretrained('t5-small')
# model = T5ForConditionalGeneration.from_pretrained('t5-small')

model = T5ForConditionalGeneration.from_pretrained('/kaggle/input/tmp-generative-op2/fine_tuned_t5_1000dp/')
tokenizer = T5Tokenizer.from_pretrained('/kaggle/input/tmp-generative-op2/fine_tuned_t5_1000dp/')

In [4]:
def preprocess_function(examples):
    inputs = examples['input_text']
    targets = examples['target_text']
    model_inputs = tokenizer(inputs, max_length=128, padding='max_length', truncation=True)
    labels = tokenizer(targets, max_length=128, padding='max_length', truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/443499 [00:00<?, ? examples/s]

Map:   0%|          | 0/95035 [00:00<?, ? examples/s]

In [5]:
# Set up training arguments and custom callback
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=4,
    logging_dir='./logs',
    logging_steps=100,
    report_to='none'
)

class CustomCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            print(f"Step: {state.global_step}")
            for key, value in logs.items():
                print(f"{key}: {value}")
            print("\n")

# Create and train the model using the Trainer API
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    callbacks=[CustomCallback()]
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,0.0026,0.0026
2,0.0017,0.002434
3,0.0012,0.00226
4,0.0009,0.001988
5,0.0005,0.002025


Step: 100
loss: 0.001
grad_norm: 0.004528792575001717
learning_rate: 0.001994227994227994
epoch: 0.01443001443001443


Step: 200
loss: 0.002
grad_norm: 0.018010210245847702
learning_rate: 0.0019884559884559886
epoch: 0.02886002886002886


Step: 300
loss: 0.0022
grad_norm: 0.021679753437638283
learning_rate: 0.0019826839826839826
epoch: 0.04329004329004329


Step: 400
loss: 0.0024
grad_norm: 0.012565764598548412
learning_rate: 0.001976911976911977
epoch: 0.05772005772005772


Step: 500
loss: 0.0025
grad_norm: 0.012734613381326199
learning_rate: 0.001971139971139971
epoch: 0.07215007215007214


Step: 600
loss: 0.0023
grad_norm: 0.0515560619533062
learning_rate: 0.0019653679653679657
epoch: 0.08658008658008658


Step: 700
loss: 0.0024
grad_norm: 0.037059228867292404
learning_rate: 0.0019595959595959597
epoch: 0.10101010101010101


Step: 800
loss: 0.0022
grad_norm: 0.01481799315661192
learning_rate: 0.0019538239538239538
epoch: 0.11544011544011544


Step: 900
loss: 0.0025
grad_norm: 0.0062

TrainOutput(global_step=34650, training_loss=0.0014088857105649566, metrics={'train_runtime': 21567.8275, 'train_samples_per_second': 102.815, 'train_steps_per_second': 1.607, 'total_flos': 7.502994201378816e+16, 'train_loss': 0.0014088857105649566, 'epoch': 5.0})

In [6]:
val_results = trainer.evaluate(eval_dataset=tokenized_datasets['validation'])
print(f"Validation Loss: {val_results['eval_loss']}")

Step: 34650
eval_loss: 0.0020253544207662344
eval_runtime: 297.6141
eval_samples_per_second: 319.323
eval_steps_per_second: 4.99
epoch: 5.0


Validation Loss: 0.0020253544207662344


In [7]:
# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine_tuned_t5_1000dp')
tokenizer.save_pretrained('./fine_tuned_t5_1000dp')

('./fine_tuned_t5_1000dp/tokenizer_config.json',
 './fine_tuned_t5_1000dp/special_tokens_map.json',
 './fine_tuned_t5_1000dp/spiece.model',
 './fine_tuned_t5_1000dp/added_tokens.json')

In [8]:

# Define functions for prediction and text processing
def generate_text(inputs):
    inputs = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True, truncation=True, max_length=352)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=128)
    
    generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return generated_texts

def extract_details(text):
    pattern = r'details_Brand: (.*?) L0_category: (.*?) L1_category: (.*?) L2_category: (.*?) L3_category: (.*?) L4_category: (.*)'
    match = re.match(pattern, text)
    if match:
        return tuple(item if item is not None else 'na' for item in match.groups())
    return 'na', 'na', 'na', 'na', 'na', 'na'

def clean_repeated_patterns(text):
    cleaned_data = text.split(' L4_category')[0] 
    return cleaned_data

# Define the function to evaluate predictions
def evaluate_predictions(generated_details, target_details):
    generated_dict = {i: [] for i in range(6)}
    target_dict = {i: [] for i in range(6)}

    for gen, tar in zip(generated_details, target_details):
        for i in range(6):
            generated_dict[i].append(gen[i])
            target_dict[i].append(tar[i])

    # Clean repeated patterns in L4_category
    generated_dict[5] = [clean_repeated_patterns(text) for text in generated_dict[5]]

    categories = ['details_Brand', 'L0_category', 'L1_category', 'L2_category', 'L3_category', 'L4_category']
    metrics = ['accuracy', 'precision', 'recall', 'f1']

    results = {category: {metric: 0 for metric in metrics} for category in categories}

    for i, category in enumerate(categories):
        print('Current Category: ', category)
        y_true = target_dict[i]
        y_pred = generated_dict[i]
        
        results[category]['accuracy'] = accuracy_score(y_true, y_pred)
        results[category]['precision'] = precision_score(y_true, y_pred, average='macro', zero_division=0)
        results[category]['recall'] = recall_score(y_true, y_pred, average='macro', zero_division=0)
        results[category]['f1'] = f1_score(y_true, y_pred, average='macro', zero_division=0)
    
    print()

    for category, metrics in results.items():
        print(f"{category}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value:.4f}")
        print()



In [9]:
device = 'cuda'

In [10]:
def preprocess_data(data):
    merged = {}
    return data.apply(lambda row: f"title: {row['title']} store: {row['store']} details_Manufacturer: {row['details_Manufacturer']}", axis=1).to_list()

test_dataset = preprocess_data(test_data)

In [11]:
batch_size = 32
generated_details = []

for i in tqdm(range(0, len(test_dataset), batch_size), desc="Processing test data"):
    batch_inputs = test_dataset[i:i+batch_size]
    
    generated_texts = generate_text(batch_inputs)
    
    for generated_text in generated_texts:
        generated_details.append(extract_details(generated_text))

print('Generated info extracted.............')

# Save the predictions to a DataFrame and then to a CSV file
# predictions_df = pd.DataFrame(generated_details, columns=['details_Brand', 'L0_category', 'L1_category', 'L2_category', 'L3_category', 'L4_category'])
# predictions_df['indoml_id'] = test_data['indoml_id'].iloc[:len(generated_details)]
# predictions_df.to_csv('test_predictions.csv', index=False)
# print("Predictions saved to 'test_predictions.csv'")

Processing test data: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2970/2970 [1:13:36<00:00,  1.49s/it]

Generated info extracted.............





In [12]:
import json
categories = ['details_Brand', 'L0_category', 'L1_category', 'L2_category', 'L3_category', 'L4_category']

with open('attribute_test_Sneh.predict', 'w') as file:

    for indoml_id, details in enumerate(generated_details):
        result = {"indoml_id": indoml_id}
        for category, value in zip(categories, details):
            result[category] = value
        
        file.write(json.dumps(result) + '\n')