In [1]:
import torch
import gc
import cupy as cp
from cuml import SVC
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertModel
import joblib
import json
import numpy as np
import pandas as pd

In [2]:
def load_svm_model(filename):
    """Load the SVM model from a file."""
    svm = joblib.load(filename)
    print(f"Model loaded from {filename}")
    return svm

def infer_with_svm(svm, test_data):
    """Perform inference using the loaded SVM model."""
    # Convert test_data to a CuPy array if it's not already
    if not isinstance(test_data, cp.ndarray):
        test_data = cp.array(test_data)

    # Make predictions
    predictions = svm.predict(test_data)

    return cp.asnumpy(predictions)  # Convert predictions back to NumPy array if needed

# Load the model
svm_model = load_svm_model("/kaggle/input/indoml-tmp-svm/svm_model_supergroup.pkl")  # Replace 'some_feature' with the actual feature name

Model loaded from /kaggle/input/indoml-tmp-svm/svm_model_supergroup.pkl


In [3]:
# Load data
def load_data(input_file, labels_file):
    input_data = pd.read_json(input_file, lines=True)
    labels_data = pd.read_json(labels_file, lines=True)
    
#     input_data = input_data[:100000]
#     labels_data = labels_data[:100000]
    
    # Merge the input and labels data on indoml_id
    merged_data = pd.merge(input_data, labels_data, on='indoml_id', how='inner')
    
    return merged_data

In [4]:
data = load_data('/kaggle/input/indoml-phase2/train.features', '/kaggle/input/indoml-phase2/train.labels')

In [5]:
def load_test_data(input_file):
    input_data = pd.read_json(input_file, lines=True)
#     input_data = input_data[:1000]

    return input_data

test_data = load_test_data('/kaggle/input/indoml-phase2/phase_2_test_set1.features')

In [6]:
def generate_embeddings(texts, max_length=16, batch_size=32):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased').to(device)
    
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors='pt', max_length=max_length, 
                           truncation=True, padding='max_length')
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.extend(batch_embeddings)
    
    del tokenizer
    del model
    gc.collect()
    
    return cp.array(embeddings)  # Convert to cupy array for GPU operations

In [7]:
tmp = generate_embeddings(["1 adblue"])

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [8]:
def encode_labels(df):
    label_encoders = {}
    encoded_labels = {}
    for column in ['supergroup', 'group', 'module', 'brand']:
        le = LabelEncoder()
        # Check if the column contains valid data
        if df[column].dtype == 'object':
            df[column].replace('', np.nan, inplace=True)
            df[column].dropna(inplace=True)
        
        encoded_labels[column] = cp.array(le.fit_transform(df[column].astype(str)))  # Ensure all data is string before encoding
        label_encoders[column] = le
    return encoded_labels, label_encoders

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
encoded_labels, label_encoders = encode_labels(data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].replace('', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].replace('', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw

In [11]:
# Example test data (replace this with your actual test data)

# Perform inference
predictions = infer_with_svm(svm_model, tmp)

print("Predictions:", predictions)

Predictions: [12]


In [12]:
tmp2 = label_encoders['supergroup'].inverse_transform(encoded_labels["supergroup"].get()).tolist()

In [13]:
test_embeddings = generate_embeddings(test_data['description'].tolist())

Using device: cuda




In [14]:
test_embeddings.shape

(85462, 768)

In [15]:
import cupy as cp

def infer_with_svm(svm, test_data):
    """Perform inference using the loaded SVM model."""
    import cupy as cp

    # Ensure test_data is 2D with shape (n_samples, n_features)
    if len(test_data.shape) != 2:
        raise ValueError(f"Expected test_data to be 2D, got shape {test_data.shape}")

    # Convert test_data to CuPy array if it's not already
    if not isinstance(test_data, cp.ndarray):
        test_data = cp.array(test_data)

    # Ensure the shape is correct
    print(f"test_data shape before prediction: {test_data.shape}")

    # Convert to NumPy before SVM prediction
    predictions = svm.predict(cp.asnumpy(test_data))

    # Return predictions as a NumPy array or CuPy array based on preference
    return cp.array(predictions)

In [16]:
predictions = infer_with_svm(svm_model, test_embeddings)

predictions

test_data shape before prediction: (85462, 768)


array([12, 12, 12, ..., 12, 12, 12])

In [17]:
predictions.shape

(85462,)

In [18]:
unique_classes = np.unique(predictions)
num_unique_classes = len(unique_classes)
print(unique_classes, num_unique_classes)

[ 5  6 10 12] 4


In [19]:
predictions

array([12, 12, 12, ..., 12, 12, 12])

In [20]:
import cupy as cp

def inverse_transform_labels(encoded_labels, label_encoders, key):
    """Inverse transform encoded labels using the label encoder from a dictionary."""
    # Get the correct label encoder from the dictionary
    label_encoder = label_encoders.get(key)
    
    if label_encoder is None:
        raise ValueError(f"No label encoder found for key: {key}")
    
    # Convert CuPy array to NumPy array if encoded_labels is a CuPy array
    if isinstance(encoded_labels, cp.ndarray):
        encoded_labels = encoded_labels.get()  # Explicit conversion from CuPy to NumPy

    # Convert the encoded labels to a list of original labels
    return label_encoder.inverse_transform(encoded_labels).tolist()

# Assuming 'key' is the appropriate key for selecting the right label encoder
# key = 'supergroup'  # Replace with the appropriate key
# tmp = inverse_transform_labels(predictions, label_encoders, key)

In [21]:
import json

def save_transformed_labels(predictions, label_encoders, key, predictions_file, output_file="output.json"):
    """Transform predictions to original labels and save them in the specified format."""
    # Inverse transform predictions using the label encoder
    transformed_labels = inverse_transform_labels(predictions, label_encoders, key)

    # Load existing predictions to retain the other values
    with open(predictions_file, 'r') as f:
        existing_data = [json.loads(line) for line in f]  # Load each line as a JSON object
    
    # Create a list of dictionaries in the specified format
    data_to_save = []
    for idx, original_entry in enumerate(existing_data):
        entry = {
            "indoml_id": idx,
            "supergroup": transformed_labels[idx],  # Use transformed label as 'supergroup'
            "group": original_entry.get("group", "automotive detail unknown total"),  # Keep original group
            "module": original_entry.get("module", "automotive"),  # Keep original module
            "brand": original_entry.get("brand", "receipt all")  # Keep original brand
        }
        data_to_save.append(entry)
    
    # Save the modified data to a JSON file
    with open(output_file, 'w') as f:
        for entry in data_to_save:
            json.dump(entry, f)
            f.write('\n')  # Write each dictionary on a new line

# Example usage
key = 'supergroup'  # Replace with the actual key for the label encoder
predictions_file = "/kaggle/input/indoml-phase-2-predictions/preidcitions.predict"  # The path to the original predictions JSON file
save_transformed_labels(predictions, label_encoders, key, predictions_file, output_file="output.predict")