In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/indoml-phase2/train.features
/kaggle/input/indoml-phase2/final_test_data.features
/kaggle/input/indoml-phase2/train.labels
/kaggle/input/indoml-phase2/phase_2_test_set1.features


In [2]:
!pip install txtai

Collecting txtai
  Downloading txtai-7.5.0-py3-none-any.whl.metadata (28 kB)
Collecting faiss-cpu>=1.7.1.post2 (from txtai)
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading txtai-7.5.0-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.6/244.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu, txtai
Successfully installed faiss-cpu-1.9.0 txtai-7.5.0


In [26]:
def load_data(input_file, labels_file):
    input_data = pd.read_json(input_file, lines=True)
    labels_data = pd.read_json(labels_file, lines=True)
    
#     input_data = input_data[:10000]
#     labels_data = labels_data[:10000]
    
    # Merge the input and labels data on indoml_id
    merged_data = pd.merge(input_data, labels_data, on='indoml_id', how='inner')
    
    return merged_data

df = load_data('/kaggle/input/indoml-phase2/train.features', '/kaggle/input/indoml-phase2/train.labels')

In [27]:
df.head()

Unnamed: 0,indoml_id,description,retailer,price,supergroup,group,module,brand
0,0,1 adblue,organicorner,25.35,automotive,automotive detail unknown total,automotive,receipt all
1,1,1 car mat set,greenharbor,4.99,automotive,automotive detail unknown total,automotive,receipt all
2,2,1 cp rmx scrnwash,naturify,3.85,automotive,automotive detail unknown total,automotive,receipt all
3,3,1 diesel,ecogro,4.41,automotive,automotive detail unknown total,automotive,receipt all
4,4,1 unstoppable refrsher,greenharbor,3.0,automotive,automotive detail unknown total,automotive,receipt all


In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, supergroup_train, supergroup_test, group_train, group_test, module_train, module_test, brand_train, brand_test = train_test_split(
    df['description'], df['supergroup'], df['group'], df['module'], df['brand'], test_size=0.20, random_state=42)

In [29]:
from txtai.embeddings import Embeddings
from tqdm import tqdm

# Initialize the embeddings model
embeddings = Embeddings({"path": "sentence-transformers/all-MiniLM-L6-v2"})



In [30]:
def index_data(description):
    # Convert DataFrame into format suitable for txtai with tqdm progress bar
    documents = [(i, row, None) for i, row in tqdm(enumerate(description), total=len(description), desc="Indexing Data")]
    embeddings.index(documents)
    print("Data has been indexed successfully!")

# Function to search based on query
def search_class(query):
    # Search for the best match based on the query
    results = embeddings.search(query, 3)
    if results:
        idx = results[0][0]
        
        return df.iloc[idx]["supergroup"], df.iloc[idx]["group"], df.iloc[idx]["module"], df.iloc[idx]["brand"]
    return None

In [31]:
# Indexing the data
index_data(X_train)

Indexing Data: 100%|██████████| 449470/449470 [00:00<00:00, 933469.64it/s]


Data has been indexed successfully!


In [32]:
# Search query example
query = "m pitted black olive"
result = search_class(query)

if result:
    print(f"Class found: {result}")
else:
    print("No matching class found")

Class found: ('home do it yourself', 'home do it yourself detail unknown total', 'home do it yourself', 'receipt all')


In [33]:
# # Save the embeddings to a directory
# embeddings.save("path_to_save_embeddings")

In [34]:
# # Load the embeddings from the saved directory
# loaded_embeddings = Embeddings()
# loaded_embeddings.load("path_to_save_embeddings")

In [35]:
# test_df = pd.read_json('/kaggle/input/indoml-phase2/final_test_data.features',lines=True)

In [36]:
# for i in range(len(test_df)):
#     result = search_class(test_df.iloc[i]['description'])
#     print(result)

In [37]:
# def test(test_df):
#     # Initialize lists to store predictions
#     supergroups_list = []
#     groups_list = []
#     modules_list = []
#     brands_list = []

#     length_df = test_df.shape[0]
    
#     # Iterate through each row in the DataFrame
#     for i in range(length_df):
#         if i % 100 == 0:
#             print(f"Processing {i} of {length_df - 1}")
#         # Get predictions for the current row
#         predictions = search_class(test_df.iloc[i]['description'])
        
#         # Append predictions to respective lists
#         supergroups_list.append(predictions[0])
#         groups_list.append(predictions[1])
#         modules_list.append(predictions[2])
#         brands_list.append(predictions[3])

#     # Add predictions as new columns to the DataFrame
#     test_df['supergroup'] = supergroups_list
#     test_df['group'] = groups_list
#     test_df['module'] = modules_list
#     test_df['brand'] = brands_list
    
#     # Drop unnecessary columns
#     test_df.drop(columns=['description', 'retailer', 'price'], inplace=True)
    
#     # Save the modified DataFrame to a JSON file
#     test_df.to_json('predictions.predict', orient='records', lines=True)
    
#     print("Predictions saved to the predictions.predict file!")

In [38]:
# test(test_df)

In [39]:
def test(X_test, supergroup_test, group_test, module_test, brand_test):
    # Initialize lists to store predictions
    supergroups_list = []
    groups_list = []
    modules_list = []
    brands_list = []

    length_df = X_test.shape[0]
    
    # Iterate through each row in the Series
    for i in range(length_df):
        if i % 100 == 0:
            print(f"Processing {i} of {length_df - 1}")
        # Get predictions for the current description
        predictions = search_class(X_test.iloc[i])  # Access directly as it's a Series
        
        # Append predictions to respective lists
        supergroups_list.append(predictions[0])
        groups_list.append(predictions[1])
        modules_list.append(predictions[2])
        brands_list.append(predictions[3])

    # Convert lists to pandas Series for easy comparison with the test sets
    supergroup_pred = pd.Series(supergroups_list)
    group_pred = pd.Series(groups_list)
    module_pred = pd.Series(modules_list)
    brand_pred = pd.Series(brands_list)

    # Reset index to ensure comparison works
    supergroup_test = supergroup_test.reset_index(drop=True)
    group_test = group_test.reset_index(drop=True)
    module_test = module_test.reset_index(drop=True)
    brand_test = brand_test.reset_index(drop=True)

    # Calculate individual accuracies
    supergroup_accuracy = accuracy_score(supergroup_test, supergroup_pred)
    group_accuracy = accuracy_score(group_test, group_pred)
    module_accuracy = accuracy_score(module_test, module_pred)
    brand_accuracy = accuracy_score(brand_test, brand_pred)

    # Calculate item accuracy (all four must be correct)
    item_accuracy = accuracy_score(
        (supergroup_test == supergroup_pred) & 
        (group_test == group_pred) & 
        (module_test == module_pred) & 
        (brand_test == brand_pred), 
        [True] * len(supergroup_test)
    )

    # Calculate F1 scores for each class
    supergroup_f1 = f1_score(supergroup_test, supergroup_pred, average='weighted')
    group_f1 = f1_score(group_test, group_pred, average='weighted')
    module_f1 = f1_score(module_test, module_pred, average='weighted')
    brand_f1 = f1_score(brand_test, brand_pred, average='weighted')

    # Print accuracies and F1 scores
    print(f"Supergroup Accuracy: {supergroup_accuracy:.4f}, F1 Score: {supergroup_f1:.4f}")
    print(f"Group Accuracy: {group_accuracy:.4f}, F1 Score: {group_f1:.4f}")
    print(f"Module Accuracy: {module_accuracy:.4f}, F1 Score: {module_f1:.4f}")
    print(f"Brand Accuracy: {brand_accuracy:.4f}, F1 Score: {brand_f1:.4f}")
    print(f"Item Accuracy (all 4 correct): {item_accuracy:.4f}")

    # Save predictions to a DataFrame for output
    predictions_df = pd.DataFrame({
        'supergroup': supergroups_list,
        'group': groups_list,
        'module': modules_list,
        'brand': brands_list
    })
    
    # Concatenate with X_test for saving to JSON, if needed
    output_df = pd.concat([X_test.reset_index(drop=True), predictions_df], axis=1)

    # Save the modified DataFrame to a JSON file
    output_df.to_json('predictions.predict', orient='records', lines=True)
    
    print("Predictions saved to the predictions.predict file!")


In [40]:
test(X_test, supergroup_test, group_test, module_test, brand_test)

Processing 0 of 112367
Processing 100 of 112367
Processing 200 of 112367
Processing 300 of 112367
Processing 400 of 112367
Processing 500 of 112367
Processing 600 of 112367
Processing 700 of 112367
Processing 800 of 112367
Processing 900 of 112367
Processing 1000 of 112367
Processing 1100 of 112367
Processing 1200 of 112367
Processing 1300 of 112367
Processing 1400 of 112367
Processing 1500 of 112367
Processing 1600 of 112367
Processing 1700 of 112367
Processing 1800 of 112367
Processing 1900 of 112367
Processing 2000 of 112367
Processing 2100 of 112367
Processing 2200 of 112367
Processing 2300 of 112367
Processing 2400 of 112367
Processing 2500 of 112367
Processing 2600 of 112367
Processing 2700 of 112367
Processing 2800 of 112367
Processing 2900 of 112367
Processing 3000 of 112367
Processing 3100 of 112367
Processing 3200 of 112367
Processing 3300 of 112367
Processing 3400 of 112367
Processing 3500 of 112367
Processing 3600 of 112367
Processing 3700 of 112367
Processing 3800 of 11236

In [41]:
X_test

526511                        hotorsport news
119006    125 each growers harvest pure apple
247960         crispcorner chicken kormapilau
201914                   cp mayonnaise 1 down
136483                          ne pu mini sk
                         ...                 
275020                     british thole milk
97787          graze cocoa vanilla bar 4 30 g
446207                    nutmeg cotwool buds
337800                        us yum yjms x 4
130952                    invisibob sprunchie
Name: description, Length: 112368, dtype: object