In [58]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# import pycaret
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [22]:
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model.
# No explicit device is passed: sentence-transformers then selects CUDA when it is
# available and falls back to CPU otherwise, instead of failing on GPU-less machines.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [40]:
import wandb
from wandb.integration.xgboost import WandbCallback


In [56]:
def get_metrics(y_true, y_pred, zero_division=0):
    """
    Print and return the accuracy together with per-class precision and recall.

    Parameters:
    y_true: Ground-truth labels.
    y_pred: Predicted labels, aligned element-wise with ``y_true``.
    zero_division: Value reported for a class whose precision or recall is
        undefined (nothing predicted / nothing present). The default of 0 yields
        the same numbers scikit-learn reports by default, without emitting an
        UndefinedMetricWarning for every such class.

    Returns:
    tuple: ``(accuracy, precision, recall)``; precision and recall are the
        per-class results returned by scikit-learn with ``average=None``.
    """
    acc_score = accuracy_score(y_true, y_pred)
    # average=None keeps one score per class instead of a single aggregate value.
    prec_sc = precision_score(y_true, y_pred, average=None, zero_division=zero_division)
    rec_sc = recall_score(y_true, y_pred, average=None, zero_division=zero_division)
    print(f'Accuracy score: {acc_score}')
    print(f'Precision score: {prec_sc}')
    print(f'Recall score: {rec_sc}')
    return acc_score, prec_sc, rec_sc

In [2]:
# Read the CSV file into a DataFrame; the path is relative to the working directory.
df_1 = pd.read_csv('data/000000000001.csv')

In [3]:
# Preview the first rows of the loaded frame.
df_1.head()

Unnamed: 0,product_id,title,description,brand,ebay_category,localized_aspects_json,google_category_l1,google_category_l2,google_category_l3,google_category_l4,google_category_l5
0,51d7b4c5b1276f2886e2c011d178c3fd7c8bd105,CHRISTY DORAN: PERSPECTIVES (CD.),"""**** BRAND NEW FACTORY SEALED CD ****\n\nDISP...",,Music|CDs,"[{""key"":""Artist"",""value"":""CHRISTY DORAN""},{""ke...",783,855.0,,,
1,100bc92b5f8da193fd911b2840e7e2f2f2506fb5,"Il Collo E La Collana/Libanon, Rayon, Audio CD...","""The Monster Bookshop eBay Store\n\nProduct De...",,Music|CDs,"[{""key"":""Artist"",""value"":""Rayon""},{""key"":""Era""...",783,855.0,,,
2,c4c5c5b53e623b2e63e3d9a33bf3320f95c2073f,SOULBOUND - OBSYDIAN - Preorder - New CD - N72z,"""ID72p\n\nNOTE: Release date: 26-July 2024\n\n...",,Music|CDs,"[{""key"":""Artist"",""value"":""SOULBOUND""},{""key"":""...",783,855.0,,,
3,a4116bc36fdb86a4d143355956431013895f80a6,John Farnham - Full House (CD Album),"""VPCD0843\nSee listing pictures for tracklisti...",,Music|CDs,"[{""key"":""Artist"",""value"":""John Farnham""},{""key...",783,855.0,,,
4,c69ecd7521c75e132b0fb99a6e80153857d23ed0,Wagner - Overtures - Wagner CD PAVG The Cheap ...,"""Can't find what you're looking for?\n\n--\n\n...",,Music|CDs,"[{""key"":""Album Name"",""value"":""Wagner - Overtur...",783,855.0,,,


In [4]:
# Count the missing values in every column of the raw frame.
nan_counts = df_1.isnull().sum()

# Report one "<column> : <count>" line per column.
for column, count in zip(nan_counts.index, nan_counts.tolist()):
    print(f"{column} : {count}")

product_id : 0
title : 0
description : 50155
brand : 22546
ebay_category : 0
localized_aspects_json : 6495
google_category_l1 : 0
google_category_l2 : 808
google_category_l3 : 17084
google_category_l4 : 45134
google_category_l5 : 85290


In [9]:
from transformers import BertTokenizer, BertModel
import torch

def preprocess_dataframe(dataframe):
    """
    Build the level-1 and level-2 training frames from a raw product frame.

    The identifier, the deeper category levels (l3-l5) and the aspects JSON are
    discarded, every row that still contains a missing value is removed, and
    ``google_category_l2`` is cast to ``int``. The input frame is not modified.

    Parameters:
    dataframe (pd.DataFrame): Raw frame holding at least the dropped columns plus
        'title', 'description', 'brand', 'ebay_category', 'google_category_l1'
        and 'google_category_l2'.

    Returns:
    tuple: ``(dataframe_y1, dataframe_y2)`` - the four feature columns followed by
        'google_category_l1' and 'google_category_l2' respectively. Both frames
        have a fresh 0..n-1 index and are independent objects, so callers can add
        columns to them without triggering SettingWithCopyWarning.
    """
    columns_to_drop = ['product_id', 'google_category_l3', 'google_category_l4',
                       'google_category_l5', 'localized_aspects_json']
    feature_columns = ['title', 'description', 'brand', 'ebay_category']

    # drop() and dropna() both return new frames, leaving the caller's frame untouched.
    cleaned = dataframe.drop(columns=columns_to_drop).dropna(axis=0)
    # Safe to cast only after dropna(): NaN cannot be represented as a plain int.
    cleaned = cleaned.assign(google_category_l2=cleaned['google_category_l2'].astype(int))

    # reset_index(drop=True) without inplace returns a new frame rather than mutating
    # a column slice of `cleaned`, which is what made later assignments warn.
    dataframe_y1 = cleaned[feature_columns + ['google_category_l1']].reset_index(drop=True)
    dataframe_y2 = cleaned[feature_columns + ['google_category_l2']].reset_index(drop=True)
    return dataframe_y1, dataframe_y2


In [10]:
# Build the two training frames: the shared feature columns plus the L1 / L2 category target.
df_1_preproc_1, df_1_preproc_2 = preprocess_dataframe(df_1)

In [11]:
# Target columns (level-1 and level-2 category ids) extracted as NumPy arrays.
labels_1 = df_1_preproc_1['google_category_l1'].to_numpy()
labels_2 = df_1_preproc_2['google_category_l2'].to_numpy()

In [12]:
# Product titles as a plain Python list.
titles_l1 = list(df_1_preproc_1['title'].to_numpy())

In [13]:
# Product descriptions as a plain Python list.
description_l1 = list(df_1_preproc_1['description'].to_numpy())

In [14]:
import re
def clean_string(input_string):
    """
    Replace every character outside ``a-z``, ``A-Z`` and ``0-9`` with a space.

    The substitution is one-for-one, so the result has the same length as the
    input and may contain runs of consecutive spaces.

    Parameters:
    input_string (str): Text to sanitise.

    Returns:
    str: The text with each non-alphanumeric character swapped for a space.
    """
    non_alphanumeric = r'[^a-zA-Z0-9]'
    return re.sub(non_alphanumeric, ' ', input_string)

In [15]:
# Run every description through clean_string, preserving the original order.
description_l1_cleaned = [clean_string(raw_description) for raw_description in description_l1]

In [20]:

# Map each distinct eBay category string to an integer code.
label_encoder = LabelEncoder()
# NOTE(review): this adds a column in place to the frame returned by preprocess_dataframe.
df_1_preproc_1['ebay_category_int'] = label_encoder.fit_transform(df_1_preproc_1['ebay_category'])

In [21]:
# Integer-encoded eBay category as a NumPy array.
category_feature = df_1_preproc_1['ebay_category_int'].to_numpy()

In [24]:
# Embed the cleaned descriptions; the trailing expression displays the resulting array shape.
desc_l1_cleaned_embeddings = model.encode(description_l1_cleaned)
desc_l1_cleaned_embeddings.shape

(30287, 384)

In [25]:
# Embed the titles; the trailing expression displays the resulting array shape.
titles_l1_embeddings = model.encode(titles_l1)
titles_l1_embeddings.shape

(30287, 384)

In [26]:
# Brand names as a plain Python list, then embedded with the same model.
brand_l1  = list(df_1_preproc_1['brand'].to_numpy())
brand_l1_embeddings = model.encode(brand_l1)
brand_l1_embeddings.shape

(30287, 384)

In [27]:
# Insert a new axis at position 1 so the category codes form a single column
# that can be stacked next to the embedding matrices.
category_feature_ = category_feature[:, np.newaxis]
category_feature_.shape

(30287, 1)

In [28]:
# Feature matrix without descriptions: title embeddings | brand embeddings | category code.
X = np.concatenate(
    (titles_l1_embeddings, brand_l1_embeddings, category_feature_),
    axis=1,
)

In [57]:
# Feature matrix with descriptions:
# description embeddings | title embeddings | brand embeddings | category code.
X_desc = np.concatenate(
    (desc_l1_cleaned_embeddings, titles_l1_embeddings, brand_l1_embeddings, category_feature_),
    axis=1,
)
X_desc.shape

(30287, 1153)

In [29]:
# Display the shape of the description-free feature matrix.
X.shape

(30287, 769)

In [42]:
import xgboost as xgb
# NOTE(review): this rebinds `model`, which held the SentenceTransformer used by the
# encode calls earlier in the notebook; re-running an embedding cell after this one
# would call `.encode` on the classifier. Consider a distinct name such as `clf`.
# NOTE(review): WandbCallback is constructed before any visible wandb.init() call —
# confirm a run is active at this point.
model = xgb.XGBClassifier(callbacks=[WandbCallback(log_model=True)])

In [32]:
# Maps each raw google_category_l1 id to a contiguous class index (0..20).
# NOTE(review): the same mapping is repeated inside convert_labels; keep the two in sync.
dict_for_label_conversion = {1:0, 772:1, 8:2, 141:3, 783:4, 536:5, 537:6, 922:7, 412:8, 166:9, 2092:10, 436:11, 5181:12, 469:13, 1239:14, 988:15, 222:16, 5605:17, 632:18, 111:19, 888:20}

In [33]:
# Distinct raw category ids present in the data.
labels_unique = list(set(labels_1))
# Translate every raw category id into its contiguous class index.
labels_1_converted = [dict_for_label_conversion[raw_label] for raw_label in labels_1]

In [34]:
def convert_labels(labels):
    """
    Translate raw category ids into contiguous class indices.

    Parameters:
    labels (iterable): Raw category ids; each must be one of the 21 known ids.

    Returns:
    list: The class index (0..20) for every input id, in input order.

    Raises:
    KeyError: If an id is not one of the known ids.
    """
    # A known id's position in this tuple is its class index.
    category_ids = (1, 772, 8, 141, 783, 536, 537, 922, 412, 166, 2092,
                    436, 5181, 469, 1239, 988, 222, 5605, 632, 111, 888)
    index_by_category = {category_id: position
                         for position, category_id in enumerate(category_ids)}
    return [index_by_category[label] for label in labels]
    

In [59]:
# 80/20 split of the description-augmented features, with a fixed seed for reproducibility.
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    X_desc, labels_1_converted, test_size=0.2, random_state=42)

In [35]:

# 80/20 split of the description-free features, using the same seed as the split of X_desc.
X_train, X_test, y_train, y_test = train_test_split(
    X, labels_1_converted, test_size=0.2, random_state=42)

In [60]:
# Start a W&B run for the description + title + brand + category feature set.
wandb.init(name = 'desc|title|brand|category-1')
# Train on the description-augmented features and evaluate on the held-out split.
model.fit(X_train_d, y_train_d)
y_pred = model.predict(X_test_d)
acc, pr, rec = get_metrics(y_test_d, y_pred)


Accuracy score: 0.8715747771541763
Precision score: [0.94736842 0.         0.81107492 0.92682927 0.97674419 0.79403409
 0.90909091 0.76       1.         0.93497364 1.         0.90909091
 1.         0.8525641  0.76712329 0.75352113 0.88036117 0.
 0.79423077 0.7        0.90888806]
Recall score: [0.7826087  0.         0.778125   0.82608696 0.85714286 0.88170347
 0.55555556 0.44186047 0.5        0.93497364 0.5        0.6779661
 0.44       0.86363636 0.85714286 0.82945736 0.89861751 0.
 0.60291971 0.18181818 0.98667743]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NameError: name 'rc' is not defined

In [62]:
# Close the active W&B run.
wandb.finish()

VBox(children=(Label(value='5.593 MB of 11.124 MB uploaded\r'), FloatProgress(value=0.502811361971073, max=1.0…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
accuracy,0.87157
epoch,99.0


In [43]:
# run = wandb.init(
#     # set the wandb project where this run will be logged
#     project="demo_test_task",
# )
# NOTE(review): wandb.init() is commented out above, so this fit relies on a W&B run
# that is already active when the cell executes — confirm.
# Train on the title + brand + category features (no description embeddings), then close the run.
model.fit(X_train, y_train)
wandb.finish()

VBox(children=(Label(value='8.318 MB of 11.486 MB uploaded\r'), FloatProgress(value=0.7242187741307452, max=1.…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
epoch,99


In [44]:
# Predict class indices for the held-out description-free features.
pred_y = model.predict(X_test)

In [45]:
# Show the predicted class indices.
print(pred_y)

[20 20  4 ...  5  9 20]


In [51]:
# Accuracy and per-class precision / recall for the description-free model.
get_metrics(y_test, pred_y)

Accuracy score: 0.8697589963684385
Precision score: [0.9047619  0.         0.82154882 0.87804878 0.96183206 0.78303199
 0.8125     0.83333333 1.         0.93019197 0.         0.925
 0.77777778 0.8490566  0.79716981 0.75342466 0.87586207 0.
 0.78007519 0.88888889 0.91014169]
Recall score: [0.82608696 0.         0.7625     0.7826087  0.85714286 0.88801262
 0.72222222 0.34883721 0.5625     0.93673111 0.         0.62711864
 0.28       0.87662338 0.8622449  0.85271318 0.87788018 0.
 0.60583942 0.20779221 0.98546629]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Error: You must call wandb.init() before wandb.log()