# For Top Category ID

## Importing Libraries - Start

In [None]:
import pandas as pd
import numpy as np
import re
import os

### nltk libraries

In [None]:
!pip install nltk
import nltk
from nltk.corpus import words

# Download the 'words' corpus if not already downloaded
nltk.download('words')

from nltk.corpus import stopwords
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package words to /usr/share/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
!pip install beautifulsoup4
from bs4 import BeautifulSoup



### Libraries or statistics

In [None]:
from scipy.stats import chi2_contingency
from scipy.stats import ks_2samp

### Libraries for model training

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, f1_score,make_scorer
from tqdm import tqdm
import joblib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

## Importing Dataset - Start

In [None]:
PATH = f"/kaggle/input/etsy-dataset/data/2024"

In [None]:
parquet_filenames_train = os.listdir("/kaggle/input/etsy-dataset/data/2024/train")
parquet_filenames_test = os.listdir("/kaggle/input/etsy-dataset/data/2024/test")

In [None]:
def read_data(folder, filenames):
    """ Read parquet data from mutiple files """

    dataframes = [
        pd.read_parquet(f"{PATH}/{folder}/{filename}")
        for filename in filenames
    ]

    return pd.concat(dataframes)

In [None]:
%%time

df_train = read_data("train", parquet_filenames_train[:])
df_test = read_data("test", parquet_filenames_test[:])

CPU times: user 23.5 s, sys: 48.1 s, total: 1min 11s
Wall time: 3min 30s


## Exploring the Dataset - Start

In [None]:
len(df_train)

229624

In [None]:
df_train.head()

Unnamed: 0,product_id,title,description,tags,type,room,craft_type,recipient,material,occasion,...,bottom_category_text,top_category_id,top_category_text,primary_color_id,primary_color_text,secondary_color_id,secondary_color_text,image/encoded,image/width,image/height
0,1278122912,Dragon and Snake Custom Nike AF1 Custom Chines...,Dragon and Snake Custom Nike AF1 Custom Chines...,"handpainted sneaker,sneaker,unique gift,custom...",physical,,,,,birthday,...,shoes.gender_neutral_adult_shoes.sneakers_and_...,12,shoes,17,white,1,black,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,596
1,698470598,"Canvas Shoes Trainers, Kids to Adults, Hand C...","Fantastic Gold Edged Butterfly Fabric, yellow ...","Butterflies,Butterfly shoes,Butterfly presents...",physical,,,,,birthday,...,shoes.gender_neutral_adult_shoes.sneakers_and_...,12,shoes,18,yellow,13,rainbow,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,427
2,1437216455,Sneaker Keychain,Ideal gift for exclusive sneaker lovers\n\nThe...,,physical,,,,,anniversary,...,shoes.gender_neutral_adult_shoes.sneakers_and_...,12,shoes,17,white,14,red,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,380
3,1502633808,Adidas Superstar Custom Sneakers Rose Shoes Bl...,Adidas Superstar Custom Sneakers Rose\n\n· Pai...,"Superstar Custom,Air Force 1 Rose,Rose Custom ...",physical,,,,,birthday,...,shoes.gender_neutral_adult_shoes.sneakers_and_...,12,shoes,17,white,1,black,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570
4,1474730171,"Madras Shoes for Men, Preppy Madras Patchwork ...",Looking for a classic madras shoe for men? Her...,"madras shoes,men's madras shoes,lace-up canvas...",physical,,,,,,...,shoes.gender_neutral_adult_shoes.sneakers_and_...,12,shoes,2,blue,9,green,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570


In [None]:
df_train=df_train.replace('', np.nan)

In [None]:
df_test=df_test.replace('', np.nan)

Get Missing Values in DF Columns

In [None]:
df_train.isna().sum()

product_id                   0
title                      810
description                810
tags                     31169
type                       348
room                    220686
craft_type              201862
recipient               218815
material                209384
occasion                171550
holiday                 183359
art_subject             226885
style                   213730
shape                   227137
pattern                 218042
bottom_category_id           0
bottom_category_text         0
top_category_id              0
top_category_text            0
primary_color_id             0
primary_color_text           0
secondary_color_id           0
secondary_color_text         0
image/encoded                0
image/width                  0
image/height                 0
dtype: int64

As we can see for columns room,craft_type,recipient,material,occasion,holiday,art_subject,style,shape, pattern the missing values is very high

## Sampling the Dataset For top Category ID - Start

Since the dataset is huge it will impact the computation and resources. In order to train the dataset efficiently we will sample the dataset accordingly. Let check the percentage distribution or top_category_id

In [None]:
df_train['top_category_id'].value_counts()

top_category_id
8     54600
6     33393
5     30143
13    13835
0     12416
10    12207
3     11327
14    10143
12     8892
1      8716
9      7822
7      7474
2      6650
4      6262
11     5744
Name: count, dtype: int64

In [None]:
df_train['top_category_id'].value_counts(normalize=True) * 100

top_category_id
8     23.778002
6     14.542469
5     13.127112
13     6.025067
0      5.407100
10     5.316082
3      4.932847
14     4.417221
12     3.872418
1      3.795770
9      3.406438
7      3.254886
2      2.896039
4      2.727067
11     2.501481
Name: proportion, dtype: float64

Now since we have percentage distribution lets do sampling for sample data length calculation Using the link https://www.calculator.net/sample-size-calculator.html received the sample size as 15517

![image.png](attachment:ec55b84b-4083-4280-a1e0-8cb53c449e1c.png)

Using Stratified sampling

In [None]:
sample_size=15517

In [None]:
samples_per_category = (df_train.groupby('top_category_id').size() / len(df_train) * sample_size).astype(int)
def stratified_sampling(group):
    return group.sample(samples_per_category[group.name])

sample_df1 = df_train.groupby('top_category_id', group_keys=False).apply(stratified_sampling)
sample_df1 = sample_df1.sample(frac=1).reset_index(drop=True)
sample_df1.reset_index(inplace=True)

  sample_df1 = df_train.groupby('top_category_id', group_keys=False).apply(stratified_sampling)


In [None]:
len(sample_df1)

15509

In [None]:
sample_df1['top_category_id'].value_counts()

top_category_id
8     3689
6     2256
5     2036
13     934
0      839
10     824
3      765
14     685
12     600
1      588
9      528
7      505
2      449
4      423
11     388
Name: count, dtype: int64

Now lets check statistically if sample df is equal to df_train using chi square test

In [None]:
original_counts = df_train['top_category_id'].value_counts()
sample_counts = sample_df1['top_category_id'].value_counts()

contingency_table = pd.concat([original_counts, sample_counts], axis=1, keys=['Original', 'Sample']).fillna(0)

chi2, p_value, _, _ = chi2_contingency(contingency_table)

print("Chi-square statistic:", chi2)
print("P-value:", p_value)

Chi-square statistic: 0.002760471378464443
P-value: 1.0


Checking Kolmogorov-Smirnov Test as well

In [None]:
ecdf_original = df_train['top_category_id'].value_counts(normalize=True).sort_index().cumsum()
ecdf_sample = sample_df1['top_category_id'].value_counts(normalize=True).sort_index().cumsum()

# Perform Kolmogorov-Smirnov Test
ks_statistic, p_value = ks_2samp(ecdf_original, ecdf_sample)

print("Kolmogorov-Smirnov Test Statistic:", ks_statistic)
print("P-value:", p_value)

Kolmogorov-Smirnov Test Statistic: 0.06666666666666667
P-value: 0.9999999999990623


  ks_statistic, p_value = ks_2samp(ecdf_original, ecdf_sample)


Now since the p-value is 1 we will accept the null hypothesis that is sample and population dataframes are the same

## Data Cleaning

Now lets perform data cleaning on our sampled_df to fit into machine learning model later on and predict the top_category_id

In [None]:
sample_df1.isna().sum()

index                       0
product_id                  0
title                      48
description                48
tags                     2125
type                       29
room                    14867
craft_type              13685
recipient               14786
material                14166
occasion                11495
holiday                 12315
art_subject             15334
style                   14435
shape                   15342
pattern                 14745
bottom_category_id          0
bottom_category_text        0
top_category_id             0
top_category_text           0
primary_color_id            0
primary_color_text          0
secondary_color_id          0
secondary_color_text        0
image/encoded               0
image/width                 0
image/height                0
dtype: int64

Lets consider columns which has less missing records like type

In [None]:
sample_df1[sample_df1['type'].isna()].head(4)

Unnamed: 0,index,product_id,title,description,tags,type,room,craft_type,recipient,material,...,bottom_category_text,top_category_id,top_category_text,primary_color_id,primary_color_text,secondary_color_id,secondary_color_text,image/encoded,image/width,image/height
1253,1253,893892857,Denim Cocktail Napkins (Set of 4) - Golden Pal...,Set of four 10” x 10” printed fabric cocktail ...,"Spoonflower Fabric,Party Decor,Custom Table Li...",,,,,cotton,...,home_and_living.kitchen_and_dining.linens.tabl...,8,home_and_living,2,blue,17,white,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570
2416,2416,662514853,Clouds Placemats (Set of 2) - Mountain Mist by...,A unique set of 2 cloth placemats for special ...,"Spoonflower Cotton,Sateen Tablecloth,Printed T...",,,,,linen,...,home_and_living.kitchen_and_dining.linens.tabl...,8,home_and_living,2,blue,17,white,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570
2439,2439,891581313,Standard/Leather/Jersey sewing machine needles...,Sewing machine needle assortment for various u...,,,,sewing,,,...,craft_supplies_and_tools.tools_and_equipment,6,craft_supplies_and_tools,0,beige,0,beige,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570
2516,2516,1033855939,2 Artistically Faceted Multi-Hue Fluorite Nugg...,Vibrant Fluorite Nugget Beads - A Pair of Natu...,"fluorite bead,gemstone bead,striped fluorite,a...",,,jewelry making,,,...,craft_supplies_and_tools.beads_gems_and_caboch...,6,craft_supplies_and_tools,9,green,2,blue,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570


Lets check if index 298 and 300 have same types or not

In [None]:
sample_df1['type'].iloc[298]

'physical'

In [None]:
sample_df1['type'].iloc[300]

'physical'

Ohh both the values are same.Lets check another value 1438

In [None]:
sample_df1['type'].iloc[1437]

'physical'

In [None]:
sample_df1['type'].iloc[1439]

'physical'

Looks like the missing values are have adjecant values same.Lets do forward fill

In [None]:
sample_df1['type'] = sample_df1['type'].replace(np.nan, method='ffill')

  sample_df1['type'] = sample_df1['type'].replace(np.nan, method='ffill')


In [None]:
df_test['type'] = df_test['type'].replace(np.nan, method='ffill')

In [None]:
sample_df1.isna().sum()

index                       0
product_id                  0
title                      48
description                48
tags                     2125
type                        0
room                    14867
craft_type              13685
recipient               14786
material                14166
occasion                11495
holiday                 12315
art_subject             15334
style                   14435
shape                   15342
pattern                 14745
bottom_category_id          0
bottom_category_text        0
top_category_id             0
top_category_text           0
primary_color_id            0
primary_color_text          0
secondary_color_id          0
secondary_color_text        0
image/encoded               0
image/width                 0
image/height                0
dtype: int64

Considering other columns like title,tags and description which are in text we will keep it blank as we cannot impute

We will also drop columns from our feature like room,craft_type,recipient,material,occasion,holiday,art_subject,style,shape, pattern whose missing value rate is extremely high (>50%)

In [None]:
sample_df1.head(4)

Unnamed: 0,index,product_id,title,description,tags,type,room,craft_type,recipient,material,...,bottom_category_text,top_category_id,top_category_text,primary_color_id,primary_color_text,secondary_color_id,secondary_color_text,image/encoded,image/width,image/height
0,0,1431873531,"Mum, Mummy or Mammy &#39;In A Million&#39; Pin...",Tell your mum how much she means to you with t...,"new mum gift,gifts for mum,mothers day gift,ts...",physical,,,,,...,clothing.gender_neutral_adult_clothing.pajamas...,5,clothing,11,pink,17,white,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,432
1,1,1575820277,Cast Iron Fusion Mineral Paint FMP Self Levell...,"Cast Iron\nSturdy and bold, Cast Iron is a tru...","Cast Iron,FMP,Fusion Mineral Paint,Resin Based...",physical,,,,,...,craft_supplies_and_tools.paints_inks_and_dyes,6,craft_supplies_and_tools,1,black,19,other,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,603
2,2,1297954951,4th Birthday Crown | Fourth Birthday Crown | 4...,● This listing ●\nPASTEL RAINBOW glitter crown...,"birthday hat girl,birthday crown,4th birthday ...",physical,,,,,...,paper_and_party_supplies.party_supplies.party_...,10,paper_and_party_supplies,13,rainbow,11,pink,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,456
3,3,1655635863,Teddy Bear Valentines Pyjamas,Beautiful Valentine&#39;s Day pyjamas. \n\nTh...,"Pyjamas,boys,gift,ideas,valentines,love,family...",physical,,,,,...,clothing.girls_clothing.pajamas_and_robes.paja...,5,clothing,17,white,11,pink,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570


Lets do text cleaning for columns (title,description and tags).But first lets convert na records to ''

In [None]:
sample_df1.fillna('',inplace=True)

### Text cleaning

In [None]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

def special_characters_removal(sentence):
    return re.sub(r'[^A-Za-z0-9\s]', '', sentence)


english_words = set(words.words())

def filter_english_words(sentence):
    return ' '.join(word for word in sentence.split() if word.lower() in english_words)

stop_words = set(stopwords.words('english'))

def remove_stop_words(sentence):
    return ' '.join(word for word in sentence.split() if word.lower() not in stop_words)

def convert_numbers_to_blank(sentence):
    return re.sub(r'\b\d+\b', '', sentence)

lemmatizer = WordNetLemmatizer()

def lemmatize_sentence(sentence):
    tokenized_words = word_tokenize(sentence)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokenized_words]
    return ' '.join(lemmatized_words)

In [None]:
def text_cleaning(text):
    # \n removal
    text = text.replace(r'\n', ' ')

    # Remove email-address
    text = re.sub(r'\S+@\S+', '', text)

    # Remove URLS which start from http
    text = re.sub(r'http\S+|www\S+', '', text)

    #Remove Html tags from text
    text=remove_html_tags(text)

    #Remove special characters
    text=special_characters_removal(text)

    #Convert Numbers to blank
    text=convert_numbers_to_blank(text)

    # Removal Non AsCII Characters
    text = text.encode('ascii', 'ignore').decode('utf-8')

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Lowercasing the text
    text=text.lower()

    #Remove Non ENglish Wprds from sentence
    text=filter_english_words(text)

    #Remove stopwords from sentence
    text=remove_stop_words(text)

    return text

In [None]:
sample_df1['description'] = sample_df1['description'].apply(text_cleaning)
sample_df1['title'] = sample_df1['title'].apply(text_cleaning)
sample_df1['tags'] = sample_df1['tags'].apply(text_cleaning)

  soup = BeautifulSoup(text, 'html.parser')


In [None]:
df_test['description'] = df_test['description'].apply(text_cleaning)
df_test['title'] = df_test['title'].apply(text_cleaning)
df_test['tags'] = df_test['tags'].apply(text_cleaning)

In [None]:
sample_df1.head(10)

Unnamed: 0,index,product_id,title,description,tags,type,room,craft_type,recipient,material,...,bottom_category_text,top_category_id,top_category_text,primary_color_id,primary_color_text,secondary_color_id,secondary_color_text,image/encoded,image/width,image/height
0,0,1431873531,mum mummy mammy million pink tartan gift day,tell mum much gorgeous pink tartan option chan...,new mum day day day tartan day mummy,physical,,,,,...,clothing.gender_neutral_adult_clothing.pajamas...,5,clothing,11,pink,17,white,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,432
1,1,1575820277,cast iron fusion mineral paint self furniture ...,cast iron sturdy bold cast iron true shade str...,cast mineral based iron cast iron iron iron paint,physical,,,,,...,craft_supplies_and_tools.paints_inks_and_dyes,6,craft_supplies_and_tools,1,black,19,other,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,603
2,2,1297954951,birthday crown fourth birthday crown birthday ...,listing pastel rainbow glitter crown white pom...,birthday hat birthday birthday birthday birthd...,physical,,,,,...,paper_and_party_supplies.party_supplies.party_...,10,paper_and_party_supplies,13,rainbow,11,pink,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,456
3,3,1655635863,bear,beautiful day top quality cotton name small am...,,physical,,,,,...,clothing.girls_clothing.pajamas_and_robes.paja...,5,clothing,17,white,11,pink,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570
4,4,1131451819,size hand,knitted size color mottled variegated,,physical,,,,,...,clothing.womens_clothing.socks_and_hosiery.hos...,5,clothing,18,yellow,9,green,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,698
5,5,1534917256,silver l purple jade silver l sterling silver ...,solid sterling silver jewelry recycle metal si...,sterling silver day jade gift,physical,,,women,silver,...,weddings.jewelry.cuff_links_and_tie_tacks.cuff...,14,weddings,16,silver,12,purple,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,513
6,6,1245410829,salt water silk paint dixie belle paint compan...,shipping unable offer free shipping however ab...,white silk water mineral owl belle paint,physical,,kids' crafts,,,...,craft_supplies_and_tools.paints_inks_and_dyes....,6,craft_supplies_and_tools,17,white,19,other,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,380
7,7,757189264,pair chapman regency crystal brass lamp glamour,pair heavy crystal brass lamp luxury lighting ...,room,physical,bedroom,,,glass,...,home_and_living.lighting.lamps_shades_and_base...,8,home_and_living,7,gold,5,clear,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,456
8,8,1568248126,framed herbarium flower illustration dried flo...,botanical heart want flowery creepy pleasure c...,,physical,,,,,...,home_and_living.home_decor.flower_arrangements...,8,home_and_living,12,purple,14,red,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,718
9,9,511757925,custom monogram black white car cup holder coa...,custom car cup holder fun way car car make gre...,car cup cup cup car car car,physical,,,women,,...,electronics_and_accessories.car_parts_and_acce...,7,electronics_and_accessories,2,blue,17,white,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,1013


In [None]:
sample_df1=sample_df1[['product_id','title','description','tags','type','top_category_id','top_category_text']]

In [None]:
#Saving the dataframe to csv file
sample_df1.to_csv('/kaggle/working/sample_df1_train.csv')

# Data Cleaning - End

## Model Training for Top Category ID - Start

### Multinomial NB Model using K fold cross validation with grid search

In [None]:
label_encoder = LabelEncoder()

# Fit label encoder and transform 'top_category_id' column
sample_df1['top_category_id_encoded'] = label_encoder.fit_transform(sample_df1['top_category_id'])

In [None]:
train_df, test_df = train_test_split(sample_df1, test_size=0.2, random_state=42)

X_train = train_df['title'] + ' ' + train_df['description'] + ' ' + train_df['tags'] + ' ' + train_df['type']
y_train = train_df['top_category_id_encoded']

X_test = test_df['title']+ ' ' + test_df['description'] + ' ' + test_df['tags'] + ' ' + test_df['type']
y_test = test_df['top_category_id_encoded']

X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

In [None]:
# Define k-fold cross-validation
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Create Naive Bayes pipeline
nb_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])

# nb_pipeline.to(device)

# Define parameter grid for GridSearchCV
param_grid = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # Unigrams or bigrams
    'tfidf__use_idf': (True, False),        # Use IDF or not
    'clf__alpha': [0.1, 0.5, 1.0],          # Smoothing parameter for MultinomialNB
}



# Perform grid search cross-validation
grid_search = GridSearchCV(nb_pipeline, param_grid, cv=k_fold, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get best hyperparameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)

# Fit the model on the entire training set with best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# best_model.to(device)

joblib.dump(best_model, '/kaggle/working/best_model_top_category_id.pkl')

# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy and F1-score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

# Print accuracy and F1-score
print('Accuracy:', accuracy)
print('F1-score:', f1)
print(classification_report(y_test, y_pred))

Best Hyperparameters: {'clf__alpha': 0.1, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
Best Score: 0.6722817831311684
Accuracy: 0.6814958091553837
F1-score: 0.6619647424879348
              precision    recall  f1-score   support

           0       0.75      0.41      0.53       184
           1       0.76      0.17      0.28       129
           2       0.80      0.57      0.67        89
           3       0.84      0.50      0.63       152
           4       0.93      0.45      0.61        86
           5       0.71      0.96      0.82       414
           6       0.60      0.72      0.65       438
           7       0.92      0.53      0.67       107
           8       0.61      0.93      0.74       736
           9       0.85      0.52      0.64       110
          10       0.63      0.57      0.60       159
          11       0.95      0.56      0.70        72
          12       0.92      0.70      0.79       109
          13       0.80      0.45      0.57       191
     

### Stochastic Gradient Descent with Grid CV

In [None]:


# Loop through each learning rate

pipeline  = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', random_state=42, max_iter=10, tol=None)),
               ])

param_grid = {
'clf__alpha': [1e-3, 1e-4, 1e-5],
'clf__penalty': ['l2', 'l1'],
'clf__learning_rate': ['optimal', 'adaptive']
}

# Define macro F1-score as the scoring metric
scorer = make_scorer(f1_score, average='macro')

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring=scorer)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_


print("Best Parameters:", best_params)
print("Best Score (Macro F1):", best_score)

# Predict on the test set using the best model
y_pred = grid_search.predict(X_test)

# Evaluate the model
f1_macro = f1_score(y_test, y_pred, average='macro')

print("Macro F1-score on Test Set:", f1_macro)

30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/linear_model/_stochastic_gradient.py", line 892, in fit
    self._more_validate_params()
  File "/opt/conda/lib/python3.10/site-packages/sklearn/linear_model/_stochastic_gradient.py", line 149, in _more_validate_par

Best Parameters: {'clf__alpha': 0.0001, 'clf__learning_rate': 'optimal', 'clf__penalty': 'l2'}
Best Score (Macro F1): 0.7251660932261909
Macro F1-score on Test Set: 0.7360947310092365


### Support Vector Machine

In [None]:
# Preprocessing the text data
vectorizer = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_train_counts = vectorizer.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Training the SVM model
svm = SVC(kernel='linear', C=1)
svm.fit(X_train_tfidf, y_train)

joblib.dump(svm, '/kaggle/working/svm_model_top_category_id.pkl')

# Preprocessing the test data
X_test_counts = vectorizer.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Predicting on the test data
y_pred = svm.predict(X_test_tfidf)

# Evaluating the model
f1 = f1_score(y_test, y_pred, average='weighted')
print('F1-score:', f1)

F1-score: 0.7441337704002042


In [None]:
f1 = f1_score(y_test, y_pred, average='weighted')
print(f1)

0.7441337704002042


## For Test Dataset

Predict the output using SVM Model

In [None]:
X_test = df_test['title']+ ' ' + df_test['description'] + ' ' + df_test['tags'] + ' ' + df_test['type']

In [None]:
df_test.head()

Unnamed: 0,product_id,title,description,tags,type,room,craft_type,recipient,material,occasion,holiday,art_subject,style,shape,pattern,image/encoded,image/width,image/height,top_category_id
0,815216520,A First Doll Book. &quot; Peggy and Me&quot;!,"Vintage book! 1968 A First Doll Book, &quot; ...","flip book,vintage,Peggy and Me,change dresses",physical,,,,,,,,,,,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,760,4
1,870948450,"Abigayil: The Story of the Cat at the Manger, ...",Rouben Mamoulian was an American film and thea...,"one of a kind,authentic,original,gift,special,...",physical,,,,,,,,,,,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,760,4
2,1111113690,"No Fighting, No Biting! by Else Holmelund Mina...","A fun little vintage book “No Fighting, No Bit...","Else Minarik,Maurice Sendak,Vintage Children,L...",physical,,,,,,,,,,,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,379,4
3,1510466600,Ukrainian book. &quot;Magic flashlight&quot;. ...,Ukrainian book.\n&quot;Magic flashlight&quot;....,"Books,Ukrainian,New,Gift,Colorful,Best,Stories...",physical,,,,,,,,,,,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,766,4
4,1035734450,The Ugly Butterfly,PreOrder this imaginative story about the life...,"moth metamorphosis,caterpillar bug,butterfly c...",physical,,,,,,,,,,,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,719,8


In [None]:
X_test_counts = vectorizer.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Initialize list to store predictions
y_pred = []

# Predicting on the test data
for i in tqdm(range(X_test_tfidf.shape[0]), desc="Predicting"):
    y_pred.append(svm.predict(X_test_tfidf[i]))

# Convert predictions to array
y_pred = np.array(y_pred)

Predicting: 100%|██████████| 25514/25514 [06:10<00:00, 68.95it/s]


In [None]:
df_test.head()

Unnamed: 0,product_id,title,description,tags,type,room,craft_type,recipient,material,occasion,holiday,art_subject,style,shape,pattern,image/encoded,image/width,image/height,top_category_id
0,815216520,A First Doll Book. &quot; Peggy and Me&quot;!,"Vintage book! 1968 A First Doll Book, &quot; ...","flip book,vintage,Peggy and Me,change dresses",physical,,,,,,,,,,,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,760,4
1,870948450,"Abigayil: The Story of the Cat at the Manger, ...",Rouben Mamoulian was an American film and thea...,"one of a kind,authentic,original,gift,special,...",physical,,,,,,,,,,,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,760,4
2,1111113690,"No Fighting, No Biting! by Else Holmelund Mina...","A fun little vintage book “No Fighting, No Bit...","Else Minarik,Maurice Sendak,Vintage Children,L...",physical,,,,,,,,,,,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,379,4
3,1510466600,Ukrainian book. &quot;Magic flashlight&quot;. ...,Ukrainian book.\n&quot;Magic flashlight&quot;....,"Books,Ukrainian,New,Gift,Colorful,Best,Stories...",physical,,,,,,,,,,,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,766,4
4,1035734450,The Ugly Butterfly,PreOrder this imaginative story about the life...,"moth metamorphosis,caterpillar bug,butterfly c...",physical,,,,,,,,,,,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,719,8


In [None]:
df_test['top_category_id']=y_pred

In [None]:
unique_combinations_dict = unique_combinations.set_index('top_category_id')['top_category_text'].to_dict()

In [None]:
df_test['top_category_text'] = df_test['top_category_id'].map(unique_combinations_dict)

In [None]:
df_test[['product_id','title','tags','type','top_category_id','top_category_text']].to_csv('/kaggle/working/top_category_predictions.csv')

# For Bottom Category ID

## Importing Libraries - Start

In [None]:
import pandas as pd
import numpy as np
import re

### nltk libraries

In [None]:
!pip install nltk
import nltk
from nltk.corpus import words

# Download the 'words' corpus if not already downloaded
nltk.download('words')

from nltk.corpus import stopwords
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package words to /usr/share/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
!pip install beautifulsoup4
from bs4 import BeautifulSoup



### Libraries for statistics

In [None]:
from scipy.stats import chi2_contingency
from scipy.stats import ks_2samp

### Libraries for model training

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, f1_score,make_scorer
from tqdm import tqdm
import joblib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

## Importing Dataset

In [None]:
PATH = f"/kaggle/input/etsy-dataset/data/2024"

In [None]:
import os
parquet_filenames_train = os.listdir("/kaggle/input/etsy-dataset/data/2024/train")
parquet_filenames_test = os.listdir("/kaggle/input/etsy-dataset/data/2024/test")

In [None]:
def read_data(folder, filenames):
    """ Read parquet data from mutiple files """

    dataframes = [
        pd.read_parquet(f"{PATH}/{folder}/{filename}")
        for filename in filenames
    ]

    return pd.concat(dataframes)

In [None]:
%%time

df_train = read_data("train", parquet_filenames_train[:])
df_test = read_data("test", parquet_filenames_test[:])

CPU times: user 23 s, sys: 57.9 s, total: 1min 20s
Wall time: 2min 44s


## Data Exploration

In [None]:
f"Number of rows in train: {len(df_train):,}"

'Number of rows in train: 229,624'

In [None]:
f"Number of unique products in train: {len(df_train['product_id'].unique()):,}"

'Number of unique products in train: 229,624'

In [None]:
f"Number of columns in train: {len(df_train.columns):,}"

'Number of columns in train: 26'

In [None]:
list(df_train.columns)

['product_id',
 'title',
 'description',
 'tags',
 'type',
 'room',
 'craft_type',
 'recipient',
 'material',
 'occasion',
 'holiday',
 'art_subject',
 'style',
 'shape',
 'pattern',
 'bottom_category_id',
 'bottom_category_text',
 'top_category_id',
 'top_category_text',
 'primary_color_id',
 'primary_color_text',
 'secondary_color_id',
 'secondary_color_text',
 'image/encoded',
 'image/width',
 'image/height']

In [None]:
#COPIED
print(f"Number of top categories in train: {len(df_train['top_category_text'].unique()):,}")

Number of top categories in train: 15


In [None]:
df_train.iloc[0]

product_id                                                     1278122912
title                   Dragon and Snake Custom Nike AF1 Custom Chines...
description             Dragon and Snake Custom Nike AF1 Custom Chines...
tags                    handpainted sneaker,sneaker,unique gift,custom...
type                                                             physical
room                                                                     
craft_type                                                               
recipient                                                                
material                                                                 
occasion                                                         birthday
holiday                                                         christmas
art_subject                                                              
style                                                                    
shape                                 

In [None]:
df_train.head(5)

Unnamed: 0,product_id,title,description,tags,type,room,craft_type,recipient,material,occasion,...,bottom_category_text,top_category_id,top_category_text,primary_color_id,primary_color_text,secondary_color_id,secondary_color_text,image/encoded,image/width,image/height
0,1278122912,Dragon and Snake Custom Nike AF1 Custom Chines...,Dragon and Snake Custom Nike AF1 Custom Chines...,"handpainted sneaker,sneaker,unique gift,custom...",physical,,,,,birthday,...,shoes.gender_neutral_adult_shoes.sneakers_and_...,12,shoes,17,white,1,black,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,596
1,698470598,"Canvas Shoes Trainers, Kids to Adults, Hand C...","Fantastic Gold Edged Butterfly Fabric, yellow ...","Butterflies,Butterfly shoes,Butterfly presents...",physical,,,,,birthday,...,shoes.gender_neutral_adult_shoes.sneakers_and_...,12,shoes,18,yellow,13,rainbow,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,427
2,1437216455,Sneaker Keychain,Ideal gift for exclusive sneaker lovers\n\nThe...,,physical,,,,,anniversary,...,shoes.gender_neutral_adult_shoes.sneakers_and_...,12,shoes,17,white,14,red,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,380
3,1502633808,Adidas Superstar Custom Sneakers Rose Shoes Bl...,Adidas Superstar Custom Sneakers Rose\n\n· Pai...,"Superstar Custom,Air Force 1 Rose,Rose Custom ...",physical,,,,,birthday,...,shoes.gender_neutral_adult_shoes.sneakers_and_...,12,shoes,17,white,1,black,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570
4,1474730171,"Madras Shoes for Men, Preppy Madras Patchwork ...",Looking for a classic madras shoe for men? Her...,"madras shoes,men's madras shoes,lace-up canvas...",physical,,,,,,...,shoes.gender_neutral_adult_shoes.sneakers_and_...,12,shoes,2,blue,9,green,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570


## Sampling the dataset

In [None]:
df_train['bottom_category_id'].value_counts(normalize=True) * 100

bottom_category_id
2070     0.042678
2371     0.042678
1046     0.042678
12186    0.042678
46       0.042678
           ...   
413      0.019162
11220    0.019162
12356    0.018726
6796     0.018291
2101     0.018291
Name: proportion, Length: 2609, dtype: float64

Now since we have percentage distribution lets do sampling for sample data length calculation Using the link https://www.calculator.net/sample-size-calculator.html received the sample size as 15517

![image.png](attachment:b1b7f223-1e91-4d1c-9f13-a50e7f235e7c.png)

In [None]:
sample_size=15517

In [None]:
df_train['bottom_category_id'].value_counts().min()

42

In [None]:
samples_per_category = (df_train.groupby('bottom_category_id').size() / len(df_train) * sample_size).astype(int)
def stratified_sampling(group):
    return group.sample(samples_per_category[group.name])

sample_df2 = df_train.groupby('bottom_category_id', group_keys=False).apply(stratified_sampling)
sample_df2 = sample_df2.sample(frac=1).reset_index(drop=True)
sample_df2.reset_index(inplace=True)

  sample_df2 = df_train.groupby('bottom_category_id', group_keys=False).apply(stratified_sampling)


In [None]:
sample_df2['bottom_category_id'].value_counts().min()

2

Since after sampling we are getting only 2 value in some categories this wont be beneficial for us.So we are considering whole dataset


## Data Cleaning

### Replacing all blank values to nan in order to get na count

In [None]:
df_train = df_train.replace('', np.nan)

In [None]:
df_train.isna().sum()

product_id                   0
title                      810
description                810
tags                     31169
type                       348
room                    220686
craft_type              201862
recipient               218815
material                209384
occasion                171550
holiday                 183359
art_subject             226885
style                   213730
shape                   227137
pattern                 218042
bottom_category_id           0
bottom_category_text         0
top_category_id              0
top_category_text            0
primary_color_id             0
primary_color_text           0
secondary_color_id           0
secondary_color_text         0
image/encoded                0
image/width                  0
image/height                 0
dtype: int64

In [None]:
df_train.fillna('', inplace=True)

In [None]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

In [None]:
def special_characters_removal(sentence):
    return re.sub(r'[^A-Za-z0-9\s]', '', sentence)

In [None]:
english_words = set(words.words())

In [None]:
def filter_english_words(sentence):
    return ' '.join(word for word in sentence.split() if word.lower() in english_words)

In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
def remove_stop_words(sentence):
    return ' '.join(word for word in sentence.split() if word.lower() not in stop_words)

In [None]:
def convert_numbers_to_blank(sentence):
    return re.sub(r'\b\d+\b', '', sentence)

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
def lemmatize_sentence(sentence):
    tokenized_words = word_tokenize(sentence)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokenized_words]
    return ' '.join(lemmatized_words)

### Text cleaning

In [None]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

def text_cleaning(text):
    # \n removal
    text = text.replace(r'\n', ' ')

    # Remove email-address
    text = re.sub(r'\S+@\S+', '', text)

    # Remove URLS which start from http
    text = re.sub(r'http\S+|www\S+', '', text)

    #Remove Html tags from text
    text=remove_html_tags(text)

    #Remove special characters
    text=special_characters_removal(text)

    #Convert Numbers to blank
    text=convert_numbers_to_blank(text)

    # Removal Non AsCII Characters
    text = text.encode('ascii', 'ignore').decode('utf-8')

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Lowercasing the text
    text=text.lower()

    #Remove Non ENglish Wprds from sentence
    text=filter_english_words(text)

    #Remove stopwords from sentence
    text=remove_stop_words(text)

    return text

#calling the clean description function on the TRAIN dataset


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
df_train['description'] = df_train['description'].apply(text_cleaning)
df_train['title'] = df_train['title'].apply(text_cleaning)
df_train['tags'] = df_train['tags'].apply(text_cleaning)

  soup = BeautifulSoup(text, 'html.parser')


In [None]:
df_test['description'] = df_test['description'].apply(text_cleaning)
df_test['title'] = df_test['title'].apply(text_cleaning)
df_test['tags'] = df_test['tags'].apply(text_cleaning)

  soup = BeautifulSoup(text, 'html.parser')


Checking for type column

In [None]:
df_train[df_train['type']=='']

Unnamed: 0,product_id,title,description,tags,type,room,craft_type,recipient,material,occasion,...,bottom_category_text,top_category_id,top_category_text,primary_color_id,primary_color_text,secondary_color_id,secondary_color_text,image/encoded,image/width,image/height
1121,543929139,train beaded collar,g skein separable pictured size cotton yarn li...,,,,,,,,...,craft_supplies_and_tools.fabric_and_notions.no...,6,craft_supplies_and_tools,0,beige,0,beige,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,428
829,1310295352,peace end violence together application x,embroidery patch polyester robust polyester tw...,,,,,,,birthday,...,accessories,0,accessories,1,black,17,white,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570
852,1324330285,veteran patch military badge war patch rocker ...,embroidery patch polyester robust polyester tw...,,,,,,,birthday,...,accessories,0,accessories,1,black,17,white,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570
864,1310285992,love patch live iron patch motivation applicat...,embroidery patch polyester robust polyester tw...,,,,,,,birthday,...,accessories,0,accessories,1,black,17,white,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570
866,1310314594,patch team bride patch x,embroidery patch made polyester robust polyest...,,,,,,,birthday,...,accessories,0,accessories,1,black,17,white,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2575,553360575,princess mold sugar paste,large multitude food nonfood sugar paste almon...,,,,,,,,...,craft_supplies_and_tools.kitchen_supplies,6,craft_supplies_and_tools,2,blue,11,pink,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570
640,924793898,day tablecloth rosy pink hearts hearts love bl...,set table spoonflower tablecloth dress everyth...,spoonflower table dining tablecloth,,,,,cotton,,...,home_and_living.kitchen_and_dining.linens.tabl...,8,home_and_living,17,white,11,pink,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570
669,1043427245,rainbow tablecloth pop green garden botanical ...,set table spoonflower tablecloth dress everyth...,spoonflower table dining summer,,,,,cotton,,...,home_and_living.kitchen_and_dining.linens.tabl...,8,home_and_living,9,green,10,orange,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570
693,984318133,blush pink tablecloth rococo romantic baroque ...,set table spoonflower tablecloth dress everyth...,spoonflower table dining pink ornate,,,,,cotton,,...,home_and_living.kitchen_and_dining.linens.tabl...,8,home_and_living,11,pink,2,blue,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570


In [None]:
df_train['type'].iloc[6676]

''

In [None]:
df_train['type'].iloc[6675]

'physical'

In [None]:
df_train['type'].iloc[6677]

'physical'

In [None]:
df_train['type'].iloc[6674]

''

In [None]:
df_train['type'].iloc[6673]

'physical'

In [None]:
df_train['type'].iloc[6675]

'physical'

As we observed for missing values in type the adjecant rows values are same. So we are imputing values based on forward fill


### Imputing TYPE column for missing values.

In [None]:
df_train['type'] = df_train['type'].replace('', method='ffill')

  df_train['type'] = df_train['type'].replace('', method='ffill')


In [None]:
df_test['type'] = df_test['type'].replace('', method='ffill')

  df_test['type'] = df_test['type'].replace('', method='ffill')


## Model Training

### Predicting Bottom Category ID

### Multinomial NB Model for predicting bottom category id

In [None]:

import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, accuracy_score, f1_score

train_df, test_df = train_test_split(df_train, test_size=0.2, random_state=42)

X_train = train_df['title'] + ' ' + train_df['tags'] + ' ' + train_df['type'] + ' ' + train_df['top_category_text']
y_train = train_df['bottom_category_id']

X_test = test_df['title'] + ' ' + test_df['tags'] + ' ' + test_df['type'] + ' ' + test_df['top_category_text']
y_test = test_df['bottom_category_id']

X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

nb_pipeline_bottom = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])

nb_pipeline_bottom.fit(X_train, y_train)

joblib.dump(nb_pipeline_bottom, '/kaggle/working/nb_pipeline_bottom_model.pkl')

y_pred = nb_pipeline_bottom.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print('Accuracy:', accuracy)
print('F1-score:', f1)
print(classification_report(y_test, y_pred))

Accuracy: 0.48899292324442023
F1-score: 0.4620811819089589
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        18
           2       0.71      0.22      0.33        23
           3       0.61      1.00      0.76        11
           4       0.50      0.67      0.57        15
           5       0.45      1.00      0.62        14
           6       0.00      0.00      0.00        22
           7       0.67      0.74      0.70        19
           8       0.47      0.56      0.51        16
           9       1.00      0.17      0.30        23
          10       0.47      0.38      0.42        21
          11       0.81      0.81      0.81        16
          12       0.60      0.53      0.56        17
          13       0.50      0.76      0.60        17
          14       0.56      0.82      0.67        17
          15       0.38      0.62      0.48        16
          16       0.33      0.31      0.32        16
          17       0.3

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Stochastic Gradient Boost

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Split data into training and testing sets
train_df, test_df = train_test_split(df_train, test_size=0.3, random_state=42)

# Create feature matrix X and target variable y for training set
train_X = train_df['title'] + ' ' + train_df['tags'] + ' ' + train_df['type'] + ' ' + train_df['top_category_text']
train_y = train_df['bottom_category_id']

test_X = test_df['title'] + ' ' + test_df['tags'] + ' ' + test_df['type'] + ' ' + test_df['top_category_text']
test_y = test_df['bottom_category_id']


train_X = np.asarray(train_X)
test_X = np.asarray(test_X)

#Experimenting with learning rates
learning_rates = [1e-3, 1e-4, 1e-5]


results = {}

# Loop through each learning rate
for lr in learning_rates:
    print(f'for lr : {lr}')
    sgd = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=lr, random_state=42, max_iter=10, tol=None)),
                   ])
    sgd.fit(train_X, train_y)

    y_pred_sgd_top = sgd.predict(X_test)

    # Calculate accuracy and F1-score
    accuracy = accuracy_score(y_pred_sgd_top, test_y)
    f1 = f1_score(test_y, y_pred_sgd_top, average='weighted')

    # Store the results in the dictionary
    results[lr] = {'accuracy': accuracy, 'f1_score': f1}

# Print the results for each learning rate
for lr, scores in results.items():
    print('Learning Rate: {}'.format(lr))
    print('Accuracy: {}'.format(scores['accuracy']))
    print('F1-score: {}'.format(scores['f1_score']))
    print('----------------------------')

for lr : 0.001
for lr : 0.0001
for lr : 1e-05
Learning Rate: 0.001
Accuracy: 0.46838346301242595
F1-score: 0.4487024954767158
----------------------------
Learning Rate: 0.0001
Accuracy: 0.4675270003483916
F1-score: 0.45097097313734774
----------------------------
Learning Rate: 1e-05
Accuracy: 0.5000870978980374
F1-score: 0.48021125240604995
----------------------------


## Working on Test Data

Importing Multinomial NB Model

Importing test dataset we got from top category id predictions

In [None]:
test_df=pd.read_csv('/kaggle/input/top-category-predictions/top_category_predictions (1).csv')

In [None]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,product_id,title,tags,type,top_category_id,top_category_text
0,0,815216520,A First Doll Book. &quot; Peggy and Me&quot;!,"flip book,vintage,Peggy and Me,change dresses",physical,4,books_movies_and_music
1,1,870948450,"Abigayil: The Story of the Cat at the Manger, ...","one of a kind,authentic,original,gift,special,...",physical,4,books_movies_and_music
2,2,1111113690,"No Fighting, No Biting! by Else Holmelund Mina...","Else Minarik,Maurice Sendak,Vintage Children,L...",physical,4,books_movies_and_music
3,3,1510466600,Ukrainian book. &quot;Magic flashlight&quot;. ...,"Books,Ukrainian,New,Gift,Colorful,Best,Stories...",physical,4,books_movies_and_music
4,4,1035734450,The Ugly Butterfly,"moth metamorphosis,caterpillar bug,butterfly c...",physical,8,home_and_living


In [None]:
test_df.fillna('',inplace=True)

In [None]:
X_test = test_df['title'] + ' ' + test_df['tags'] + ' ' + test_df['type'] + ' ' + test_df['top_category_text']

X_test = np.asarray(X_test)

In [None]:
model = joblib.load("/kaggle/input/nb-pipeline-bottom-model/nb_pipeline_bottom_model.pkl")

In [None]:
y_pred=model.predict(X_test)

In [None]:
test_df['bottom_category_id']=y_pred

In [None]:
unique_combinations = df_train[['bottom_category_id', 'bottom_category_text']].drop_duplicates()

In [None]:
unique_combinations_dict = unique_combinations.set_index('bottom_category_id')['bottom_category_text'].to_dict()

In [None]:
test_df['bottom_category_text'] = test_df['bottom_category_id'].map(unique_combinations_dict)

In [None]:
test_df[['product_id','title','tags','type','top_category_id','top_category_text','bottom_category_id','bottom_category_text']].to_csv('/kaggle/working/bottom_category_predictions.csv')

# For Color Predictions

## Importing Libraries

In [None]:
!pip install rembg

Collecting rembg
  Downloading rembg-2.0.56-py3-none-any.whl.metadata (15 kB)
Collecting onnxruntime (from rembg)
  Downloading onnxruntime-1.17.3-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting pymatting (from rembg)
  Downloading PyMatting-1.1.12-py3-none-any.whl.metadata (7.4 kB)
Collecting coloredlogs (from onnxruntime->rembg)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime->rembg)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading rembg-2.0.56-py3-none-any.whl (32 kB)
Downloading onnxruntime-1.17.3-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading PyMatting-1.1.12-py3-none-any.whl (52 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53

In [None]:
import pandas as pd
import numpy as np
import re
import torch
import tensorflow as tf
import os
import io
from rembg import remove
from tqdm import tqdm
import ast

2024-04-18 21:18:26.022354: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-18 21:18:26.022498: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-18 21:18:26.176512: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### nltk libraries

In [None]:
!pip install nltk
import nltk
from nltk.corpus import words

# Download the 'words' corpus if not already downloaded
nltk.download('words')

from nltk.corpus import stopwords
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

  pid, fd = os.forkpty()


[nltk_data] Downloading package words to /usr/share/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
!pip install beautifulsoup4
from bs4 import BeautifulSoup

  pid, fd = os.forkpty()




### Libraries for statistics

In [None]:
from scipy.stats import chi2_contingency
from scipy.stats import ks_2samp

In [None]:
from sklearn.cluster import KMeans

### Image Libraries

In [None]:
!pip install ultralytics
from ultralytics import YOLO
from PIL import Image

Collecting ultralytics
  Downloading ultralytics-8.2.1-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting thop>=0.1.1 (from ultralytics)
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl.metadata (2.7 kB)
Downloading ultralytics-8.2.1-py3-none-any.whl (750 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m750.8/750.8 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Installing collected packages: thop, ultralytics
Successfully installed thop-0.1.1.post2209072238 ultralytics-8.2.1


### Libraries for model training

## Importing Dataset

In [None]:
PATH = f"/kaggle/input/etsy-dataset/data/2024"

In [None]:
parquet_filenames_train = os.listdir("/kaggle/input/etsy-dataset/data/2024/train")
parquet_filenames_test = os.listdir("/kaggle/input/etsy-dataset/data/2024/test")

In [None]:
def read_data(folder, filenames):
    """ Read parquet data from mutiple files """

    dataframes = [
        pd.read_parquet(f"{PATH}/{folder}/{filename}")
        for filename in filenames
    ]

    return pd.concat(dataframes)

In [None]:
%%time

df_train = read_data("train", parquet_filenames_train[:])
#df_test = read_data("test", parquet_filenames_test[:])

CPU times: user 21.1 s, sys: 53.9 s, total: 1min 15s
Wall time: 3min 59s


In [None]:
df_test = read_data("test", parquet_filenames_test[:])

# Sampling the Dataset For top Category ID

Since the dataset is huge it will impact the computation and resources. In order to train the dataset efficiently we will sample the dataset accordingly. Let check the percentage distribution or primary_color_id

In [None]:
df_train['primary_color_id'].value_counts()

primary_color_id
1     30673
17    28609
2     27649
4     19867
11    16029
14    15835
9     15578
0     12366
16    10758
7      9569
19     8985
18     7231
12     6750
10     5625
5      4546
13     4404
3      2471
6      1852
15      827
Name: count, dtype: int64

In [None]:
df_train['primary_color_id'].value_counts(normalize=True) * 100

primary_color_id
1     13.357924
17    12.459064
2     12.040989
4      8.651970
11     6.980542
14     6.896056
9      6.784134
0      5.385326
16     4.685050
7      4.167247
19     3.912919
18     3.149061
12     2.939588
10     2.449657
5      1.979758
13     1.917918
3      1.076107
6      0.806536
15     0.360154
Name: proportion, dtype: float64

Now since we have percentage distribution lets do sampling for sample data length calculation Using the link https://www.calculator.net/sample-size-calculator.html received the sample size as 15517

![image.png](attachment:65737022-850f-47b6-b6e9-f5e7d490df7a.png)

Using Stratified sampling

In [None]:
sample_size=15517

In [None]:
primary_color_percentage = df_train['primary_color_id'].value_counts(normalize=True)
secondary_color_percentage = df_train['secondary_color_id'].value_counts(normalize=True)

# Perform stratified sampling for primary_color_id
sampled_primary_color = df_train.groupby('primary_color_id').apply(lambda x: x.sample(int(sample_size * primary_color_percentage[x.name])))

# Perform stratified sampling for secondary_color_id
sampled_secondary_color = df_train.groupby('secondary_color_id').apply(lambda x: x.sample(int(sample_size * secondary_color_percentage[x.name])))

# Concatenate the sampled DataFrames
sampled_df = pd.concat([sampled_primary_color, sampled_secondary_color])

# Shuffle the DataFrame to ensure randomness
sampled_df = sampled_df.sample(frac=1).reset_index(drop=True)

  sampled_primary_color = df_train.groupby('primary_color_id').apply(lambda x: x.sample(int(sample_size * primary_color_percentage[x.name])))
  sampled_secondary_color = df_train.groupby('secondary_color_id').apply(lambda x: x.sample(int(sample_size * secondary_color_percentage[x.name])))


In [None]:
len(sampled_df)

31014

In [None]:
len(sampled_df[sampled_df.duplicated(subset=['product_id'], keep=False)])

2148

In [None]:
sampled_df = sampled_df.drop_duplicates(subset=['product_id'])

Now lets check statistically if sample df is equal to df_train using chi square test

In [None]:
original_counts = df_train['primary_color_id'].value_counts()
sample_counts = sampled_df['primary_color_id'].value_counts()

contingency_table = pd.concat([original_counts, sample_counts], axis=1, keys=['Original', 'Sample']).fillna(0)

chi2, p_value, _, _ = chi2_contingency(contingency_table)

print("Chi-square statistic:", chi2)
print("P-value:", p_value)

Chi-square statistic: 7.021259710426978
P-value: 0.9899458381791132


In [None]:
original_counts = df_train['secondary_color_id'].value_counts()
sample_counts = sampled_df['secondary_color_id'].value_counts()

contingency_table = pd.concat([original_counts, sample_counts], axis=1, keys=['Original', 'Sample']).fillna(0)

chi2, p_value, _, _ = chi2_contingency(contingency_table)

print("Chi-square statistic:", chi2)
print("P-value:", p_value)

Chi-square statistic: 5.751657483256019
P-value: 0.9971037064345525


Checking Kolmogorov-Smirnov Test as well

In [None]:
ecdf_original = df_train['primary_color_id'].value_counts(normalize=True).sort_index().cumsum()
ecdf_sample = sampled_df['primary_color_id'].value_counts(normalize=True).sort_index().cumsum()

# Perform Kolmogorov-Smirnov Test
ks_statistic, p_value = ks_2samp(ecdf_original, ecdf_sample)

print("Kolmogorov-Smirnov Test Statistic:", ks_statistic)
print("P-value:", p_value)

Kolmogorov-Smirnov Test Statistic: 0.05263157894736842
P-value: 1.0


In [None]:
ecdf_original = df_train['secondary_color_id'].value_counts(normalize=True).sort_index().cumsum()
ecdf_sample = sampled_df['secondary_color_id'].value_counts(normalize=True).sort_index().cumsum()

# Perform Kolmogorov-Smirnov Test
ks_statistic, p_value = ks_2samp(ecdf_original, ecdf_sample)

print("Kolmogorov-Smirnov Test Statistic:", ks_statistic)
print("P-value:", p_value)

Kolmogorov-Smirnov Test Statistic: 0.05263157894736842
P-value: 1.0


Now since the p-value is 1 we will accept the null hypothesis that is sample and population dataframes are the same

## Image Processing - Start

### YOLO using GPU

In [None]:
model = YOLO("yolov8s.pt")

Downloading https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8s.pt to 'yolov8s.pt'...


100%|██████████| 21.5M/21.5M [00:00<00:00, 164MB/s]


In [None]:
def calculate_area(coordinates):
    xmin, ymin, xmax, ymax = coordinates
    return (xmax - xmin) * (ymax - ymin)

In [None]:
def change_width_height(image):
    # Open the image

    new_width = 224
    new_height = 224


    # Resize the image
    resized_image = image.resize((new_width, new_height))

    return resized_image

In [None]:
#Optimized code
def get_bounding_box(img_bytes):
    bytes_io = io.BytesIO(img_bytes)
    image = Image.open(bytes_io)
    image = change_width_height(image)
    results = model.predict(image,verbose=False)
    result_1 = [box for box in results[0].boxes if box.conf[0].item() >= 0.5]

    if not result_1:
        return image

    if len(result_1) > 1:
        largest_bbox = max(result_1, key=lambda bbox: calculate_area(bbox.xyxy[0].tolist()))
    else:
        largest_bbox = result_1[0]

    bbox_coords = [round(coord) for coord in largest_bbox.xyxy[0].tolist()]
    if bbox_coords:
        image = image.crop(bbox_coords)

    #buffered = io.BytesIO()
    #image.save(buffered, format="JPEG")  # Change format if your image is in a different format
    #image_bytes = buffered.getvalue()


    return image

In [None]:
import warnings
def extract_color_with_bg(rgb_image_blue):
  w, h, d = tuple(rgb_image_blue.shape)

  pixel = np.reshape(rgb_image_blue, (w * h, d))

  %matplotlib inline

  n_colors = 2

  with warnings.catch_warnings():
    # Suppress all warnings
    warnings.filterwarnings("ignore")
    model = KMeans(n_clusters=n_colors, random_state=42).fit(pixel)
  # Create a KMeans model with the specified number of clusters and fit it to the pixels
  #model = KMeans(n_clusters=n_colors, random_state=42).fit(pixel)

  # Get the cluster centers (representing colors) from the model
  colour_palette_with_blue = np.uint8(model.cluster_centers_)
  return colour_palette_with_blue
  # # Display the color palette as an image
  # plt.imshow([colour_palette_with_blue])

  # # Show the plot
  # plt.show()

In [None]:
strategy=tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


2024-04-18 09:12:48.331934: E external/local_xla/xla/stream_executor/stream_executor_internal.h:177] SetPriority unimplemented for this stream.
2024-04-18 09:12:48.332049: E external/local_xla/xla/stream_executor/stream_executor_internal.h:177] SetPriority unimplemented for this stream.
2024-04-18 09:12:48.332121: E external/local_xla/xla/stream_executor/stream_executor_internal.h:177] SetPriority unimplemented for this stream.
2024-04-18 09:12:48.332190: E external/local_xla/xla/stream_executor/stream_executor_internal.h:177] SetPriority unimplemented for this stream.
2024-04-18 09:12:48.332276: E external/local_xla/xla/stream_executor/stream_executor_internal.h:177] SetPriority unimplemented for this stream.
2024-04-18 09:12:48.332486: E external/local_xla/xla/stream_executor/stream_executor_internal.h:177] SetPriority unimplemented for this stream.
2024-04-18 09:12:48.332579: E external/local_xla/xla/stream_executor/stream_executor_internal.h:177] SetPriority unimplemented for this 

In [None]:
 def process_image(img_bytes):
    #img_bytes = df_train_sample["image/encoded"][index]
    #bytes_io = io.BytesIO(img_bytes)
    #image1 = Image.open(bytes_io)
    #image=image1
    #with strategy.scope():
    image = get_bounding_box(img_bytes)  # Newly added
    #image = remove_extra_background(image)  # Newly added
    #image_red = remove(image, bgcolor=background_to_add_red)
    #image_blue = remove(image, bgcolor=background_to_add_blue)

    numpy_image_red = np.array(image)

    image_with_blue_bg = extract_color_with_bg(numpy_image_red)

    #similar_colors = find_similar_colors(image_with_blue_bg, image_with_red_bg)
    # similar_colors = calculate_euclidean_distance(image_with_blue_bg, image_with_red_bg)
    #rgb_colors_for_image_final = distance_between_color(similar_colors)
    rgb_colors_for_image_final=image_with_blue_bg
    return str(rgb_colors_for_image_final[0].tolist()),str(rgb_colors_for_image_final[1].tolist())

In [None]:
device = "0" if torch.cuda.is_available() else "cpu"
if device == "0":
    print(device)
    torch.cuda.set_device(0)

In [None]:
j=1
df_train_sample=sampled_df[20000*j:20000*(j+1)]

In [None]:
batch_size = 5000

In [None]:
output_directory = "/kaggle/working/batch_csv_files"
os.makedirs(output_directory, exist_ok=True)

In [None]:
for i in range(0, len(df_train_sample), batch_size):
    # Select the current batch
    batch = df_train_sample.iloc[i:i+batch_size]
    print(i)
    print(i+batch_size)

    tqdm.pandas(desc="Resizing Images", position=0)
    batch['primary_color_rgb'], batch['secondary_color_rgb'] = zip(*batch['image/encoded'].progress_apply(process_image))
    #tqdm_notebook.pandas(desc="Resizing Images", position=0)
    #batch['primary_color_rgb'], batch['secondary_color_rgb'] = zip(*batch['image/encoded'].progress_apply(process_image))
    # Generate a unique filename for the current batch
    filename = os.path.join(output_directory, f"batch_{j}_{i//batch_size}.csv")

    # Export the current batch to a CSV file
    batch[['product_id','primary_color_rgb','secondary_color_rgb','primary_color_id','primary_color_text','secondary_color_id','secondary_color_text']].to_csv(filename)
    #batch.to_csv(filename, index=False)

    print(f"Batch {i//batch_size} exported to {filename}")

0
5000


Resizing Images: 100%|██████████| 5000/5000 [19:34<00:00,  4.26it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['primary_color_rgb'], batch['secondary_color_rgb'] = zip(*batch['image/encoded'].progress_apply(process_image))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['primary_color_rgb'], batch['secondary_color_rgb'] = zip(*batch['image/encoded'].progress_apply(process_image))


Batch 0 exported to /kaggle/working/batch_csv_files/batch_1_0.csv
5000
10000


Resizing Images: 100%|██████████| 4940/4940 [19:18<00:00,  4.26it/s]

Batch 1 exported to /kaggle/working/batch_csv_files/batch_1_1.csv



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['primary_color_rgb'], batch['secondary_color_rgb'] = zip(*batch['image/encoded'].progress_apply(process_image))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['primary_color_rgb'], batch['secondary_color_rgb'] = zip(*batch['image/encoded'].progress_apply(process_image))


## Importing Image Processed data

In [None]:
PATH = f"/kaggle/input/sampled-processed-data/Sampled_Processed_Data"

In [None]:
processed_images_train = os.listdir("/kaggle/input/sampled-processed-data/Sampled_Processed_Data/train")
processed_images_test = os.listdir("/kaggle/input/sampled-processed-data/Sampled_Processed_Data/test")

In [None]:
def read_processed_data(folder):
    dataframe_processed =pd.DataFrame()
    dataframes_list = []
    if(folder=='test'):
        endindex_p=2
    else:
        endindex_p=2
    """ Read processed batch data from mutiple files """
    for j in range(0,endindex_p):
        if(j==1):
            endindex=2
        else:
            endindex=4
        for i in range(0,endindex):
            try:
                #print(f'{PATH}/{folder}/batch_{j}_{i}.csv')
                dataframes = pd.read_pickle(f'{PATH}/{folder}/batch_{j}_{i}.csv')
            except:
                dataframes = pd.read_csv(f'{PATH}/{folder}/batch_{j}_{i}.csv')
            finally:
               dataframes_list.append(dataframes)

    dataframe_processed = pd.concat(dataframes_list)
    return dataframe_processed

In [None]:
df_train_processed=read_processed_data('train')

In [None]:
df_test_processed=read_processed_data('test')

In [None]:
len(df_test_processed)

25514

In [None]:
df_train_processed.head()

Unnamed: 0.1,Unnamed: 0,product_id,primary_color_rgb,secondary_color_rgb,primary_color_id,primary_color_text,secondary_color_id,secondary_color_text
0,0,1171086357,"[157, 146, 142]","[109, 98, 95]",4,brown,9,green
1,1,98905015,"[38, 38, 39]","[198, 190, 189]",11,pink,11,pink
2,2,1524183478,"[207, 202, 193]","[138, 98, 45]",4,brown,17,white
3,3,1086729662,"[143, 97, 70]","[219, 199, 180]",4,brown,6,copper
4,4,1645635058,"[191, 185, 170]","[87, 69, 54]",17,white,2,blue


In [None]:
len(df_train_processed[df_train_processed.duplicated(keep=False)])

0

### Convert RGB Color Codes to single value

In [None]:
def calculate_rgb_value(string_list):
    array = ast.literal_eval(string_list)
    return 256 * 256 * array[0] + 256 * array[1] + array[2]

In [None]:
def calculate_average_rgb_value(string_list):
    array = ast.literal_eval(string_list)
    averaged_values = np.mean(array)
    return averaged_values

In [None]:
def calculate_weighted_average_rgb_value(string_list):
    array = ast.literal_eval(string_list)
    weighted_average = np.dot(array, [0.2989, 0.5870, 0.1140])
    return weighted_average

In [None]:
df_train_processed['primary_color_rgb_value'] = df_train_processed['primary_color_rgb'].apply(calculate_rgb_value)
df_train_processed['secondary_color_rgb_value'] = df_train_processed['secondary_color_rgb'].apply(calculate_rgb_value)

df_train_processed['primary_color_rgb_value_average'] = df_train_processed['primary_color_rgb'].apply(calculate_average_rgb_value)
df_train_processed['secondary_color_rgb_value_average'] = df_train_processed['secondary_color_rgb'].apply(calculate_average_rgb_value)

df_train_processed['primary_color_rgb_value_weighted'] = df_train_processed['primary_color_rgb'].apply(calculate_weighted_average_rgb_value)
df_train_processed['secondary_color_rgb_value_weighted'] = df_train_processed['secondary_color_rgb'].apply(calculate_weighted_average_rgb_value)

In [None]:
df_test_processed['primary_color_rgb_value'] = df_test_processed['primary_color_rgb'].apply(calculate_rgb_value)
df_test_processed['secondary_color_rgb_value'] = df_test_processed['secondary_color_rgb'].apply(calculate_rgb_value)

df_test_processed['primary_color_rgb_value_average'] = df_test_processed['primary_color_rgb'].apply(calculate_average_rgb_value)
df_test_processed['secondary_color_rgb_value_average'] = df_test_processed['secondary_color_rgb'].apply(calculate_average_rgb_value)

df_test_processed['primary_color_rgb_value_weighted'] = df_test_processed['primary_color_rgb'].apply(calculate_weighted_average_rgb_value)
df_test_processed['secondary_color_rgb_value_weighted'] = df_test_processed['secondary_color_rgb'].apply(calculate_weighted_average_rgb_value)

In [None]:
len(df_test)

25514

In [None]:
df_train_processed.head()

Unnamed: 0.1,Unnamed: 0,product_id,primary_color_rgb,secondary_color_rgb,primary_color_id,primary_color_text,secondary_color_id,secondary_color_text,primary_color_rgb_value,secondary_color_rgb_value,primary_color_rgb_value_average,secondary_color_rgb_value_average,primary_color_rgb_value_weighted,secondary_color_rgb_value_weighted
0,0,1171086357,"[157, 146, 142]","[109, 98, 95]",4,brown,9,green,10326670,7168607,148.333333,100.666667,148.8173,100.9361
1,1,98905015,"[38, 38, 39]","[198, 190, 189]",11,pink,11,pink,2500135,13024957,38.333333,192.333333,38.1102,192.2582
2,2,1524183478,"[207, 202, 193]","[138, 98, 45]",4,brown,17,white,13617857,9069101,200.666667,93.666667,202.4483,103.9042
3,3,1086729662,"[143, 97, 70]","[219, 199, 180]",4,brown,6,copper,9396550,14403508,103.333333,199.333333,107.6617,202.7921
4,4,1645635058,"[191, 185, 170]","[87, 69, 54]",17,white,2,blue,12564906,5719350,182.0,70.0,185.0649,72.6633


### Get feature for colors in text columns like title description and tags

In [None]:
merged_df = pd.merge(df_train_processed, df_train, on='product_id', how='left')

In [None]:
merged_df.head().T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,0,1,2,3,4
product_id,1171086357,98905015,1524183478,1086729662,1645635058
primary_color_rgb,"[157, 146, 142]","[38, 38, 39]","[207, 202, 193]","[143, 97, 70]","[191, 185, 170]"
secondary_color_rgb,"[109, 98, 95]","[198, 190, 189]","[138, 98, 45]","[219, 199, 180]","[87, 69, 54]"
primary_color_id_x,4,11,4,4,17
primary_color_text_x,brown,pink,brown,brown,white
secondary_color_id_x,9,11,17,6,2
secondary_color_text_x,green,pink,white,copper,blue
primary_color_rgb_value,10326670,2500135,13617857,9396550,12564906
secondary_color_rgb_value,7168607,13024957,9069101,14403508,5719350


In [None]:
unique_colors=merged_df['primary_color_text_x'].unique().tolist()
unique_colors = [word for word in unique_colors if word != 'other']

In [None]:
unique_colors

['brown',
 'pink',
 'white',
 'silver',
 'bronze',
 'green',
 'beige',
 'blue',
 'black',
 'gold',
 'yellow',
 'red',
 'clear',
 'purple',
 'copper',
 'orange',
 'rainbow',
 'rose gold']

In [None]:
def find_matching_words(text):
    word_array=unique_colors
    matching_words = []
    # Split the text into words
    words = text.split()
    # Iterate through each word in the text
    for word in words:
        # Check if the word is in the word array
        if word in word_array:
            matching_words.append(word)
    return matching_words

In [None]:
merged_df['text']=merged_df['title']+' '+ merged_df['description'] +merged_df['tags']
merged_df['colors']=merged_df['text'].apply(find_matching_words)

In [None]:
df_test_bottom=pd.read_csv('/kaggle/input/bottom-category-prediction/bottom_category_predictions.csv')

In [None]:
merged_df_test = pd.merge(df_test_bottom, df_test[['product_id', 'description','image/encoded']], on='product_id', how='left')

In [None]:
merged_df_test.head()

Unnamed: 0.1,Unnamed: 0,product_id,title,tags,type,top_category_id,top_category_text,bottom_category_id,bottom_category_text,description,image/encoded
0,0,815216520,A First Doll Book. &quot; Peggy and Me&quot;!,"flip book,vintage,Peggy and Me,change dresses",physical,4,books_movies_and_music,2010,books_movies_and_music.books.literature_and_fi...,"Vintage book! 1968 A First Doll Book, &quot; ...",b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
1,1,870948450,"Abigayil: The Story of the Cat at the Manger, ...","one of a kind,authentic,original,gift,special,...",physical,4,books_movies_and_music,345,books_movies_and_music.books.history_books,Rouben Mamoulian was an American film and thea...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
2,2,1111113690,"No Fighting, No Biting! by Else Holmelund Mina...","Else Minarik,Maurice Sendak,Vintage Children,L...",physical,4,books_movies_and_music,329,books_movies_and_music.books.book_accessories....,"A fun little vintage book “No Fighting, No Bit...",b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
3,3,1510466600,Ukrainian book. &quot;Magic flashlight&quot;. ...,"Books,Ukrainian,New,Gift,Colorful,Best,Stories...",physical,4,books_movies_and_music,11368,books_movies_and_music.books.childrens_books.b...,Ukrainian book.\n&quot;Magic flashlight&quot;....,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
4,4,1035734450,The Ugly Butterfly,"moth metamorphosis,caterpillar bug,butterfly c...",physical,8,home_and_living,352,books_movies_and_music.books.science_and_math_...,PreOrder this imaginative story about the life...,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...


In [None]:
merged_df_test.fillna('',inplace=True)

In [None]:
df_test_processed.head()

Unnamed: 0.1,primary_color_rgb,secondary_color_rgb,product_id,Unnamed: 0,primary_color_rgb_value,secondary_color_rgb_value,primary_color_rgb_value_average,secondary_color_rgb_value_average,primary_color_rgb_value_weighted,secondary_color_rgb_value_weighted
0,"[70, 37, 36]","[171, 146, 91]",815216520,,4597028,11244123,47.666667,136.0,46.746,147.1879
1,"[182, 177, 153]","[98, 80, 62]",870948450,,11973017,6443070,170.666667,80.0,175.7408,83.3202
2,"[254, 254, 254]","[146, 142, 133]",1111113690,,16711422,9604741,254.0,140.333333,253.9746,142.1554
3,"[204, 209, 208]","[81, 127, 117]",1510466600,,13423056,5341045,207.0,108.333333,207.3706,112.0979
4,"[120, 129, 98]","[78, 72, 42]",1035734450,,7897442,5130282,115.666667,64.0,122.763,70.3662


In [None]:
merged_df_test['text']=merged_df_test['title']+' '+ merged_df_test['description'] +merged_df_test['tags']
merged_df_test['colors']=merged_df_test['text'].apply(find_matching_words)

In [None]:
merged_df_test=pd.merge(merged_df_test,df_test_processed[['product_id','primary_color_rgb_value', 'secondary_color_rgb_value','primary_color_rgb_value_average','secondary_color_rgb_value_average','primary_color_rgb_value_weighted','secondary_color_rgb_value_weighted']],on='product_id',how='left')

In [None]:
merged_df_test.head().T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,0,1,2,3,4
product_id,815216520,870948450,1111113690,1510466600,1035734450
title,A First Doll Book. &quot; Peggy and Me&quot;!,"Abigayil: The Story of the Cat at the Manger, ...","No Fighting, No Biting! by Else Holmelund Mina...",Ukrainian book. &quot;Magic flashlight&quot;. ...,The Ugly Butterfly
tags,"flip book,vintage,Peggy and Me,change dresses","one of a kind,authentic,original,gift,special,...","Else Minarik,Maurice Sendak,Vintage Children,L...","Books,Ukrainian,New,Gift,Colorful,Best,Stories...","moth metamorphosis,caterpillar bug,butterfly c..."
type,physical,physical,physical,physical,physical
top_category_id,4,4,4,4,8
top_category_text,books_movies_and_music,books_movies_and_music,books_movies_and_music,books_movies_and_music,home_and_living
bottom_category_id,2010,345,329,11368,352
bottom_category_text,books_movies_and_music.books.literature_and_fi...,books_movies_and_music.books.history_books,books_movies_and_music.books.book_accessories....,books_movies_and_music.books.childrens_books.b...,books_movies_and_music.books.science_and_math_...
description,"Vintage book! 1968 A First Doll Book, &quot; ...",Rouben Mamoulian was an American film and thea...,"A fun little vintage book “No Fighting, No Bit...",Ukrainian book.\n&quot;Magic flashlight&quot;....,PreOrder this imaginative story about the life...


In [None]:
merged_df[['colors','primary_color_text_x','secondary_color_text_x']].head(10)

Unnamed: 0,colors,primary_color_text_x,secondary_color_text_x
0,[],brown,green
1,[],pink,pink
2,[],brown,white
3,[],brown,copper
4,"[green, green, white]",white,blue
5,[],white,other
6,[],brown,black
7,[],brown,black
8,[],other,black
9,[],silver,black


In [None]:
colors_df = merged_df[merged_df['colors'].apply(len) > 0]

In [None]:
colors_df[['colors','primary_color_text_x','secondary_color_text_x']].head()

Unnamed: 0,colors,primary_color_text_x,secondary_color_text_x
4,"[green, green, white]",white,blue
12,[brown],beige,brown
14,"[white, pink, white, pink]",white,pink
17,"[blue, pink, blue, yellow]",silver,silver
18,"[gold, black]",black,white


In [None]:
columns_to_drop = ['primary_color_id_x', 'primary_color_text_x', 'secondary_color_text_x','secondary_color_id_x']  # Replace with your column names
merged_df.drop(columns=columns_to_drop, inplace=True)

In [None]:
merged_df.rename(columns={'primary_color_id_y': 'primary_color_id'}, inplace=True)
merged_df.rename(columns={'primary_color_text_y': 'primary_color_text'}, inplace=True)
merged_df.rename(columns={'secondary_color_text_y': 'secondary_color_text'}, inplace=True)
merged_df.rename(columns={'secondary_color_id_y': 'secondary_color_id'}, inplace=True)

In [None]:
merged_df['color_text'] = merged_df['colors'].apply(lambda x: ' '.join(x))

In [None]:
merged_df_test['color_text'] = merged_df_test['colors'].apply(lambda x: ' '.join(x))

In [None]:
merged_df[merged_df['color_text']!='']

Unnamed: 0.1,Unnamed: 0,product_id,primary_color_rgb,secondary_color_rgb,primary_color_rgb_value,secondary_color_rgb_value,primary_color_rgb_value_average,secondary_color_rgb_value_average,primary_color_rgb_value_weighted,secondary_color_rgb_value_weighted,...,primary_color_id,primary_color_text,secondary_color_id,secondary_color_text,image/encoded,image/width,image/height,text,colors,color_text
4,4,1645635058,"[191, 185, 170]","[87, 69, 54]",12564906,5719350,182.000000,70.000000,185.0649,72.6633,...,17,white,2,blue,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,420,"Wheel of Time, Egwene, Mousepad, WOT mousepad,...","[green, green, white]",green green white
12,12,1062720275,"[159, 31, 40]","[226, 216, 204]",10428200,14866636,76.666667,215.333333,70.2821,217.5994,...,0,beige,4,brown,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,440,412,Butterfly Laser Cut Unfinished Wood Shape DIY ...,[brown],brown
14,14,154761635,"[169, 154, 153]","[77, 61, 47]",11115161,5061935,158.666667,61.666667,158.3541,64.1803,...,17,white,11,pink,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,516,Breast Cancer Awareness White Ladies Sun Visor...,"[white, pink, white, pink]",white pink white pink
17,17,1425149885,"[180, 179, 177]","[121, 117, 108]",11842481,7959916,178.666667,115.333333,179.0530,117.1579,...,16,silver,16,silver,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,578,"Dritz Baby Safe Diaper Pins, 4 piece package, ...","[blue, pink, blue, yellow]",blue pink blue yellow
18,18,1493774255,"[244, 244, 243]","[47, 46, 46]",16053491,3092014,243.666667,46.333333,243.8616,46.2943,...,1,black,17,white,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570,Women Punk Shoes with Party Skull - Canvas sne...,"[gold, black]",gold black
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29934,31007,1096637141,"[20, 28, 48]","[105, 130, 155]",1317936,6914715,32.000000,130.000000,27.8860,125.3645,...,2,blue,2,blue,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,561,Navy and Light Blue Double Layer Bow with Diam...,"[blue, blue, blue]",blue blue blue
29935,31008,1438733376,"[253, 253, 253]","[108, 108, 106]",16645629,7105642,253.000000,107.333333,252.9747,107.7612,...,19,other,1,black,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570,"Shelf Bracket - 6 Pack, Shelving Brackets, Flo...","[black, black]",black black
29936,31009,731874729,"[99, 104, 147]","[211, 209, 221]",6514835,13881821,116.666667,213.666667,107.3971,210.9449,...,2,blue,19,other,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,570,Third & Long - Navy Blue and Gray Glitter Nail...,"[clear, blue]",clear blue
29938,31012,788890857,"[220, 225, 217]","[74, 91, 59]",14475737,4873019,220.666667,74.666667,222.5710,82.2616,...,18,yellow,12,purple,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...,570,428,"I wet my PLANTS, House Plant Illustration, Pla...",[white],white


## Model Training

### Using AdaBoost as base Model

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

# Example DataFrame df with discrete features X and target variable y
# Assume X contains discrete features and y contains discrete values
X = merged_df[['primary_color_rgb_value_weighted']]  # Replace 'feature1', 'feature2', 'feature3' with your actual feature names
y = merged_df['primary_color_id']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the AdaBoost model with DecisionTreeClassifier as base estimator
adaboost_model = AdaBoostClassifier(n_estimators=19, random_state=42,learning_rate=0.8)
adaboost_model.fit(X_train, y_train)

# Make predictions
y_pred = adaboost_model.predict(X_test)

# Calculate accuracy
#accuracy = accuracy_score(y_test, y_pred)
#print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       283
           1       0.22      0.35      0.27       864
           2       0.13      0.41      0.20       740
           3       0.00      0.00      0.00        65
           4       0.00      0.00      0.00       518
           5       0.00      0.00      0.00       112
           6       0.00      0.00      0.00        51
           7       0.00      0.00      0.00       242
           9       0.00      0.00      0.00       411
          10       0.00      0.00      0.00       134
          11       0.00      0.00      0.00       394
          12       0.00      0.00      0.00       187
          13       0.00      0.00      0.00       109
          14       0.00      0.00      0.00       393
          15       0.00      0.00      0.00        24
          16       0.00      0.00      0.00       299
          17       0.16      0.47      0.24       771
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Training based on colors in text columns and rgb color codes computed - start

For Primary Color ID

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
import numpy as np

# Assuming df is your DataFrame with 'rgbvalue', 'text', and 'primary_color_id' columns

# Split the data into training and testing sets for text classifier
X_text = merged_df['color_text']
y_text = merged_df['primary_color_id']
X_text_train, X_text_test, y_text_train, y_text_test = train_test_split(X_text, y_text, test_size=0.2, random_state=42)

# Train the text classifier
text_vectorizer = TfidfVectorizer()
X_text_train_vec = text_vectorizer.fit_transform(X_text_train)
text_classifier_primary = MultinomialNB()
text_classifier_primary.fit(X_text_train_vec, y_text_train)

# Split the data into training and testing sets for RGB classifier
X_rgb = merged_df[['primary_color_rgb_value_weighted']]
y_rgb = merged_df['primary_color_id']
X_rgb_train, X_rgb_test, y_rgb_train, y_rgb_test = train_test_split(X_rgb, y_rgb, test_size=0.2, random_state=42)

# Train the RGB classifier
rgb_classifier_primary = AdaBoostClassifier(n_estimators=19, random_state=42, learning_rate=0.8)
rgb_classifier_primary.fit(X_rgb_train, y_rgb_train)

# Make predictions using both classifiers
X_text_test_vec = text_vectorizer.transform(X_text_test)
text_predictions = text_classifier_primary.predict(X_text_test_vec)
text_confidences = np.max(text_classifier_primary.predict_proba(X_text_test_vec), axis=1)

rgb_predictions = rgb_classifier_primary.predict(X_rgb_test)
rgb_confidences = np.max(rgb_classifier_primary.predict_proba(X_rgb_test), axis=1)

# Apply prediction with higher confidence for each row
final_predictions = []
for i in range(len(X_text_test)):
    if text_confidences[i] >= rgb_confidences[i]:
        final_predictions.append(text_predictions[i])
    else:
        final_predictions.append(rgb_predictions[i])

# Calculate accuracy

# Print classification report
print("Classification Report:")
print(classification_report(y_text_test, final_predictions))
#accuracy = accuracy_score(y_text_test, final_predictions)
#print("Accuracy:", accuracy)


Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.08      0.13       283
           1       0.18      0.83      0.29       864
           2       0.59      0.26      0.36       740
           3       0.47      0.22      0.29        65
           4       0.44      0.10      0.17       518
           5       0.13      0.14      0.13       112
           6       0.48      0.43      0.45        51
           7       0.31      0.19      0.23       242
           9       0.50      0.19      0.28       411
          10       0.26      0.07      0.12       134
          11       0.51      0.22      0.31       394
          12       0.60      0.17      0.26       187
          13       0.31      0.13      0.18       109
          14       0.47      0.18      0.26       393
          15       0.00      0.00      0.00        24
          16       0.46      0.22      0.30       299
          17       0.32      0.14      0.20       771
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Testing our test data

In [None]:
X_text_test = merged_df_test['color_text']
X_rgb_test = merged_df_test[['primary_color_rgb_value_weighted']]

In [None]:
X_rgb_test=X_rgb_test.fillna(100)

In [None]:
len(X_text_test)

25514

In [None]:
X_text_test_vec = text_vectorizer.transform(X_text_test)
text_predictions = text_classifier_primary.predict(X_text_test_vec)
text_confidences = np.max(text_classifier_primary.predict_proba(X_text_test_vec), axis=1)

rgb_predictions = rgb_classifier_primary.predict(X_rgb_test)
rgb_confidences = np.max(rgb_classifier_primary.predict_proba(X_rgb_test), axis=1)

# Apply prediction with higher confidence for each row
final_predictions = []
for i in range(len(X_text_test)):
    if text_confidences[i] >= rgb_confidences[i]:
        final_predictions.append(text_predictions[i])
    else:
        final_predictions.append(rgb_predictions[i])

In [None]:
len(final_predictions)

25514

In [None]:
merged_df_test['primary_color_id']=final_predictions

In [None]:
merged_df_test.head()

Unnamed: 0.1,Unnamed: 0,product_id,title,tags,type,top_category_id,top_category_text,bottom_category_id,bottom_category_text,description,...,text,colors,primary_color_rgb_value,secondary_color_rgb_value,primary_color_rgb_value_average,secondary_color_rgb_value_average,primary_color_rgb_value_weighted,secondary_color_rgb_value_weighted,color_text,primary_color_id
0,0,815216520,A First Doll Book. &quot; Peggy and Me&quot;!,"flip book,vintage,Peggy and Me,change dresses",physical,4,books_movies_and_music,2010,books_movies_and_music.books.literature_and_fi...,"Vintage book! 1968 A First Doll Book, &quot; ...",...,A First Doll Book. &quot; Peggy and Me&quot;! ...,[],4597028,11244123,47.666667,136.0,46.746,147.1879,,1
1,1,870948450,"Abigayil: The Story of the Cat at the Manger, ...","one of a kind,authentic,original,gift,special,...",physical,4,books_movies_and_music,345,books_movies_and_music.books.history_books,Rouben Mamoulian was an American film and thea...,...,"Abigayil: The Story of the Cat at the Manger, ...",[],11973017,6443070,170.666667,80.0,175.7408,83.3202,,1
2,2,1111113690,"No Fighting, No Biting! by Else Holmelund Mina...","Else Minarik,Maurice Sendak,Vintage Children,L...",physical,4,books_movies_and_music,329,books_movies_and_music.books.book_accessories....,"A fun little vintage book “No Fighting, No Bit...",...,"No Fighting, No Biting! by Else Holmelund Mina...",[],16711422,9604741,254.0,140.333333,253.9746,142.1554,,1
3,3,1510466600,Ukrainian book. &quot;Magic flashlight&quot;. ...,"Books,Ukrainian,New,Gift,Colorful,Best,Stories...",physical,4,books_movies_and_music,11368,books_movies_and_music.books.childrens_books.b...,Ukrainian book.\n&quot;Magic flashlight&quot;....,...,Ukrainian book. &quot;Magic flashlight&quot;. ...,[],13423056,5341045,207.0,108.333333,207.3706,112.0979,,1
4,4,1035734450,The Ugly Butterfly,"moth metamorphosis,caterpillar bug,butterfly c...",physical,8,home_and_living,352,books_movies_and_music.books.science_and_math_...,PreOrder this imaginative story about the life...,...,The Ugly Butterfly PreOrder this imaginative s...,[],7897442,5130282,115.666667,64.0,122.763,70.3662,,1


For secondary color ID

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
import numpy as np

# Assuming df is your DataFrame with 'rgbvalue', 'text', and 'primary_color_id' columns

# Split the data into training and testing sets for text classifier
X_text = merged_df['color_text']
y_text = merged_df['secondary_color_id']
X_text_train, X_text_test, y_text_train, y_text_test = train_test_split(X_text, y_text, test_size=0.2, random_state=42)

# Train the text classifier
text_vectorizer = TfidfVectorizer()
X_text_train_vec = text_vectorizer.fit_transform(X_text_train)
text_classifier_secondary = MultinomialNB()
text_classifier_secondary.fit(X_text_train_vec, y_text_train)

# Split the data into training and testing sets for RGB classifier
X_rgb = merged_df[['secondary_color_rgb_value_weighted']]
y_rgb = merged_df['secondary_color_id']
X_rgb_train, X_rgb_test, y_rgb_train, y_rgb_test = train_test_split(X_rgb, y_rgb, test_size=0.2, random_state=42)

# Train the RGB classifier
rgb_classifier_secondary = AdaBoostClassifier(n_estimators=19, random_state=42, learning_rate=0.8)
rgb_classifier_secondary.fit(X_rgb_train, y_rgb_train)

# Make predictions using both classifiers
X_text_test_vec = text_vectorizer.transform(X_text_test)
text_predictions = text_classifier_secondary.predict(X_text_test_vec)
text_confidences = np.max(text_classifier_secondary.predict_proba(X_text_test_vec), axis=1)

rgb_predictions = rgb_classifier_secondary.predict(X_rgb_test)
rgb_confidences = np.max(rgb_classifier_secondary.predict_proba(X_rgb_test), axis=1)

# Apply prediction with higher confidence for each row
final_predictions = []
for i in range(len(X_text_test)):
    if text_confidences[i] >= rgb_confidences[i]:
        final_predictions.append(text_predictions[i])
    else:
        final_predictions.append(rgb_predictions[i])

# Print classification report
print("Classification Report:")
print(classification_report(y_text_test, final_predictions))


Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.06      0.11       281
           1       0.30      0.19      0.24       795
           2       0.33      0.15      0.21       438
           3       0.29      0.11      0.16        73
           4       0.31      0.08      0.13       413
           5       0.00      0.00      0.00       103
           6       0.36      0.36      0.36        42
           7       0.45      0.21      0.28       399
           9       0.41      0.14      0.21       420
          10       0.40      0.11      0.17       160
          11       0.36      0.14      0.21       326
          12       0.38      0.09      0.14       157
          13       0.21      0.06      0.10       161
          14       0.25      0.08      0.12       379
          15       0.00      0.00      0.00        21
          16       0.41      0.19      0.26       366
          17       0.18      0.74      0.29       991
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
X_text_test = merged_df_test['color_text']
X_rgb_test = merged_df_test[['secondary_color_rgb_value_weighted']]

In [None]:
X_rgb_test=X_rgb_test.fillna(100)

In [None]:
len(X_text_test)

25514

In [None]:
X_text_test_vec = text_vectorizer.transform(X_text_test)
text_predictions = text_classifier_secondary.predict(X_text_test_vec)
text_confidences = np.max(text_classifier_secondary.predict_proba(X_text_test_vec), axis=1)

rgb_predictions = rgb_classifier_secondary.predict(X_rgb_test)
rgb_confidences = np.max(rgb_classifier_secondary.predict_proba(X_rgb_test), axis=1)

# Apply prediction with higher confidence for each row
final_predictions = []
for i in range(len(X_text_test)):
    if text_confidences[i] >= rgb_confidences[i]:
        final_predictions.append(text_predictions[i])
    else:
        final_predictions.append(rgb_predictions[i])

In [None]:
merged_df_test['secondary_color_id']=final_predictions

In [None]:
merged_df_test.head()

Unnamed: 0.1,Unnamed: 0,product_id,title,tags,type,top_category_id,top_category_text,bottom_category_id,bottom_category_text,description,...,colors,primary_color_rgb_value,secondary_color_rgb_value,primary_color_rgb_value_average,secondary_color_rgb_value_average,primary_color_rgb_value_weighted,secondary_color_rgb_value_weighted,color_text,primary_color_id,secondary_color_id
0,0,815216520,A First Doll Book. &quot; Peggy and Me&quot;!,"flip book,vintage,Peggy and Me,change dresses",physical,4,books_movies_and_music,2010,books_movies_and_music.books.literature_and_fi...,"Vintage book! 1968 A First Doll Book, &quot; ...",...,[],4597028,11244123,47.666667,136.0,46.746,147.1879,,1,17
1,1,870948450,"Abigayil: The Story of the Cat at the Manger, ...","one of a kind,authentic,original,gift,special,...",physical,4,books_movies_and_music,345,books_movies_and_music.books.history_books,Rouben Mamoulian was an American film and thea...,...,[],11973017,6443070,170.666667,80.0,175.7408,83.3202,,1,17
2,2,1111113690,"No Fighting, No Biting! by Else Holmelund Mina...","Else Minarik,Maurice Sendak,Vintage Children,L...",physical,4,books_movies_and_music,329,books_movies_and_music.books.book_accessories....,"A fun little vintage book “No Fighting, No Bit...",...,[],16711422,9604741,254.0,140.333333,253.9746,142.1554,,1,17
3,3,1510466600,Ukrainian book. &quot;Magic flashlight&quot;. ...,"Books,Ukrainian,New,Gift,Colorful,Best,Stories...",physical,4,books_movies_and_music,11368,books_movies_and_music.books.childrens_books.b...,Ukrainian book.\n&quot;Magic flashlight&quot;....,...,[],13423056,5341045,207.0,108.333333,207.3706,112.0979,,1,17
4,4,1035734450,The Ugly Butterfly,"moth metamorphosis,caterpillar bug,butterfly c...",physical,8,home_and_living,352,books_movies_and_music.books.science_and_math_...,PreOrder this imaginative story about the life...,...,[],7897442,5130282,115.666667,64.0,122.763,70.3662,,1,17


In [None]:
student_id = "23263834"
merged_df_test[
    ["product_id", "top_category_id", "bottom_category_id", "primary_color_id", "secondary_color_id"]
].to_parquet(f"/kaggle/working/predictions_{student_id}.parquet")