## E-commerce classifier
### Instruction
I ran this in Jupyter Notebook on a 13.3-inch M1 MacBook Pro with GPU acceleration enabled. The data files (`train.csv`, `test.csv`, and the `noisy-images/` folder) must be in the same directory as this notebook.

### Import

In [78]:
import os
import sys
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, LSTM, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.utils import to_categorical
from tqdm.keras import TqdmCallback
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score
import tensorflow as tf
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.layers import Reshape

### Data preprocess

In [79]:
# Read the training and test tables from the notebook's working directory.
print("Loading data...")
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Pick the TensorFlow device string: first GPU if one is visible, CPU otherwise.
import tensorflow as tf

cuda = tf.config.list_physical_devices('GPU')
if cuda:
    device = '/gpu:0'
else:
    device = '/cpu:0'

# preprocess categorical features
def preprocess_categorical_features(df, reference_df=None):
    """Integer-encode the categorical columns of ``df`` in place and return it.

    The columns 'gender', 'baseColour', 'season' and 'usage' are cast to str
    and replaced by ``LabelEncoder`` codes.

    Args:
        df: DataFrame holding the four raw (string) categorical columns.
        reference_df: Optional second DataFrame holding the same raw columns.
            When given, each column's encoder is fitted on the values of BOTH
            frames, so two frames encoded against each other share a single
            value -> code mapping. When omitted, the encoder is fitted on
            ``df`` alone (the original behaviour).

    Returns:
        The same ``df`` object, with the four columns overwritten by integers.
    """
    categorical_columns = ['gender', 'baseColour', 'season', 'usage']
    for column in categorical_columns:
        label_encoder = LabelEncoder()
        values = df[column].astype(str)
        if reference_df is None:
            df[column] = label_encoder.fit_transform(values)
        else:
            # Fit on the union so a value unique to either frame still gets a
            # code, and a shared value gets the SAME code in both frames.
            label_encoder.fit(pd.concat([values, reference_df[column].astype(str)]))
            df[column] = label_encoder.transform(values)
    return df

print("Preprocessing categorical features...")
# Encoding overwrites the columns, so snapshot the raw strings of both frames
# first; each frame is then encoded against the other's raw values.
_raw_categorical_columns = ['gender', 'baseColour', 'season', 'usage']
raw_train_categoricals = train_df[_raw_categorical_columns].copy()
raw_test_categoricals = test_df[_raw_categorical_columns].copy()
train_df = preprocess_categorical_features(train_df, reference_df=raw_test_categoricals)
test_df = preprocess_categorical_features(test_df, reference_df=raw_train_categoricals)

# NLTK pieces used by preprocess_text_data: stop-word list, tokenizer, lemmatizer.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the NLTK resources at run time; the download is skipped when the
# package is already present (see the "already up-to-date" lines in the output).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

def preprocess_text_data(df, vectorizer=None, fit=True):
    """Clean ``noisyTextDescription`` and return its TF-IDF features.

    Side effect: adds (or overwrites) a 'processedText' column on ``df`` that
    holds the cleaned text.

    Args:
        df: DataFrame with a 'noisyTextDescription' column.
        vectorizer: Optional ``TfidfVectorizer`` to use. Pass the same instance
            for the training and the test frame so both get identical feature
            columns. When omitted, a fresh ``TfidfVectorizer(max_features=1000)``
            is created and fitted on ``df`` (the original behaviour).
        fit: When True the vectorizer is fitted on ``df`` before transforming;
            when False an already fitted ``vectorizer`` is only applied.

    Returns:
        DataFrame of TF-IDF weights, one row per row of ``df`` (default integer
        index) and one column per vocabulary term.
    """
    print("Preprocessing text data...")
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Missing descriptions become empty strings so word_tokenize never receives NaN.
    cleaned = df['noisyTextDescription'].fillna('').astype(str).str.lower()
    # Drop everything except letters and whitespace.
    cleaned = cleaned.str.replace(r'[^a-zA-Z\s]', '', regex=True)
    # Tokenize, drop stop words, lemmatize, and join back into one string.
    df['processedText'] = cleaned.apply(
        lambda text: ' '.join(
            lemmatizer.lemmatize(word) for word in word_tokenize(text) if word not in stop_words
        )
    )

    if vectorizer is None:
        vectorizer = TfidfVectorizer(max_features=1000)
        fit = True  # a brand-new vectorizer has no vocabulary to reuse

    if fit:
        text_features = vectorizer.fit_transform(df['processedText'])
    else:
        text_features = vectorizer.transform(df['processedText'])
    text_features_df = pd.DataFrame(text_features.toarray(), columns=vectorizer.get_feature_names_out())

    return text_features_df

# Fit the vocabulary / IDF weights on the training text only, then reuse them
# for the test text so both feature tables have the same columns.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
train_text_features = preprocess_text_data(train_df, tfidf_vectorizer, fit=True)
test_text_features = preprocess_text_data(test_df, tfidf_vectorizer, fit=False)


# preprocess image data
def preprocess_image_data(df, image_folder_path, image_shape=(60, 80)):
    """Load, resize, scale and flatten one JPEG per row of ``df``.

    Args:
        df: DataFrame with an 'id' column; the image of a row is read from
            ``<image_folder_path>/<id>.jpg``.
        image_folder_path: Directory containing the ``.jpg`` files.
        image_shape: ``(height, width)`` of the resized image in pixels. The
            default ``(60, 80)`` matches the ``reshape(-1, 60, 80, 3)`` that
            the notebook applies to these features before feeding the CNN.

    Returns:
        DataFrame with one row per image and ``height * width * 3`` columns:
        pixel values divided by 255, flattened in (row, column, channel) order.

    Raises:
        FileNotFoundError: if an image file is missing or cannot be decoded.
    """
    print("Preprocessing image data...")
    height, width = image_shape
    image_data = []
    for image_id in tqdm(df['id'], total=df.shape[0]):
        image_path = os.path.join(image_folder_path, f"{image_id}.jpg")
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise FileNotFoundError(f"Could not read image: {image_path}")
        # cv2.resize expects dsize as (width, height); the returned array is
        # (height, width, channels), which is the layout the later reshape assumes.
        image = cv2.resize(image, (width, height)) / 255.0
        image_data.append(image.flatten())
    image_data_np = np.array(image_data)
    image_data_df = pd.DataFrame(image_data_np)
    return image_data_df

train_image_features = preprocess_image_data(train_df, 'noisy-images/noisy-images')
test_image_features = preprocess_image_data(test_df, 'noisy-images/noisy-images')


# DAE for image features
def create_dae(input_dim):
    """Build and compile a dense autoencoder over ``input_dim`` features.

    Architecture: input -> 1024 -> 512 -> 1024 (all ReLU) -> ``input_dim``
    sigmoid outputs. Compiled with the Adam optimizer and mean-squared-error loss.

    Args:
        input_dim: Number of input (and reconstructed output) features.

    Returns:
        The compiled Keras ``Model``.
    """
    dae_input = Input(shape=(input_dim,))
    hidden = dae_input
    # Symmetric bottleneck: widen, squeeze, widen again.
    for units in (1024, 512, 1024):
        hidden = Dense(units, activation='relu')(hidden)
    dae_output = Dense(input_dim, activation='sigmoid')(hidden)
    dae = Model(dae_input, dae_output)
    dae.compile(optimizer='adam', loss='mse')
    return dae

# Train DAE for image features
# The autoencoder is fitted to reconstruct its own input: the same frame is
# passed as both features and targets, and no noise is added here.
# NOTE(review): `image_dae` is not referenced again in the visible cells, so
# this training step does not appear to feed any downstream model — confirm.
print("Training DAE for image features...")
image_dae = create_dae(train_image_features.shape[1])
with tf.device(device):
    image_dae.fit(train_image_features, train_image_features, epochs=10, batch_size=128)


Loading data...
Preprocessing categorical features...


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shonnli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/shonnli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/shonnli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessing text data...
Preprocessing text data...
Preprocessing image data...


100%|███████████████████████████████████| 21627/21627 [00:06<00:00, 3525.87it/s]


Preprocessing image data...


100%|███████████████████████████████████| 21628/21628 [00:06<00:00, 3443.11it/s]


Training DAE for image features...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Data splitting for training and validation

In [80]:
# Split the data into train and validation sets
# NOTE: this cell rebinds train_df / train_image_features / train_text_features
# to their 90% training part, so re-running it without re-running the
# preprocessing cell would split the already-split data again.
print("Splitting the data into train and validation sets...")
train_df, val_df, train_image_features, val_image_features, train_text_features, val_text_features = train_test_split(
    train_df,
    train_image_features,
    train_text_features,
    test_size=0.1,
    random_state=42,
    stratify=train_df["category"]
)

# Get the target labels for train and validation sets
print("Processing target labels for train and validation sets...")
y_train = train_df["category"]

X_val = val_df.drop(columns=["id", "category", "noisyTextDescription"])
y_val = val_df["category"]

# The encoder is fitted on the training labels only and reused for validation.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)

# One-hot encode target labels
# Pass the class count explicitly: by default to_categorical sizes its output
# from the largest label present, so a split that happens to miss the highest
# class would produce a narrower matrix than the model's output layer.
num_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(y_train, num_classes=num_classes)
y_val_categorical = to_categorical(y_val, num_classes=num_classes)

# Image tensors for the CNN
# NOTE(review): this assumes each flattened row was produced from a
# (60 rows, 80 columns, 3 channels) array — verify against
# preprocess_image_data, keeping in mind cv2.resize takes (width, height).
print("Preparing data for CNN and LSTM...")
X_train_img_array = np.array(train_image_features).reshape(-1, 60, 80, 3)
X_val_img_array = np.array(val_image_features).reshape(-1, 60, 80, 3)

# TF-IDF text features (kept under their original names)
X_train_text = train_text_features
X_val_text = val_text_features

print("X_train_img_array shape:", X_train_img_array.shape)
print("y_train_categorical shape:", y_train_categorical.shape)
print("X_val_img_array shape:", X_val_img_array.shape)
print("y_val_categorical shape:", y_val_categorical.shape)


Splitting the data into train and validation sets...
Processing target labels for train and validation sets...
Preparing data for CNN and LSTM...
X_train_img_array shape: (19464, 60, 80, 3)
y_train_categorical shape: (19464, 27)
X_val_img_array shape: (2163, 60, 80, 3)
y_val_categorical shape: (2163, 27)


### Train Image

In [81]:
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler

class CNN:
    """Small convolutional image classifier wrapped around a Keras ``Sequential``.

    The network is four Conv2D(3x3, ReLU) -> BatchNormalization -> MaxPooling2D(2x2)
    stages with 32, 64, 128 and 256 filters, followed by Flatten, Dense(512, ReLU),
    Dropout(0.3) and a softmax layer with ``num_classes`` units.
    """

    def __init__(self, input_shape, num_classes):
        """Build and compile the model.

        Args:
            input_shape: Shape of one input sample; it is reshaped to (60, 80, 3).
            num_classes: Number of output classes.
        """
        stack = [
            InputLayer(input_shape=input_shape),
            Reshape((60, 80, 3), input_shape=input_shape),
        ]
        for filters in (32, 64, 128, 256):
            stack.append(Conv2D(filters, kernel_size=(3, 3), activation="relu"))
            stack.append(BatchNormalization())
            stack.append(MaxPooling2D(pool_size=(2, 2)))
        stack.extend([
            Flatten(),
            Dense(512, activation="relu"),
            Dropout(0.3),
            Dense(num_classes, activation="softmax"),
        ])

        self.model = Sequential()
        for layer in stack:
            self.model.add(layer)

        self.model.compile(
            loss="categorical_crossentropy",
            optimizer=Adam(learning_rate=0.001),
            metrics=["accuracy"],
        )

    def train(self, X_train, y_train, X_val, y_val, epochs, batch_size):
        """Fit the model, cutting the learning rate tenfold at epochs 10, 20, ...

        Args:
            X_train, y_train: Training inputs and one-hot targets.
            X_val, y_val: Validation inputs and one-hot targets.
            epochs: Number of training epochs.
            batch_size: Mini-batch size.
        """
        def lr_schedule(epoch, lr):
            # Epoch indices are zero-based; epoch 0 keeps the initial rate.
            return lr * 0.1 if epoch > 0 and epoch % 10 == 0 else lr

        self.model.fit(
            X_train,
            y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=[LearningRateScheduler(lr_schedule)],
        )

    def predict(self, X_test):
        """Return the model's class-probability predictions for ``X_test``."""
        return self.model.predict(X_test)

# Train CNN for images
# Print the array shapes once more as a sanity check before training.
print("X_train_img_array shape:", X_train_img_array.shape)
print("y_train_categorical shape:", y_train_categorical.shape)
print("X_val_img_array shape:", X_val_img_array.shape)
print("y_val_categorical shape:", y_val_categorical.shape)
# Train CNN for images
# The class count is taken from the distinct encoded training labels.
cnn_model = CNN((60, 80, 3), len(np.unique(y_train)))
# Run training on the device string chosen in the data-loading cell.
with tf.device(device):
    cnn_model.train(X_train_img_array, y_train_categorical, X_val_img_array, y_val_categorical, epochs=15, batch_size=128)


X_train_img_array shape: (19464, 60, 80, 3)
y_train_categorical shape: (19464, 27)
X_val_img_array shape: (2163, 60, 80, 3)
y_val_categorical shape: (2163, 27)




Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


### Train Text

In [82]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import TFBertModel, BertTokenizer, BertConfig

from transformers import TFDistilBertModel, DistilBertTokenizer, DistilBertConfig

# Load BERT tokenizer and BERT model
# The names say "bert", but the objects are the DistilBERT tokenizer, config
# and TF model, all loaded from the 'distilbert-base-uncased' checkpoint.
bert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# output_hidden_states=True is requested here; build_bert_model only reads
# element [0] of the model output.
bert_config = DistilBertConfig.from_pretrained('distilbert-base-uncased', output_hidden_states=True)
bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=bert_config)

# Preprocess text data with BERT tokenizer
def bert_encode(texts, tokenizer, max_len=64):
    """Tokenize ``texts`` into fixed-length id and attention-mask arrays.

    Args:
        texts: Iterable of strings. ``None`` and NaN entries are encoded as the
            empty string; any other non-string value is converted with ``str``.
        tokenizer: A Hugging Face tokenizer, callable on a list of strings.
        max_len: Every sequence is truncated / padded to this many tokens.

    Returns:
        ``(input_ids, attention_masks)``: two int32 numpy arrays of shape
        ``(len(texts), max_len)``.
    """
    # `text != text` is True only for NaN, which pandas uses for missing text.
    clean_texts = ["" if text is None or text != text else str(text) for text in texts]

    if not clean_texts:
        empty = np.zeros((0, max_len), dtype=np.int32)
        return empty, empty.copy()

    # One batched call returning numpy arrays, instead of one encode_plus call
    # and one TensorFlow tensor per row.
    encoded = tokenizer(clean_texts, max_length=max_len, truncation=True,
                        padding='max_length', add_special_tokens=True,
                        return_attention_mask=True, return_tensors='np')

    # int32 matches the dtype declared on the model's Input layers.
    input_ids = np.asarray(encoded['input_ids'], dtype=np.int32)
    attention_masks = np.asarray(encoded['attention_mask'], dtype=np.int32)
    return input_ids, attention_masks

# Define BERT-based model
def build_bert_model(bert_model, num_classes, max_len=64):
    """Wrap a transformer encoder in a small softmax classification head.

    Args:
        bert_model: Encoder called as ``bert_model(input_ids, attention_mask=...)``;
            element ``[0]`` of its output is used as the per-token hidden states.
        num_classes: Number of output classes.
        max_len: Token length of the two inputs. It must equal the ``max_len``
            used when encoding the text; the default 64 matches ``bert_encode``.

    Returns:
        A compiled Keras model taking ``[input_ids, attention_masks]`` and
        returning class probabilities (Adam, lr=1e-5, categorical cross-entropy).
    """
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    attention_masks = Input(shape=(max_len,), dtype=tf.int32, name='attention_masks')

    bert_output = bert_model(input_ids, attention_mask=attention_masks)[0]
    # Classify from the hidden state of the first token position only.
    x = Dropout(0.2)(bert_output[:, 0, :])
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.2)(x)
    output = Dense(num_classes, activation='softmax')(x)

    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=output)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
                  loss='categorical_crossentropy', metrics=['accuracy'])

    return model


# Preprocess text data
# The raw 'noisyTextDescription' strings are tokenized here; the cleaned
# 'processedText' column and the TF-IDF features are not used by this model.
train_texts = train_df['noisyTextDescription'].values
val_texts = val_df['noisyTextDescription'].values
X_train_text_ids, X_train_text_masks = bert_encode(train_texts, bert_tokenizer)
X_val_text_ids, X_val_text_masks = bert_encode(val_texts, bert_tokenizer)

# Train BERT-based model
# The class count is taken from the distinct encoded training labels.
# NOTE(review): unlike the CNN and DAE cells, this fit is not wrapped in
# `with tf.device(device)` — confirm whether that is intentional.
print("Training BERT-based model...")
bert_based_model = build_bert_model(bert_model, len(np.unique(y_train)))
history = bert_based_model.fit([X_train_text_ids, X_train_text_masks], y_train_categorical,
                               validation_data=([X_val_text_ids, X_val_text_masks], y_val_categorical),
                               epochs=3, batch_size=16,
                               callbacks=[tf.keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)])

# Print training progress
# history.history maps each metric name to its list of per-epoch values.
print("\nTraining history:")
for key, values in history.history.items():
    print(f"{key}: {values}")

print('\nBERT model training finished')




Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_transform', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Training BERT-based model...




Epoch 1/3
Epoch 2/3
Epoch 3/3

Training history:
loss: [1.5125396251678467, 1.065078854560852, 0.9396510124206543]
accuracy: [0.644112229347229, 0.7712700366973877, 0.7964960932731628]
val_loss: [1.0723285675048828, 0.9879311323165894, 0.9584036469459534]
val_accuracy: [0.7609801292419434, 0.7877947092056274, 0.7933425903320312]

BERT model training finished


### Train Categorical Features

In [83]:
# Feature tables for the gradient-boosting model: the four encoded categorical columns.
categorical_columns = ['gender', 'baseColour', 'season', 'usage']
X_train_gb = train_df[categorical_columns]
X_val_gb = val_df[categorical_columns]
X_test_gb = test_df[categorical_columns]

print("X_train_gb shape:", X_train_gb.shape)
print("y_train shape:", y_train.shape)
print("X_val_gb shape:", X_val_gb.shape)
print("X_test_gb shape:", X_test_gb.shape)

# Train Gradient Boosting for categorical features
# A fixed random_state makes the fitted model reproducible across runs.
gb_model = GradientBoostingClassifier(random_state=42)
print("start GB fitting")
gb_model.fit(X_train_gb, y_train)
print("finished GB fitting")

accuracy = gb_model.score(X_val_gb, y_val)
print("Validation accuracy:", accuracy)

from sklearn.metrics import log_loss


y_pred = gb_model.predict_proba(X_val_gb)
# predict_proba has one column per class seen in training; passing those
# classes as `labels` keeps log_loss aligned even if the validation split
# happens to miss one of them.
loss = log_loss(y_val, y_pred, labels=gb_model.classes_)
print("Validation loss:", loss)


X_train_gb shape: (19464, 4)
y_train shape: (19464,)
X_val_gb shape: (2163, 4)
X_test_gb shape: (21628, 4)
start GB fitting
finished GB fitting
Validation accuracy: 0.5376791493296348
Validation loss: 1.4741624269915423


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from tqdm.auto import tqdm
from sklearn.utils import shuffle

from sklearn.metrics import log_loss

# Feature tables for the gradient-boosting model: the four encoded categorical columns.
categorical_columns = ['gender', 'baseColour', 'season', 'usage']
X_train_gb = train_df[categorical_columns]
X_val_gb = val_df[categorical_columns]
X_test_gb = test_df[categorical_columns]

print("X_train_gb shape:", X_train_gb.shape)
print("y_train shape:", y_train.shape)
print("X_val_gb shape:", X_val_gb.shape)
print("X_test_gb shape:", X_test_gb.shape)

# Hyperparameters to tune
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 7],
    'min_samples_split': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3]
}

# Unfitted template for the search. It is deliberately NOT bound to the name
# `gb_model`: RandomizedSearchCV fits clones of the estimator it is given, so
# the template itself stays unfitted, and rebinding `gb_model` to it would
# make the later `gb_model.predict_proba(...)` calls fail.
base_gb_model = GradientBoostingClassifier(random_state=42)

# Randomized search (seeded so the sampled parameter sets are reproducible)
random_search = RandomizedSearchCV(base_gb_model, param_distributions=param_dist, n_iter=20, cv=3,
                                   n_jobs=-1, verbose=1, random_state=42)
print("start GB fitting")
random_search.fit(X_train_gb, y_train)
print("finished GB fitting")

# Best model
# With the default refit=True, best_estimator_ has already been refitted on the
# whole of X_train_gb using the best parameters, so no further training loop is
# needed. (A warm-start loop that calls fit() again without raising
# n_estimators has no boosting stages left to add.)
best_gb_model = random_search.best_estimator_

# Make the tuned, fitted model the one used by the ensemble cell.
gb_model = best_gb_model

# Evaluate the model
accuracy = best_gb_model.score(X_val_gb, y_val)
print("Validation accuracy:", accuracy)

y_pred = best_gb_model.predict_proba(X_val_gb)
# Pass the training classes so the probability columns stay aligned even if the
# validation split misses a class.
loss = log_loss(y_val, y_pred, labels=best_gb_model.classes_)
print("Validation loss:", loss)


### Ensemble prediction and submission

In [None]:
# Tokenize the test descriptions for the BERT branch.
test_texts = test_df['noisyTextDescription'].values
X_test_text_ids, X_test_text_masks = bert_encode(test_texts, bert_tokenizer)

# --- Validation set: class probabilities from each model ---
# NOTE: gb_model must be a fitted estimator at this point.
print("Predicting with CNN...")
cnn_pred = cnn_model.model.predict(X_val_img_array)
print("Predicting with BERT...")
bert_pred = bert_based_model.predict([X_val_text_ids, X_val_text_masks])
print("Predicting with Gradient Boosting...")
gb_pred = gb_model.predict_proba(X_val_gb)

# Fixed blending weights, in the order CNN, BERT, gradient boosting.
weights = [0.382, 0.382, 0.236]
w_cnn, w_bert, w_gb = weights
print("Ensembling predictions with weighted averaging...")
ensemble_pred = w_cnn * cnn_pred + w_bert * bert_pred + w_gb * gb_pred

# The most probable class per row, scored against the encoded validation labels.
ensemble_pred_labels = ensemble_pred.argmax(axis=1)
ensemble_accuracy = accuracy_score(y_val, ensemble_pred_labels)
print(f"Ensemble accuracy: {ensemble_accuracy}")

# --- Test set: same three models, same weights ---
print("Predicting test set with CNN...")
X_test_img_array = test_image_features.to_numpy().reshape(-1, 60, 80, 3)
cnn_test_pred = cnn_model.model.predict(X_test_img_array)
print("Predicting with BERT...")
bert_test_pred = bert_based_model.predict([X_test_text_ids, X_test_text_masks])
print("Predicting test set with Gradient Boosting...")
gb_test_pred = gb_model.predict_proba(X_test_gb)

print("Ensembling test set predictions...")
ensemble_test_pred = w_cnn * cnn_test_pred + w_bert * bert_test_pred + w_gb * gb_test_pred
ensemble_test_labels = ensemble_test_pred.argmax(axis=1)

# Map the integer classes back to category names and write the submission file.
print("Saving predictions to result.csv...")
test_df['category'] = label_encoder.inverse_transform(ensemble_test_labels)
submission = test_df[['id', 'category']]
submission.to_csv('result.csv', index=False)
print("Done.")


Predicting with CNN...
Predicting with BERT...

### Environment check: TensorFlow devices

In [17]:
import tensorflow as tf
import os

os.environ['TF_DETERMINISTIC_OPS'] = '1'
import tensorflow as tf
# Print every physical device TensorFlow can see, then the CPU, GPU and MLC
# subsets one by one; an empty list means no device of that type was found.
for device_type in (None, 'CPU', 'GPU', 'MLC'):
    print(tf.config.list_physical_devices(device_type))

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]
[]
[]
