In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns

train_csv = pd.read_parquet('/content/drive/MyDrive/train_processed.parquet')
test_csv = pd.read_parquet('/content/drive/MyDrive/test_processed.parquet')

In [3]:
!pip install pytorch_lightning
!pip install category_encoders
!pip install feature_engine

Collecting pytorch_lightning
  Downloading pytorch_lightning-2.0.5-py3-none-any.whl (722 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m722.4/722.4 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch_lightning)
  Downloading torchmetrics-1.0.0-py3-none-any.whl (728 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m728.8/728.8 kB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.7.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.9.0-py3-none-any.whl (23 kB)
Installing collected packages: lightning-utilities, torchmetrics, pytorch_lightning
Successfully installed lightning-utilities-0.9.0 pytorch_lightning-2.0.5 torchmetrics-1.0.0
Collecting category_encoders
  Downloading category_encoders-2.6.1-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages:

In [4]:
#keras tokenizer
from keras.preprocessing import text
from keras.preprocessing import sequence # for import pad_sequences

import torch
import torch.nn as nn
import torchtext

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from pytorch_lightning.callbacks import ModelSummary, LearningRateMonitor

from sklearn import model_selection
import joblib


import socket
import re
import gc
import random
import numpy as np

is_cuda = torch.cuda.is_available()


if is_cuda:
    print(is_cuda)
    print(torch.cuda.current_device())
    print(torch.cuda.device_count())
    print(torch.cuda.get_device_name(0))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

else:
    device = torch.device("cpu")


print('Using device:', device)
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True



import category_encoders as ce
import feature_engine.encoding as fe
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer

from sklearn.pipeline import Pipeline

import sklearn.metrics as metrics

True
0
1
Tesla T4
Using device: cuda


In [5]:
columns_numerical = ['is_title_missing' , 'is_description_missing' , 'is_bullet_missing' , 'Desc_len' , 'title_len' , 'Bullet_len']

encoder_numerical = Pipeline([
    ("selector", ColumnTransformer([("selector", "passthrough", columns_numerical)], remainder="drop")),
    ("normalizer", StandardScaler())
])

catgorical_columns = ['PRODUCT_TYPE_ID']

columns_text = ['TITLE' , 'DESCRIPTION' , 'BULLET_POINTS']

for num in columns_numerical:
    print(num)
    if train_csv[num].dtype != np.float32:
        print(f"converting {num} to float32")
        train_csv[num] = train_csv[num].astype(np.float32)

is_title_missing
converting is_title_missing to float32
is_description_missing
converting is_description_missing to float32
is_bullet_missing
converting is_bullet_missing to float32
Desc_len
converting Desc_len to float32
title_len
converting title_len to float32
Bullet_len
converting Bullet_len to float32


In [6]:
train_csv['PRODUCT_TYPE_ID'] = train_csv['PRODUCT_TYPE_ID'].astype('category')
test_csv['PRODUCT_TYPE_ID'] = test_csv['PRODUCT_TYPE_ID'].astype('category')

In [7]:
train_csv['length_log'] = np.log1p(train_csv['PRODUCT_LENGTH'])

In [8]:
Train , Val_Test = model_selection.train_test_split(train_csv , test_size = 0.2 , random_state = 42)
Validation , Test = model_selection.train_test_split(Train , test_size = 0.2 , random_state = 42)

In [9]:
target = 'length_log'

X_train = Train
Y_train = Train[target]

X_Validation = Validation
Y_Validation = Validation[target]

X_Test = Test
Y_Test = Test[target]

In [10]:
import dataset

In [11]:
target = 'length_log'
dd_train, dd_validation = dataset.build_pytorch_dataset(X_train,
                                         X_Validation,
                                         encoder_numerical = encoder_numerical,
                                         categorical_names = catgorical_columns,
                                         text_names = columns_text,
                                         char_names = columns_text,
                                         encoder_target = PowerTransformer(method = "box-cox"),
                                         target_name = target,
                                         verbose = True)

dd_test = dataset.build_test_dataset(dd_train , X_Test , verbose = True)

target: length_log
train: (1799758, 13)
test: (1439806, 13)
train set mode
=> target encoding
=> numerical encoding
=> categorical encoding
=> tokenizing TITLE
==> TITLE vocabulary size 768429 
=> tokenizing DESCRIPTION
==> DESCRIPTION vocabulary size 578497 
=> tokenizing BULLET_POINTS
==> BULLET_POINTS vocabulary size 473100 
=> tokenizing chars TITLE
==> TITLE vocabulary size 2333 
=> tokenizing chars DESCRIPTION
==> DESCRIPTION vocabulary size 4886 
=> tokenizing chars BULLET_POINTS
==> BULLET_POINTS vocabulary size 3154 
target min, max range (-4.826471725694244, 20.179163219126)
test set mode
=> target encoding
=> numerical encoding
=> categorical encoding
TITLE vocabulary size 768429
DESCRIPTION vocabulary size 578497
BULLET_POINTS vocabulary size 473100
TITLE vocabulary size 2333
DESCRIPTION vocabulary size 4886
BULLET_POINTS vocabulary size 3154
target min, max range (-4.826471725694244, 17.99748060663224)
target: length_log
train: 1799758
test: (359952, 13)
test set mode
=> t

In [28]:
del X_train , X_Test , X_Validation
del Train , Val_Test
del Validation , Test
del train_csv

In [12]:
import model

In [25]:
batch_size = 300
train_loader = DataLoader(dd_train, shuffle = True, batch_size = batch_size, collate_fn = dataset.pytorch_collate_fn)
validation_loader = DataLoader(dd_validation, shuffle = False, batch_size = batch_size, collate_fn = dataset.pytorch_collate_fn)

In [29]:
del dd_train , dd_validation

In [14]:
import warnings
warnings.filterwarnings('ignore')

In [16]:
gc.collect()
torch.cuda.empty_cache()

is_target_log = True
import torchmetrics as tm
metric_to_monitor = "rmsle"
metric = tm.MeanSquaredError(squared=False) if is_target_log else tm.MeanSquaredLogError()

Model = model.PytorchModel(target_encoder = dd_train.get_encoder_target(),
                                            is_target_log = True,
                                            optimizer = "Adam",
                                            metric_to_monitor = metric_to_monitor,
                                            numerical_input_size=dd_train.get_data_numerical().shape[1],
                                            numerical_batch_normalization = True,
                                            categorical_embedding_size=dd_train.get_data_categorical_embedding_sizes(),
                                            categorical_embedding_dropout = 0.4,
                                            text_as_embedding_bag = False,
                                            text_as_embedding_bag_mode = "mean",
                                            text_vocabulary_size = dd_train.get_text_vocabulary_size(),
                                            text_embedding_dimension = 50,
                                            text_bidirectional = True,
                                            text_recurrent_hidden_size = 100,
                                            text_recurrent_layers = 2,
                                            text_rnn = "GRU",
                                            char_vocabulary_size = dd_train.get_char_vocabulary_size(),
                                            char_embedding_dimension = 40,
                                            char_bidirectional = False,
                                            char_recurrent_hidden_size = 50,
                                            char_recurrent_layers = 1,
                                            char_rnn = "LSTM",
                                            linear_layer_skip_connections = (3, ([1024], [0.3])),
                                            linear_layers = ([512], [0.2]),
                                            linear_layer_normalization = "BatchNorm1d",
                                            normalization_before_activation = True,
                                            linear_layer_activation = nn.ReLU(inplace=True),
                                            final_linear_layer=True,
                                            final_normalization = False,
                                            loss_function = nn.MSELoss(),
                                            learning_rate = 0.001,
                                            verbose = True
                              )
Model

"categorical_embedding_dropout":   0.4
"categorical_embedding_size":      [(12489, 315)]
"char_bidirectional":              False
"char_embedding_dimension":        40
"char_recurrent_hidden_size":      50
"char_recurrent_layers":           1
"char_rnn":                        LSTM
"char_vocabulary_size":            {'TITLE': 2333, 'DESCRIPTION': 4886, 'BULLET_POINTS': 3154}
"final_linear_layer":              True
"final_normalization":             False
"is_target_log":                   True
"learning_rate":                   0.001
"linear_layer_activation":         ReLU(inplace=True)
"linear_layer_normalization":      BatchNorm1d
"linear_layer_skip_connections":   (3, ([1024], [0.3]))
"linear_layers":                   ([512], [0.2])
"loss_function":                   MSELoss()
"metric_to_monitor":               rmsle
"normalization_before_activation": True
"numerical_batch_normalization":   True
"numerical_input_size":            6
"optimizer":                       Adam
"pretraine

PytorchModel(
  (metric): MeanSquaredError()
  (loss_function): MSELoss()
  (embeds): ModuleList(
    (0): Embedding(12489, 315, padding_idx=0)
  )
  (categorical_dropout): Dropout(p=0.4, inplace=False)
  (batch_normalization_numerical): BatchNorm1d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (text_embeddings): ModuleList(
    (0): TextRecurrentLayer(
      (embedding): Embedding(768429, 50, padding_idx=0)
      (rnn): GRU(50, 100, num_layers=2, batch_first=True, bidirectional=True)
      (last_time_step): LastTimeStep()
    )
    (1): TextRecurrentLayer(
      (embedding): Embedding(578497, 50, padding_idx=0)
      (rnn): GRU(50, 100, num_layers=2, batch_first=True, bidirectional=True)
      (last_time_step): LastTimeStep()
    )
    (2): TextRecurrentLayer(
      (embedding): Embedding(473100, 50, padding_idx=0)
      (rnn): GRU(50, 100, num_layers=2, batch_first=True, bidirectional=True)
      (last_time_step): LastTimeStep()
    )
  )
  (char_embeddings): M

In [17]:
model_dump_path = '/content/drive/MyDrive/AmazonData/model'

In [23]:

# early_stop_callback = EarlyStopping(min_delta=0.00, patience=5, mode="min", verbose = True)

checkpoint_filename = "epoch{epoch:02d}-loss{loss:.2f}-val_loss{val_loss:.2f}"
model_checkpoint_callback = ModelCheckpoint(monitor = metric_to_monitor , filename=checkpoint_filename,
                                            auto_insert_metric_name=False,
                                            dirpath=model_dump_path,
                                            save_weights_only = False,
                                            verbose = True)


epochs = 2
enable_model_summary = False

print(f"epochs: {epochs}")

#limit_train_batches=0.1
trainer = pl.Trainer(precision= '16-mixed',
                     accelerator = "auto",
                     devices = 1,
                     enable_checkpointing = True,
                     check_val_every_n_epoch  = 1,
                     max_epochs=epochs,
                     enable_model_summary = enable_model_summary,
                     default_root_dir = "./",
                     enable_progress_bar = True,
                     deterministic = False,
                     callbacks=[model_checkpoint_callback])

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


epochs: 2


In [20]:
train_csv['length_log'].dtype

dtype('float64')

In [24]:
torch.cuda.empty_cache()

In [None]:
trainer.fit(Model, train_dataloaders=train_loader , val_dataloaders  = validation_loader)