In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

import warnings
warnings.filterwarnings("ignore")

import nltk
import re
import string
from collections import Counter
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
print(pd.__version__)

2.1.4


In [3]:
df = pd.read_csv('/content/train.csv', engine='python', on_bad_lines='skip', quotechar='"')


In [4]:
df.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424


In [5]:
df.shape

(161198, 6)

In [6]:
df['BULLET_POINTS'].iloc[0]

'[LUXURIOUS & APPEALING: Beautiful custom-made curtains to decorate any home or office | Includes inbuilt tieback to hold the curtain | Completely finished and ready to hang on walls & windows,MATERIAL: Luxurious & versatile fabric with a natural finish | High colour fastness | State-of-the-art digital printing ensures colour consistency and prevents any fading | Eyelets; Cotton Canvas; Width 4.5feet (54inch) | Multicolour | PACKAGE: 2 Room Curtains Eyelets | SIZE: Height 5 feet (60 inch); SET OF 2 PCS,BLACKOUT CURTAIN: 100% opaque & heavy premium cotton canvas fabric | Tight knitted, long life & durable fabric | Printing only on front side with a plain colour back side,MADE TO PERFECTION: Large eyelets at the top to put hanging hooks | Perfectly tailored seams for durability | Refined stitching with a matching thread color,QUALITY ASSURED: Gentle wash with similar colors in cold water | Avoid direct sunlight to prevent fading | Dispatched after MULTIPLE QUALITY CHECKS]'

In [7]:
df["PRODUCT_ID"].nunique()

161198

In [8]:
df.shape

(161198, 6)

In [9]:
## We can make PRODUCT_ID as index
df.set_index('PRODUCT_ID', inplace=True)

In [10]:
df.index
## we updated the indexes of dataframe

Index([1925202, 2673191, 2765088, 1594019,  283658, 2152929,  413758, 2026580,
       2050239, 2998633,
       ...
       1114889, 1881000, 2442353, 2165496,  547421,  576481, 1550875, 1628880,
       2818107, 1106389],
      dtype='int64', name='PRODUCT_ID', length=161198)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 161198 entries, 1925202 to 1106389
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   TITLE            161197 non-null  object 
 1   BULLET_POINTS    101128 non-null  object 
 2   DESCRIPTION      78068 non-null   object 
 3   PRODUCT_TYPE_ID  161198 non-null  int64  
 4   PRODUCT_LENGTH   161198 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 7.4+ MB


In [12]:
def df_information(data):
  shape         = data.shape
  info          = data.info()
  describe      = data.describe()
  column_name   = data.columns
  null_values   = data.isnull().sum()

  return shape, info, describe, column_name, null_values

In [13]:
df.isna().sum()

Unnamed: 0,0
TITLE,1
BULLET_POINTS,60070
DESCRIPTION,83130
PRODUCT_TYPE_ID,0
PRODUCT_LENGTH,0


In [14]:
## Joining the columns which are the inputs
new_df = df.copy()
# need to fill the nan values else it gonna give error
new_df["inputs"] = df[["TITLE", "BULLET_POINTS", "DESCRIPTION"]].fillna(" ").apply(lambda x: ' '.join(x), axis=1)

In [15]:
new_df.head()

Unnamed: 0_level_0,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH,inputs
PRODUCT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98,ArtzFolio Tulip Flowers Blackout Curtain for D...
2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...
2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495,PRIKNIK Horn Red Electric Air Horn Compressor ...
1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574,ALISHAH Women's Cotton Ankle Length Leggings C...
283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424,The United Empire Loyalists: A Chronicle of th...


In [16]:
new_df = new_df.drop(["TITLE", "BULLET_POINTS", "DESCRIPTION", "PRODUCT_TYPE_ID"], axis=1)

In [17]:
new_df.head()

Unnamed: 0_level_0,PRODUCT_LENGTH,inputs
PRODUCT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1925202,2125.98,ArtzFolio Tulip Flowers Blackout Curtain for D...
2673191,393.7,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...
2765088,748.031495,PRIKNIK Horn Red Electric Air Horn Compressor ...
1594019,787.401574,ALISHAH Women's Cotton Ankle Length Leggings C...
283658,598.424,The United Empire Loyalists: A Chronicle of th...


In [18]:
new_df.dropna(axis=0, inplace=True)

In [19]:
new_df.isna().sum()

Unnamed: 0,0
PRODUCT_LENGTH,0
inputs,0


In [20]:
new_df.shape

(161198, 2)

In [21]:
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
def remove_emojis_and_symbols(text):
    # This regex targets most emojis, hearts, and various symbols
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # Emoticons
        u"\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        u"\U0001F680-\U0001F6FF"  # Transport & Map symbols
        u"\U0001F1E0-\U0001F1FF"  # Flags (iOS)
        u"\U00002702-\U000027B0"  # Dingbats
        u"\U000024C2-\U0001F251"  # Enclosed characters
        u"\U00002600-\U000026FF"  # Miscellaneous Symbols (includes hearts)
        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "]+", flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

In [23]:
# will start preprocessing the inputs
def preprocessing(data):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))

    def clean_sentence(sentence):
        # Convert to lowercase
        sentence = sentence.lower()

        # Remove non-alphanumeric characters
        sentence = re.sub("[^a-z0-9]", " ", sentence)

        # Remove URLs
        sentence = re.sub(r'(http|https|ftp|ssh)://[\w_-]+(?:\.[\w_-]+)+[\w.,@?^=%&/~+#-]*[\w@?^=%&/~+#-]?', '', sentence)

        # removing emojis and other symobls that are not cleaned
        sentence = remove_emojis_and_symbols(sentence)

        # Remove stopwords
        sentence = " ".join([word for word in sentence.split() if word not in stop_words])

        # Clean any potential HTML tags
        sentence = BeautifulSoup(sentence, "lxml").get_text()

        # Lemmatize words
        sentence = " ".join([lemmatizer.lemmatize(word) for word in sentence.split()])

        return sentence

    # Apply the cleaning function to each row
    data['inputs'] = data['inputs'].apply(clean_sentence)

    return data

In [24]:
cleaned_df     = preprocessing(new_df)

In [25]:
cleaned_df.head()

Unnamed: 0_level_0,PRODUCT_LENGTH,inputs
PRODUCT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1925202,2125.98,artzfolio tulip flower blackout curtain door w...
2673191,393.7,mark spencer girl pyjama set t86 2561c navy mi...
2765088,748.031495,priknik horn red electric air horn compressor ...
1594019,787.401574,alishah woman cotton ankle length legging comb...
283658,598.424,united empire loyalist chronicle great migration


In [26]:
cleaned_df.shape

(161198, 2)

In [27]:
def vocabulary(df):
    words = [word for sentence in df.inputs for word in sentence.split()]
    word_counts = Counter(words)
    unique_words = set(word_counts)
    sorted_words = word_counts.most_common()

    return sorted_words, unique_words

In [28]:
word_counts, unique_words = vocabulary(cleaned_df)

In [29]:
# word_counts

In [30]:
len(unique_words)

210495

In [31]:
words = []
for i, (w, c) in enumerate(word_counts):
  if c < 5:
    words.append(w)

In [32]:
len(words)

163670

In [33]:
def remove_rare_wors(word_counts):
  word_list = []
  for i, (word, cout) in enumerate(word_counts):
      if word.isdigit():
        continue
      elif cout > 5:
        word_list.append(word)

  return word_list

In [34]:
clenaed_vocabulary_   = remove_rare_wors(word_counts)

In [35]:
type(45)

int

In [36]:
len(clenaed_vocabulary_)

39871

In [37]:
# clenaed_vocabulary_

In [38]:
def word_to_int(clenaed_vocabulary_):
  word_to_numeric = {}
  for i, word in enumerate(clenaed_vocabulary_):
    word_to_numeric[word] = i + 1

  return word_to_numeric

In [39]:
word_to_numeric   = word_to_int(clenaed_vocabulary_)

In [40]:
# word_to_numeric

In [41]:
def sentence_to_int(data):
  data["inputs"] = data["inputs"].apply(lambda x: [word_to_numeric[word] for word in x.split() if word in clenaed_vocabulary_])

  return data

In [None]:
cleaned_df_to_numeric = sentence_to_int(cleaned_df)

In [None]:
cleaned_df_to_numeric["length"] = cleaned_df_to_numeric["inputs"].apply(lambda x: len(x))

In [None]:
cleaned_df_to_numeric.head()

In [None]:
cleaned_df_to_numeric.describe()

In [None]:
cleaned_df_to_numeric.shape

In [None]:
def boxplot(data):
  plt.figure(figsize=(14, 6))
  sns.boxplot(data)
  plt.show()

In [None]:
boxplot(cleaned_df_to_numeric["length"])

In [None]:
len(cleaned_df_to_numeric[cleaned_df_to_numeric["length"] >= 280])

In [None]:
len(cleaned_df_to_numeric[cleaned_df_to_numeric["length"] <= 6])

In [None]:
len(cleaned_df_to_numeric[cleaned_df_to_numeric["length"] == 0 ])

In [None]:
## remvoing the rows from the dataset where

def remove_outliers(data, col):

  Q1      = data[col].quantile(0.25)
  Q3      = data[col].quantile(0.75)
  IQR     = Q3 - Q1

  lower_bound = Q1 -1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR

  data = data[(data[col] >= lower_bound) & (data[col] <= upper_bound) & data[col] == 0]

  return data


In [None]:
cleaned_numeric_data   = remove_outliers(cleaned_df_to_numeric, "length")

In [None]:
boxplot(cleaned_numeric_data["length"])

In [None]:
cleaned_numeric_data2   = remove_outliers(cleaned_numeric_data, "length")

In [None]:
boxplot(cleaned_numeric_data2["length"])

In [None]:
cleaned_numeric_data2.describe()

In [None]:
def padding(sentences, seq_length=max(cleaned_numeric_data2["length"])):
  features = np.zeros((len(sentences), seq_length), dtype=int)

  for i, row in enumerate(sentences):
    if len(row) > 0:
      features[i, -len(row):] = np.array(row)[:seq_length]

  return features

In [None]:
padded_trained_data = padding(cleaned_numeric_data2["inputs"])

In [None]:
padded_trained_data[0]

In [None]:
def np_label(label_data):
  labels = np.array(label_data)

  return labels

In [None]:
cleaned_numeric_data2.columns

In [None]:
train_np_labels = np_label(cleaned_numeric_data2['PRODUCT_LENGTH'])

In [None]:
def tensor_dataset(data, labels):
  tensor_data = torch.from_numpy(data)
  tensor_labels = torch.from_numpy(labels)

  tensor_dataset = TensorDataset(tensor_data, tensor_labels)

  return tensor_dataset


In [None]:
def dataloader(dataset, batch_size):
  dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

  return dataloader

In [None]:
batch_size = 100
training_dataset = tensor_dataset(padded_trained_data, train_np_labels)
train_loader      = dataloader(training_dataset, batch_size=batch_size)

**Building A LSTM Model**

In [None]:
class LSTM_Model(nn.Module):
    def __init__(self, input_dim, hidden_dim, embedding_dim, hidden_layers):
        super(LSTM_Model, self).__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.hidden_layers = hidden_layers

        # Embedding layer
        self.embedding_layer = nn.Embedding(self.input_dim, self.embedding_dim)

        # LSTM layer
        self.lstm = nn.LSTM(self.embedding_dim,
                            self.hidden_dim,
                            self.hidden_layers,
                            batch_first=True,
                            dropout=0.3)

        # Fully connected layer | output layer
        self.output_layer = nn.Linear(self.hidden_dim, 1)

        # Dropout for regularization
        self.dropout = nn.Dropout(0.3)

    def forward(self, inputs, hidden_inputs, batch_size):
        # Pass inputs through the embedding layer
        embed_outputs = self.embedding_layer(inputs)

        # Pass embeddings through the LSTM layer
        lstm_output, hidden_outputs = self.lstm(embed_outputs, hidden_inputs)

        # Reshape LSTM output for the fully connected layer
        lstm_output_cont = lstm_output.contiguous().view(-1, self.hidden_dim)  # Fix typo here

        # Apply dropout
        output = self.dropout(lstm_output_cont)

        # Pass through the output layer (fully connected)
        output = self.output_layer(output)

        # Reshape the output to the correct shape (batch_size, -1)
        output = output.view(batch_size, -1)

        return output, hidden_outputs

In [None]:
input_dim = len(clenaed_vocabulary_) + 1
hidden_dim = 512
embedding_dim = 50
hidden_layers = 3

model = LSTM_Model(input_dim, hidden_dim, embedding_dim, hidden_layers)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fun = nn.MSELoss()

In [None]:
def calculate_r2_score(actual_labels, pred_labels):
    # Convert to float for numerical stability
    actual_labels = actual_labels.float()
    pred_labels = pred_labels.float()

    # Calculate the mean of actual labels
    mean_actual = torch.mean(actual_labels)

    # Sum of Squared Errors (SSE)
    sse = torch.sum((actual_labels - pred_labels) ** 2)

    # Total Sum of Squares (TSS)
    tss = torch.sum((actual_labels - mean_actual) ** 2)

    # Calculate R² score
    r2_score = 1 - sse / tss

    return r2_score

In [None]:
def train_model(model, loss_fun, optimizer, train_loader, nepochs, hidden_layers, batch_size, hidden_dim):
    losses = np.zeros(nepochs)
    r2_scores = np.zeros(nepochs)

    for epoch in range(nepochs):
        batch_losses = []
        batch_scores = []

        model.train()

        for inputs, labels in iter(train_loader):
            # Initialize the hidden state (h0, c0) for each batch
            h0 = torch.zeros(hidden_layers, batch_size, hidden_dim)
            c0 = torch.zeros(hidden_layers, batch_size, hidden_dim)
            h = (h0, c0)

            # Forward pass through the model
            y_hat, h_c_output = model(inputs, h, batch_size)

            # Zero the gradients
            optimizer.zero_grad()

            # Calculate loss and backpropagate
            loss = loss_fun(y_hat, labels.float())  #  y_hat.squeeze() might cause issues if the output tensor has a single dimension
            loss.backward()
            batch_losses.append(loss.item())

            # Update model parameters
            optimizer.step()

            # Calculate R² score for this batch
            r_score = calculate_r2_score(labels, y_hat)
            batch_scores.append(r_score.item())  # Ensure it's converted to a Python scalar

        # Average losses and R² scores across all batches for this epoch
        losses[epoch] = np.mean(batch_losses)
        r2_scores[epoch] = np.mean(batch_scores)

    return losses, r2_scores


In [None]:
nepochs = 50
losses, r_score = train_model(model, loss_fun, optimizer, train_loader, nepochs, hidden_layers, batch_size, hidden_dim)

In [None]:
first_batch = next(iter(train_loader))
inputs, labels = first_batch

print(inputs)
print(labels)


In [None]:
??nn.LSTM

In [95]:
# !pip install --upgrade torch


Collecting torch
  Downloading torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nv