In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Reading data 

In [5]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/Product Length Prediction/train.csv')

df.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424


In [6]:
df.shape

(100000, 6)

## Removing rows with missing values 

In [7]:
# Remove rows where both 'BULLET_POINTS' and 'DESCRIPTION' are empty
df.dropna(subset=['BULLET_POINTS', 'DESCRIPTION'], how='all', inplace=True)

df.shape

(65196, 6)

## Convert data to tokens 

In [8]:
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 


# Define a function to preprocess and tokenize the text
def preprocess_text(text):
    text = str(text)
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if not token in stop_words]
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer() 
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    # Return the preprocessed and tokenized text as a single string
    return ' '.join(lemmatized_tokens)


# Apply the preprocessing function to each column of text and store the union of tokens in a new column
df['TOKENS'] = df['TITLE'].apply(preprocess_text) + ' ' + df['BULLET_POINTS'].apply(preprocess_text) + ' ' + df['DESCRIPTION'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


## Using TF-IDF Encoding to convert tokens into vectors 

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Perform TF-IDF encoding on the tokens column
tfidf = TfidfVectorizer(max_features=250)
x = tfidf.fit_transform(df['TOKENS'])


y = df['PRODUCT_LENGTH']

## Splitting into train and validation data 

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error


x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

## Defining the Neural Network 

In [11]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping


early_stopping = EarlyStopping(
    min_delta=1, # minimium amount of change to count as an improvement
    patience=5, # how many epochs to wait before stopping
    restore_best_weights=True,
)

model = keras.Sequential([
    layers.BatchNormalization(),
    layers.Dense(2048, activation='relu', input_shape=[250]),
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(2048, activation='relu'),
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(2048, activation='relu'),
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(1),
])

# Compile the model
model.compile(loss='mean_absolute_percentage_error', optimizer='adam')

## Fitting the model 

In [12]:
# Train the model
model.fit(x_train.toarray(), y_train, 
          epochs=50, 
          batch_size=256,
          validation_data=(x_val.toarray(), y_val),
          callbacks=[early_stopping],
          )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50


<keras.callbacks.History at 0x7fdc7c0a0ca0>

## Validating the model 

In [13]:
# Make predictions
y_pred = model.predict(x_val.toarray())

# Calculate the mean absolute percentage error on the validation set
mape = mean_absolute_percentage_error(y_val, y_pred)
print("Mean Absolute Percentage Error:", mape)

Mean Absolute Percentage Error: 0.8941021357038041


## Making predictions on test file 

In [16]:
test_df = pd.read_csv("/content/drive/MyDrive/Product Length Prediction/test.csv")

# Apply the preprocessing function to each column of text and store the union of tokens in a new column
test_df['TOKENS'] = test_df['TITLE'].apply(preprocess_text) + ' ' + test_df['BULLET_POINTS'].apply(preprocess_text) + ' ' + test_df['DESCRIPTION'].apply(preprocess_text)

# Perform TF-IDF encoding on the tokens column
tfidf = TfidfVectorizer(max_features=250)
x_test = tfidf.fit_transform(test_df['TOKENS'])

y_test = test_df['PRODUCT_LENGTH']

y_test_pred = model.predict(x_test.toarray())

# Calculate the mean absolute percentage error on the test set
mape = mean_absolute_percentage_error(y_test, y_test_pred)
print("Mean Absolute Percentage Error:", mape)

Mean Absolute Percentage Error: 1.0431448745102752
