In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error







In [2]:

# Read the train and test CSV files from the current working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)
# Bare last expression: renders the training frame as the cell output.
train_data


Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.980000
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.700000
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424000
...,...,...,...,...,...,...
2249693,2422167,Nike Women's As W Ny Df Swsh Hn Kh Bra (CZ7610...,Material : Polyester,,3009,1181.100000
2249694,2766635,"(3PCS) Goose Game Cute Cartoon Enamel Pins, Fu...",[❤ [Inspiration] Inspired by the Untitled Goos...,<p><b>[Brand]: </b>XVIEONR</p> <p><br></p> <p>...,3413,125.984252
2249695,1987786,Kangroo Sweep Movement Printed Wooden Wall Clo...,"[Dial size: 12 inches in diameter,Big, clear r...",Wall Clocks Are Very Attractive In Looks And E...,1574,1200.000000
2249696,1165754,Electro Voice EKX-BRKT15 | Wall Mount Bracket ...,,,592,2900.000000


In [None]:
import nltk


import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Compiled once: every character that is not an ASCII letter is replaced by a space.
_NON_LETTER_RE = re.compile('[^a-zA-Z]')

# Lazily filled cache, so the stopword list is read once per kernel instead of
# once per word and the lemmatizer is built once instead of once per row.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the cached ``(stopword_set, lemmatizer)`` pair, building it on first use."""
    if not _TEXT_TOOLS:
        # Filled in a single update so a failed corpus lookup leaves the cache empty.
        _TEXT_TOOLS.update(
            stopwords=frozenset(stopwords.words('english')),
            lemmatizer=WordNetLemmatizer(),
        )
    return _TEXT_TOOLS['stopwords'], _TEXT_TOOLS['lemmatizer']


def preprocess_text(text):
    """Normalise one free-text field into a space-separated string of lemmas.

    Steps, in order: replace every non-ASCII-letter character with a space,
    lowercase, split on whitespace, drop English stopwords, lemmatize each
    remaining token with ``WordNetLemmatizer`` and join the tokens with single
    spaces.

    Parameters
    ----------
    text : object
        The raw cell value. Anything that is not a ``str`` (for example the
        NaN pandas uses for a missing value) is treated as empty.

    Returns
    -------
    str
        The cleaned text, or ``''`` when the input is not a string or contains
        no letters.

    Notes
    -----
    Needs the NLTK ``stopwords`` and ``wordnet`` resources to be installed; no
    ``nltk.download`` call is visible in this part of the notebook.
    """
    if not isinstance(text, str):  # missing values arrive as float NaN, not str
        return ''

    words = _NON_LETTER_RE.sub(' ', text).lower().split()
    if not words:
        # Nothing to filter or lemmatize; skip loading the NLTK resources.
        return ''

    stop_words, lemmatizer = _get_text_tools()
    # Set membership is O(1); the original rebuilt the stopword list per word.
    return ' '.join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )



# Preprocess the text data
# Each free-text column is overwritten in place with its cleaned version, so
# later cells keep reading the same column names.
TEXT_COLUMNS = ['TITLE', 'DESCRIPTION', 'BULLET_POINTS']

for frame in (train_data, test_data):
    for column in TEXT_COLUMNS:
        frame[column] = frame[column].apply(preprocess_text)

# Only the last bare expression of a cell is rendered, so the train preview was
# silently dropped before; display both previews explicitly.
display(train_data.head(10))
display(test_data.head(10))



In [None]:
# Vocabulary size passed to the tokenizer and fixed length of every padded title.
max_features = 5000
maxlen = 100

train_titles = train_data['TITLE']
test_titles = test_data['TITLE']

# The vocabulary is fitted on the training titles only; '<OOV>' is the
# out-of-vocabulary token handed to the Tokenizer.
tokenizer = Tokenizer(num_words=max_features, oov_token='<OOV>')
tokenizer.fit_on_texts(train_titles)

# Titles -> integer id sequences -> arrays padded/truncated to `maxlen`.
X_train_seq = tokenizer.texts_to_sequences(train_titles)
X_train = pad_sequences(X_train_seq, maxlen=maxlen)

X_test_seq = tokenizer.texts_to_sequences(test_titles)
X_test = pad_sequences(X_test_seq, maxlen=maxlen)

# Regression target.
y_train = train_data['PRODUCT_LENGTH']

In [None]:

# Build and train the TensorFlow model
SEED = 42

# Seed the Python, NumPy and TensorFlow RNGs before any layer is created so
# that weight initialisation, dropout masks and batch shuffling are repeatable
# on a fresh kernel (previously only the train/validation split was seeded).
tf.keras.utils.set_random_seed(SEED)

# Title token ids -> 128-d embeddings -> 64-unit LSTM -> single regression output.
model = Sequential([
    Embedding(max_features, 128, input_length=maxlen),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1)
])

# The loss is the same metric the evaluation cell reports (MAPE).
model.compile(loss='mean_absolute_percentage_error', optimizer='adam')

# Hold out 20% of the training rows for validation.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=SEED
)

history = model.fit(
    X_train_split,
    y_train_split,
    batch_size=32,
    epochs=10,
    validation_data=(X_val_split, y_val_split),
)

In [None]:
# Model evaluation
# Predict on the held-out split and turn the MAPE into a score of
# 100 * (1 - MAPE), floored at 0.
val_outputs = model.predict(X_val_split)
y_val_pred = val_outputs.flatten()
mape = mean_absolute_percentage_error(y_val_split, y_val_pred)
score = max(0, 100 * (1 - mape))
print("Score:", score)

# Model prediction and submission
# One predicted PRODUCT_LENGTH per test row, written next to its PRODUCT_ID.
test_outputs = model.predict(X_test)
y_test_pred = test_outputs.flatten()

submission_columns = {
    'PRODUCT_ID': test_data['PRODUCT_ID'],
    'PRODUCT_LENGTH': y_test_pred,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)