In [2]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("train.csv")

In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2249698 entries, 0 to 2249697
Data columns (total 6 columns):
 #   Column           Dtype  
---  ------           -----  
 0   PRODUCT_ID       int64  
 1   TITLE            object 
 2   BULLET_POINTS    object 
 3   DESCRIPTION      object 
 4   PRODUCT_TYPE_ID  int64  
 5   PRODUCT_LENGTH   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 103.0+ MB


In [4]:
#2 million data is a lot for my pc to handle
#reducing to 100,000

#first check for na values
train_data.isna().sum()

PRODUCT_ID               0
TITLE                   13
BULLET_POINTS       837366
DESCRIPTION        1157382
PRODUCT_TYPE_ID          0
PRODUCT_LENGTH           0
dtype: int64

In [5]:
#dropping all of them
train_data = train_data.dropna()
train_data.isna().sum()

PRODUCT_ID         0
TITLE              0
BULLET_POINTS      0
DESCRIPTION        0
PRODUCT_TYPE_ID    0
PRODUCT_LENGTH     0
dtype: int64

In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1038458 entries, 2 to 2249697
Data columns (total 6 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   PRODUCT_ID       1038458 non-null  int64  
 1   TITLE            1038458 non-null  object 
 2   BULLET_POINTS    1038458 non-null  object 
 3   DESCRIPTION      1038458 non-null  object 
 4   PRODUCT_TYPE_ID  1038458 non-null  int64  
 5   PRODUCT_LENGTH   1038458 non-null  float64
dtypes: float64(1), int64(2), object(3)
memory usage: 55.5+ MB


In [7]:
#1million is still very big
train_data=train_data.iloc[:100000,:]

In [8]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 2 to 216991
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   PRODUCT_ID       100000 non-null  int64  
 1   TITLE            100000 non-null  object 
 2   BULLET_POINTS    100000 non-null  object 
 3   DESCRIPTION      100000 non-null  object 
 4   PRODUCT_TYPE_ID  100000 non-null  int64  
 5   PRODUCT_LENGTH   100000 non-null  float64
dtypes: float64(1), int64(2), object(3)
memory usage: 5.3+ MB


In [9]:
train_data.shape
#400,000 values is feasible

(100000, 6)

In [3]:
#lets tokenize and convert to vectors!
#creating a fxn is betterr
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#we will remove stopwords and also lemmatize them (convert to root form)

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def tokenize(x):
    #lower case
    x = str(x).lower()
    #punctuation 
    x = x.translate(str.maketrans('', '', string.punctuation))
    
    #tokenize
    tokens = word_tokenize(x)

    #stop words
    st = set(stopwords.words('english'))
    tk = [t for t in tokens if not t in st]

    #lemmatize
    le = WordNetLemmatizer()
    le_tk = [le.lemmatize(token) for token in tk]

    return ' '.join(le_tk)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aadit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aadit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aadit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
#lets try 
train_data["text"] = train_data['TITLE'].apply(tokenize) + ' ' + train_data['DESCRIPTION'].apply(tokenize) + ' ' + train_data['BULLET_POINTS'].apply(tokenize)

In [12]:
#lets see
train_data["text"]

2         priknik horn red electric air horn compressor ...
3         alishah woman cotton ankle length legging comb...
5         hin metal bucket shape plant pot indoor outdoo...
7         delavala self adhesive kitchen backsplash wall...
9         hexwell essential oil home fragrance oil aroma...
                                ...                        
216983    skechers woman max cushioning elite sneaker wh...
216985    camel safety match box stick 1 packet 10 piece...
216987    keyboard cover dell km636 wirelessdell kb216 w...
216989    fast charger oppo a1k a1 k 1 k charger adapter...
216991    coversgap samsung galaxy m31s back cover pink ...
Name: text, Length: 100000, dtype: object

In [13]:
#we drop the remaining columns
train_data = train_data.drop(columns=["TITLE","DESCRIPTION","BULLET_POINTS"])
train_data.head()

Unnamed: 0,PRODUCT_ID,PRODUCT_TYPE_ID,PRODUCT_LENGTH,text
2,2765088,7537,748.031495,priknik horn red electric air horn compressor ...
3,1594019,2996,787.401574,alishah woman cotton ankle length legging comb...
5,2152929,5725,950.0,hin metal bucket shape plant pot indoor outdoo...
7,2026580,6030,984.251967,delavala self adhesive kitchen backsplash wall...
9,2998633,8201,393.700787,hexwell essential oil home fragrance oil aroma...


In [14]:
#saving
train_data.to_csv("FINAL_CLEANED.csv",index=False)

In [15]:
#perfecto!
#now we convert our tokenized text into vectors using our tfidf vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=1000)
#if we dont set this max features, there will be a ton of features due to the big data size
X = tfidf.fit_transform(train_data['text'])

In [16]:
#target
y = train_data["PRODUCT_LENGTH"]

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [76]:
#we will use tensorflow for the particular problem

import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.BatchNormalization(), #cos normalisation is just better
    tf.keras.layers.Dense(500,activation='relu',input_shape=[250]),
    tf.keras.layers.Dense(500,activation="relu"),
    tf.keras.layers.Dense(500,activation="relu"),
    tf.keras.layers.Dense(300,activation="relu"),
    tf.keras.layers.Dense(300,activation="relu"),
    tf.keras.layers.Dense(300,activation="relu"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(1), #output layer
])

model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss="mean_absolute_percentage_error",
              metrics=tf.keras.metrics.mean_absolute_percentage_error)

In [None]:
#we need to make X_train and X_test into an array
model.fit(X_train.toarray(),
          y_train,
          epochs=50,
          batch_size=32,
          validation_data=(X_test.toarray(),y_test,))

In [75]:
from sklearn.metrics import mean_absolute_percentage_error
import sklearn.metrics as metrics
y_pred = model.predict(X_test.toarray())
# score = mean_absolute_percentage_error(y_test,y_pred)
score = max( 0 , 100*(1-metrics.mean_absolute_percentage_error(y_test,y_pred)))
print(f"Mean Absolute Percentage Error is : {score}")

Mean Absolute Percentage Error is : 0


In [2]:
import pandas as pd
test_data = pd.read_csv("test.csv")

In [22]:
test_data.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID
0,604373,Manuel d'Héliogravure Et de Photogravure En Re...,,,6142
1,1729783,DCGARING Microfiber Throw Blanket Warm Fuzzy P...,[QUALITY GUARANTEED: Luxury cozy plush polyest...,<b>DCGARING Throw Blanket</b><br><br> <b>Size ...,1622
2,1871949,I-Match Auto Parts Front License Plate Bracket...,"[Front License Plate Bracket Made Of Plastic,D...",Replacement for The Following Vehicles:2020 LE...,7540
3,1107571,PinMart Gold Plated Excellence in Service 1 Ye...,[Available as a single item or bulk packed. Se...,Our Excellence in Service Lapel Pins feature a...,12442
4,624253,"Visual Mathematics, Illustrated by the TI-92 a...",,,6318


In [23]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734736 entries, 0 to 734735
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   PRODUCT_ID       734736 non-null  int64 
 1   TITLE            734731 non-null  object
 2   BULLET_POINTS    458810 non-null  object
 3   DESCRIPTION      354735 non-null  object
 4   PRODUCT_TYPE_ID  734736 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 28.0+ MB


In [4]:
#applying our token fxn 

test_data["text"] = test_data["TITLE"].apply(tokenize) + ' ' + test_data["BULLET_POINTS"].apply(tokenize) + ' ' +test_data["DESCRIPTION"].apply(tokenize)

In [25]:
test_data.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,text
0,604373,Manuel d'Héliogravure Et de Photogravure En Re...,,,6142,manuel dhéliogravure et de photogravure en rel...
1,1729783,DCGARING Microfiber Throw Blanket Warm Fuzzy P...,[QUALITY GUARANTEED: Luxury cozy plush polyest...,<b>DCGARING Throw Blanket</b><br><br> <b>Size ...,1622,dcgaring microfiber throw blanket warm fuzzy p...
2,1871949,I-Match Auto Parts Front License Plate Bracket...,"[Front License Plate Bracket Made Of Plastic,D...",Replacement for The Following Vehicles:2020 LE...,7540,imatch auto part front license plate bracket t...
3,1107571,PinMart Gold Plated Excellence in Service 1 Ye...,[Available as a single item or bulk packed. Se...,Our Excellence in Service Lapel Pins feature a...,12442,pinmart gold plated excellence service 1 year ...
4,624253,"Visual Mathematics, Illustrated by the TI-92 a...",,,6318,visual mathematics illustrated ti92 ti89 nan nan


In [5]:
test_data = test_data.drop(columns=["TITLE","DESCRIPTION","BULLET_POINTS"])
test_data.head()

Unnamed: 0,PRODUCT_ID,PRODUCT_TYPE_ID,text
0,604373,6142,manuel dhéliogravure et de photogravure en rel...
1,1729783,1622,dcgaring microfiber throw blanket warm fuzzy p...
2,1871949,7540,imatch auto part front license plate bracket t...
3,1107571,12442,pinmart gold plated excellence service 1 year ...
4,624253,6318,visual mathematics illustrated ti92 ti89 nan nan


In [27]:
test_data.to_csv("FINAL_CLEANED_TEST.csv")

In [28]:
sample = pd.read_csv("sample_submission.csv")

In [29]:
sample

Unnamed: 0,PRODUCT_ID,PRODUCT_LENGTH
0,604373,701.093794
1,1729783,734.506163
2,1871949,741.360258
3,1107571,730.327767
4,624253,666.847946
...,...,...
734731,921419,733.838809
734732,2456362,746.810825
734733,841529,691.127128
734734,1190194,757.643591


In [8]:
#TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer

tf_for_test_data = TfidfVectorizer(max_features=250)
x_test_data = tf_for_test_data.fit_transform(test_data["text"])

In [11]:
# from keras.models import load_model
# model = load_model("mymodel.h5")

In [None]:
from sklearn.metrics import mean_absolute_percentage_error
y_test_data_pred = model.predict(x_test_data.toarray())
# final_score = max( 0 , 100*(1-metrics.mean_absolute_percentage_error(y_test_data,y_test_data_pred)))
# final_score = mean_absolute_percentage_error(y_test_data,y_test_data_pred)
# print(f"MEAN ABSOLUTE PERCENTAGE ERROR: {final_score}")

In [59]:
df = pd.DataFrame(test_data["PRODUCT_ID"])

# Add a new column with the y_pred values
df["y_pred"] = y_test_data_pred

# Set the column names
df.columns = ["PRODUCT_ID", "PRODUCT_LENGTH"]

In [60]:
df.to_csv("final_submission.csv")