In [None]:
import pandas as pd
import csv
import numpy as np

# Load Data

In [None]:
# Read the training CSV with quoting disabled; a backslash is the escape character.
train = pd.read_csv(
    "../input/amazon-ml-challenge-2021-hackerearth/train.csv",
    escapechar="\\",
    quoting=csv.QUOTE_NONE,
)

In [None]:
train.head()

# Get all value counts

In [None]:
train["BROWSE_NODE_ID"].value_counts()

In [None]:
train["BRAND"].value_counts()

In [None]:
train["TITLE"].value_counts()

In [None]:
train["DESCRIPTION"].value_counts()

In [None]:
train.loc[train['DESCRIPTION'] == "NH10 DESIGNS Presents 3D HD Quality Hard Shell Back Covers provides protection to your phone from dust and unnecessary scratches. All designs come in HD and waterproof ink promising uncompromised quality. It has very Precise Cutting of Charging, Mic, Speaker & Headphone Jack etc. NH10 DESIGNS Covers are designer covers which comes in very little quantity almost 5-10 pcs so hurry up.The High Quality Back Cover Protects your phone from Scratches and Bumps. This back cover wraps around the back side of the mobile adding an extra layer of protection over your entire device.Our all covers comes with a 6 months printing warranty(Conditions apply).For any enquiry or customise orders."]

In [None]:
train.loc[train['TITLE'] == "Allen Solly Men's Slim fit Casual Shirt"]

# Check whether TITLE, DESCRIPTION and BULLET_POINTS start with the brand name

In [None]:
# Flag rows whose TITLE text begins with the BRAND text (both compared as str).
train["brand_in_title"] = [
    str(title).startswith(str(brand))
    for title, brand in zip(train["TITLE"], train["BRAND"])
]
train["brand_in_title"].value_counts()

In [None]:
# Flag rows whose DESCRIPTION text begins with the BRAND text (both compared as str).
train["brand_in_description"] = [
    str(description).startswith(str(brand))
    for description, brand in zip(train["DESCRIPTION"], train["BRAND"])
]
train["brand_in_description"].value_counts()

In [None]:
# Flag rows whose BULLET_POINTS text begins with the BRAND text (both compared as str).
train["brand_in_bullets"] = [
    str(bullets).startswith(str(brand))
    for bullets, brand in zip(train["BULLET_POINTS"], train["BRAND"])
]
train["brand_in_bullets"].value_counts()

# Get brand Name as a single word

In [None]:
def get_near(s):
  """Collapse a value into a single whitespace-free token.

  The value is converted with ``str()``, split on whitespace, and the pieces
  are glued back together with no separator (e.g. "Allen Solly" -> "AllenSolly").
  Non-string values are stringified first, so ``None`` yields "None" and NaN yields "nan".
  """
  return "".join(str(s).split())

In [None]:
# Single-token version of every brand (whitespace removed by get_near).
train["BRAND_NAME"] = train["BRAND"].apply(get_near)

In [None]:
train["BRAND_NAME"].value_counts()

# Replace all Brand Names in all columns

In [None]:
# Swap each occurrence of the raw brand inside TITLE for its single-token form.
train["TITLE"] = [
    str(title).replace(str(brand), str(brand_name))
    for title, brand, brand_name in zip(train["TITLE"], train["BRAND"], train["BRAND_NAME"])
]

In [None]:
# Swap each occurrence of the raw brand inside DESCRIPTION for its single-token form.
train["DESCRIPTION"] = [
    str(description).replace(str(brand), str(brand_name))
    for description, brand, brand_name in zip(train["DESCRIPTION"], train["BRAND"], train["BRAND_NAME"])
]

In [None]:
# Swap each occurrence of the raw brand inside BULLET_POINTS for its single-token form.
train["BULLET_POINTS"] = [
    str(bullets).replace(str(brand), str(brand_name))
    for bullets, brand, brand_name in zip(train["BULLET_POINTS"], train["BRAND"], train["BRAND_NAME"])
]

In [None]:
# Keep only the text fields, the single-token brand and the target.
# .copy() makes the subset an independent frame so the column assignments
# that follow do not raise SettingWithCopyWarning.
train = train[["TITLE","DESCRIPTION","BULLET_POINTS","BRAND_NAME","BROWSE_NODE_ID"]].copy()

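# Download the required NLTK data and stop words
Words that are stop words are removed. Open question: lemmatize only the words that are not part of the brand name, or lemmatize all words? (The code below lemmatizes every remaining word.)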

In [None]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
# A set gives constant-time membership checks; preprocess() tests every token against it.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [None]:
nltk.download("wordnet")

In [None]:
def preprocess(s):
  """Lower-case a value, drop stop words, and lemmatize the remaining tokens.

  The value is stringified, lower-cased and split on whitespace. Tokens found
  in the module-level ``stop_words`` are discarded; every other token is passed
  through ``lemmatizer.lemmatize``. The result is re-joined with single spaces.
  """
  tokens = str(s).lower().split()
  kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
  return " ".join(kept)

In [None]:
# Clean each text column in place with preprocess(); the columns are independent of each other.
for text_column in ("TITLE", "DESCRIPTION", "BULLET_POINTS"):
    train[text_column] = train[text_column].apply(preprocess)

In [None]:
train.head()

# Creating Input

In [None]:
# Concatenate the three cleaned text fields into one space-separated string per product.
train["INFO"] = (
    train["TITLE"].astype(str) + " "
    + train["DESCRIPTION"].astype(str) + " "
    + train["BULLET_POINTS"].astype(str)
)
# .copy() makes the two-column frame independent, so assigning to its columns
# later does not raise SettingWithCopyWarning.
train = train[["INFO","BROWSE_NODE_ID"]].copy()

In [None]:
train.head()

# Model Building and Label Encoder

In [None]:
# One whitespace-tokenised word list per product, the input format used for Word2Vec below.
documents = list(map(str.split, train["INFO"]))

In [None]:
import gensim

In [None]:
# Create an untrained Word2Vec model with window=4, min_count=4 and 8 workers.
# NOTE(review): no vector size is passed, so gensim's default is used, while the
# embedding matrix built later hard-codes 100 dimensions — confirm these match.
w2v_model = gensim.models.word2vec.Word2Vec(window=4, 
                                            min_count=4, 
                                            workers=8)

In [None]:
w2v_model.build_vocab(documents)

In [None]:
w2v_model.train(documents, total_examples=len(documents), epochs=50)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.preprocessing import LabelEncoder

In [None]:
# Build the word -> integer index from the combined product text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train["INFO"])

# One extra slot on top of the indexed words.
# presumably index 0 is reserved for padding — confirm against the Keras Tokenizer docs
vocab_size = 1 + len(tokenizer.word_index)
print("Total words", vocab_size)

In [None]:
# Fixed sequence length; must equal the Embedding layer's input_length (300).
SEQUENCE_LENGTH = 300
# pad_sequences returns a 2-D array, which cannot be assigned to a single
# column; list() stores one padded 1-D sequence per row instead.
train["INFO"] = list(pad_sequences(tokenizer.texts_to_sequences(train["INFO"]), maxlen=SEQUENCE_LENGTH))

In [None]:
encoder = LabelEncoder()
# Fit on the target labels: this encoder is used to transform BROWSE_NODE_ID,
# so it must learn the browse-node classes, not the INFO text.
encoder.fit(train["BROWSE_NODE_ID"].tolist())

In [None]:
# Replace the raw browse-node ids with the encoder's integer class codes.
encoded_labels = encoder.transform(train["BROWSE_NODE_ID"].tolist())
train["BROWSE_NODE_ID"] = encoded_labels

In [None]:
# Row i holds the Word2Vec vector of the tokenizer word with index i;
# words missing from the Word2Vec vocabulary keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, 100))
word_vectors = w2v_model.wv
for token, row_index in tokenizer.word_index.items():
    if token in word_vectors:
        embedding_matrix[row_index] = word_vectors[token]
print(embedding_matrix.shape)

In [None]:
del w2v_model

In [None]:
# Embedding layer initialised from embedding_matrix and frozen (trainable=False);
# it is declared for input sequences of length 300 and 100-dimensional vectors.
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=300, trainable=False)

In [None]:
model = Sequential()
model.add(embedding_layer)
model.add(Dropout(0.5))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='softmax'))

model.summary()

In [None]:
del embedding_layer

In [None]:
del documents

In [None]:
model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

In [None]:
# Lower the learning rate when validation loss plateaus, and stop early when
# validation accuracy stops improving.
# With metrics=['accuracy'] Keras >= 2.3 / tf.keras logs the validation metric
# as 'val_accuracy'; monitoring 'val_acc' would never find a value there.
callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5)]

In [None]:
# Keras needs a 2-D (samples, sequence_length) array rather than a Series of
# per-row sequences, so stack the INFO column first.
# assumes each INFO cell holds one equal-length padded sequence — TODO confirm
x_train = np.stack(train["INFO"].to_numpy())
y_train = train["BROWSE_NODE_ID"].to_numpy()
# The bare positional `100` after a keyword argument was a SyntaxError;
# it is passed as the number of epochs.
history = model.fit(x_train, y_train,
                    batch_size=1703,
                    epochs=100,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)

In [None]:
del train

# Evaluate

In [None]:
# Read the test CSV with the same parsing options as the training file.
test = pd.read_csv(
    "../input/amazon-ml-challenge-2021-hackerearth/test.csv",
    escapechar="\\",
    quoting=csv.QUOTE_NONE,
)

In [None]:
test.head()

In [None]:
# Derive the single-token brand from the test rows themselves. The original
# read from `train`, which is the wrong frame and has already been deleted.
test["BRAND_NAME"]=test.apply(lambda row: get_near(row.BRAND), axis=1)

In [None]:
# Swap each occurrence of the raw brand inside TITLE for its single-token form.
test["TITLE"] = [
    str(title).replace(str(brand), str(brand_name))
    for title, brand, brand_name in zip(test["TITLE"], test["BRAND"], test["BRAND_NAME"])
]

In [None]:
# Swap each occurrence of the raw brand inside DESCRIPTION for its single-token form.
test["DESCRIPTION"] = [
    str(description).replace(str(brand), str(brand_name))
    for description, brand, brand_name in zip(test["DESCRIPTION"], test["BRAND"], test["BRAND_NAME"])
]

In [None]:
# Swap each occurrence of the raw brand inside BULLET_POINTS for its single-token form.
test["BULLET_POINTS"] = [
    str(bullets).replace(str(brand), str(brand_name))
    for bullets, brand, brand_name in zip(test["BULLET_POINTS"], test["BRAND"], test["BRAND_NAME"])
]

In [None]:
# Keep the id, the text fields and the single-token brand.
# .copy() makes the subset an independent frame so the column assignments
# that follow do not raise SettingWithCopyWarning.
test = test[["PRODUCT_ID","TITLE","DESCRIPTION","BULLET_POINTS","BRAND_NAME"]].copy()

In [None]:
# Clean each text column in place with preprocess(); the columns are independent of each other.
for text_column in ("TITLE", "DESCRIPTION", "BULLET_POINTS"):
    test[text_column] = test[text_column].apply(preprocess)

In [None]:
test.head()

In [None]:
# Concatenate the three cleaned text fields into one space-separated string per product.
test["INFO"] = (
    test["TITLE"].astype(str) + " "
    + test["DESCRIPTION"].astype(str) + " "
    + test["BULLET_POINTS"].astype(str)
)
# .copy() makes the two-column frame independent, so assigning to its columns
# later does not raise SettingWithCopyWarning.
test = test[["PRODUCT_ID","INFO"]].copy()

In [None]:
test.head()

In [None]:
# Same fixed length the Embedding layer was built with (input_length=300).
SEQUENCE_LENGTH = 300
# pad_sequences returns a 2-D array, which cannot be assigned to a single
# column; list() stores one padded 1-D sequence per row instead.
test["INFO"] = list(pad_sequences(tokenizer.texts_to_sequences(test["INFO"]), maxlen=SEQUENCE_LENGTH))

In [None]:
test.BROWSE_NODE_ID = encoder.transform(test.BROWSE_NODE_ID.tolist())

In [None]:
test.BROWSE_NODE_ID = test.BROWSE_NODE_ID.reshape(-1,1)

In [None]:
score = model.evaluate(test.INFO, test.BROWSE_NODE_ID, batch_size=BATCH_SIZE)
print()
print("ACCURACY:",score[1])
print("LOSS:",score[0])

In [None]:
submission = pd.DataFrame(columns = ["PRODUCT_ID","BROWSE_NODE_ID"])

In [None]:
def predictor(row):
  """Predict the browse node for one test row and record it in ``submission``.

  Reads ``row.INFO`` and ``row.PRODUCT_ID``, runs the module-level ``model`` on
  the single sequence, decodes the winning class with ``encoder`` and writes a
  new row into the module-level ``submission`` frame. Returns None.

  NOTE(review): this predicts one row per call; predicting the whole test set
  in one batch would be much faster.
  """
  # model.predict expects a batch, so reshape the single sequence to (1, sequence_length).
  # assumes row.INFO is one padded integer sequence — TODO confirm
  batch = np.asarray(row.INFO).reshape(1, -1)
  probabilities = model.predict(batch)
  class_index = np.argmax(probabilities, axis=1)
  # The model was trained on encoded labels; map the index back to the original BROWSE_NODE_ID.
  node_id = encoder.inverse_transform(class_index)[0]
  # DataFrame.append returns a new frame (and rejects a dict unless ignore_index=True),
  # so the original left `submission` empty; write the row in place instead.
  submission.loc[len(submission)] = [row.PRODUCT_ID, node_id]

In [None]:
# axis=1 passes each row to predictor; the default axis=0 passes whole columns,
# which have no INFO / PRODUCT_ID attributes.
test.apply(lambda row: predictor(row), axis=1)

In [None]:
# index=False keeps the file to the two declared columns (PRODUCT_ID, BROWSE_NODE_ID)
# instead of adding an unnamed index column.
submission.to_csv("./submission.csv", index=False)

I could not complete training and testing due to a lack of resources. If anyone with enough resources runs this notebook, please let me know the results.
Thank you