# NLP model to match product description to the right product HScode
This model all about matching product descriptions to the correct Harmonized System(HS) code.


# Importing Necessary Libraries

In [2]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Importing the Dataset

In [3]:
data=pd.read_csv('Data\product_hs_code_data.csv')
data.head()

Unnamed: 0,description,hs_code
0,"blue watch, size small, battery AA",910211
1,"blue bike, size small, model hybrid",871190
2,"blue clothes, size S, fabric polyester",620293
3,"blue clothes, size M, fabric wool",620293
4,"red car model 2023, size truck",870320


# Data Pre-Processing

In [4]:
from nltk.stem import WordNetLemmatizer
wordnet=WordNetLemmatizer()
corpus_lemmatization=[]

for i in range(len(data)):
    review=re.sub('[^a-zA-Z]',' ',data['description'][i])
    #review=review.lower()
    review=review.split()

    review=[wordnet.lemmatize(word) for word in review if not word in set (stopwords.words('english'))]
    review=' '.join(review)
    corpus_lemmatization.append(review)

In [10]:
print(corpus_lemmatization)

['blue watch size small battery AA', 'blue bike size small model hybrid', 'blue clothes size S fabric polyester', 'blue clothes size M fabric wool', 'red car model size truck', 'black watch size medium battery rechargeable', 'blue bike size medium model road', 'blue watch size small battery AAA', 'silver watch size medium battery rechargeable', 'red bike size large model road', 'black bike size medium model road', 'blue watch size medium battery rechargeable', 'red car model size SUV', 'blue car model size truck', 'black mobile size inch battery mAh compatibility G', 'red mobile size inch battery mAh compatibility G', 'green clothes size XL fabric cotton', 'white mobile size inch battery mAh compatibility G', 'green clothes size L fabric polyester', 'green clothes size M fabric wool', 'red mobile size inch battery mAh compatibility G', 'black clothes size S fabric wool', 'red bike size large model mountain', 'green bike size medium model hybrid', 'green clothes size L fabric cotton', '

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
X = cv.fit_transform(corpus_lemmatization).toarray()

In [6]:
y=data.hs_code

In [7]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=0)

In [8]:
from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB().fit(X_train,y_train)

In [9]:
y_pred=model.predict(X_test)

In [10]:
from sklearn.metrics import confusion_matrix
c_m=confusion_matrix(y_test,y_pred)
print("confusion matrix:\n", c_m)

from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_test,y_pred)
print("Accuracy score:\n",accuracy)

confusion matrix:
 [[123   0   0   0   0]
 [  0 113   0   0   0]
 [  0   0 123   0   0]
 [  0   0   0 123   0]
 [  0   0   0   0 118]]
Accuracy score:
 1.0


In [18]:
import joblib

# Save the trained model
joblib.dump(model, 'multinomial_nb_model.pkl')

# Save the fitted TfidfVectorizer
joblib.dump(cv, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [19]:
# Load the trained model and vectorizer
loaded_model = joblib.load('multinomial_nb_model.pkl')
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')


In [22]:

new_sentence = "i need a device to check my time which is small and have good battery"


new_sentence = re.sub('[^a-zA-Z]', ' ', new_sentence)
new_sentence = new_sentence.split()

wordnet = WordNetLemmatizer()
new_sentence = [wordnet.lemmatize(word) for word in new_sentence if not word in set(stopwords.words('english'))]
new_sentence = ' '.join(new_sentence)

new_sentence_transformed = loaded_vectorizer.transform([new_sentence]).toarray()

prediction = loaded_model.predict(new_sentence_transformed)

print("Predicted HS code:", prediction[0])


Predicted HS code: 910211
