# NLP model to match product description to the right product HScode
This model is all about matching product descriptions to the correct Harmonized System (HS) code.


# Importing Necessary Libraries

In [6]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Importing the Dataset
- The dataset consists of product descriptions for cars, bikes and watches.
- It is a custom dataset created for this task.

In [23]:
# Load the labelled dataset (columns: 'product description' and 'Hscode').
# A forward slash is used instead of a backslash: '\d' is not a valid string
# escape sequence, and the forward-slash form resolves on Windows, Linux and macOS.
data = pd.read_csv('Data/data.csv')
print(data)

                                   product description  Hscode
0    A sleek black phone featuring a 6.5-inch displ...  851712
1    Compact yet powerful, this white mobile boasts...  851712
2    Experience lightning-fast performance with thi...  851712
3    Experience lightning-fast performance with thi...  851712
4    Designed for the on-the-go professional, this ...  851712
..                                                 ...     ...
895  This stylish red cruiser bike includes wide ti...  871190
896  With a modern white design and a 27-speed gear...  871190
897  The sleek black folding bike features a compac...  871190
898  This vibrant blue road bike includes a lightwe...  871190
899  With a stylish green hybrid bike and front sus...  871190

[900 rows x 2 columns]


# Data Pre-Processing
- Removing unnecessary (non-alphabetic) characters
- Lowercasing the text
- Removing the stopwords
- Creating a corpus by using lemmatization

In [24]:
from nltk.stem import WordNetLemmatizer

wordnet = WordNetLemmatizer()
# Build the stopword set once up front; the original rebuilt it for every
# single word, which repeated the same work thousands of times.
english_stopwords = set(stopwords.words('english'))
corpus_lemmatization = []

for description in data['product description']:
    # Replace every non-letter with a space, then lowercase and tokenise.
    review = re.sub(r'[^a-zA-Z]', ' ', description)
    review = review.lower()
    review = review.split()

    # Drop stopwords and reduce each remaining word to its WordNet lemma.
    review = [
        wordnet.lemmatize(word)
        for word in review
        if word not in english_stopwords
    ]
    # Store the cleaned description as a single space-separated string.
    corpus_lemmatization.append(' '.join(review))

In [25]:
# Inspect the cleaned, lemmatized descriptions (one string per dataset row).
print(corpus_lemmatization)

['sleek black phone featuring inch display powerful mah battery perfect gaming binge watching favorite show', 'compact yet powerful white mobile boast g connectivity crystal clear inch screen ideal video call social medium', 'experience lightning fast performance blue inch mobile equipped quad camera system fast charging capability', 'experience lightning fast performance blue inch mobile equipped quad camera system fast charging capability', 'designed go professional green mobile offer dual sim support inch display face recognition added security', 'black mobile phone inch display fingerprint sensor perfect capturing life moment quad camera setup', 'stylish white phone mah battery ideal work play featuring wireless charging water resistant design', 'redefine mobile experience blue device featuring high resolution screen fast charging seamless multitasking capability', 'ultimate mobile streaming enthusiast red inch phone come mah battery stereo speaker immersive audio', 'rugged mobile 

# TF-IDF 
- TF-IDF is used to extract informative features from the text.
- Words are transformed into numeric vectors by the text vectorization process.

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the cleaned corpus, then expand
# the resulting TF-IDF matrix into a dense array for the classifier.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so held-out rows influence the IDF weights — confirm
# this is acceptable for the reported accuracy.
cv = TfidfVectorizer()
tfidf_matrix = cv.fit_transform(corpus_lemmatization)
X = tfidf_matrix.toarray()

In [27]:
# Target labels: the HS code assigned to each product description.
y = data['Hscode']

# Splitting the data

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
split = train_test_split(X, y, test_size=0.20, random_state=0)
X_train, X_test, y_train, y_test = split

# Multinomial NaiveBayes model

In [29]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the TF-IDF training features.
classifier = MultinomialNB()
model = classifier.fit(X_train, y_train)

In [30]:
# Predict an HS code for every held-out description.
y_pred=model.predict(X_test)

In [31]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Compare the held-out labels with the predictions: a per-class breakdown
# (confusion matrix) plus the overall fraction of correct predictions.
c_m = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("confusion matrix:\n", c_m)
print("Accuracy score:\n", accuracy)

confusion matrix:
 [[59  0  0]
 [ 0 64  0]
 [ 0  0 57]]
Accuracy score:
 1.0


# Saving the Models 

In [None]:
import joblib

# Persist both artefacts: the classifier and the fitted vectorizer must be
# saved together, because new text has to be transformed with the same
# vocabulary the classifier was trained on.
model_path = 'multinomial_nb_model.pkl'
vectorizer_path = 'tfidf_vectorizer.pkl'

joblib.dump(model, model_path)
joblib.dump(cv, vectorizer_path)


In [2]:

# Import here so this cell also works in a fresh kernel where the saving
# cell (which holds the only other joblib import) has not been run.
import joblib

# Reload the persisted classifier and its matching TF-IDF vectorizer.
# NOTE: joblib files are pickle-based — only load files you created or trust.
loaded_model = joblib.load('multinomial_nb_model.pkl')
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')


# Test the model with the product descriptions
The model is tested with real Amazon product descriptions, and the predictions are correct and satisfactory.

In [10]:

new_sentence = "Specifications: 123 kms range in single charge; 63 kmph top speed (Unlock 3 ride modes with TecPac: 2 Forward - Eco and Sports and 1 Reverse)"

# Apply the same cleaning pipeline that produced the training corpus:
# letters only -> lowercase -> tokenise -> drop stopwords -> lemmatize.
new_sentence = re.sub(r'[^a-zA-Z]', ' ', new_sentence)
# Lowercase before filtering, exactly as the training text was. Without this,
# capitalised stopwords slip past the stopword filter and capitalised words
# may be lemmatized differently from their training-corpus form.
new_sentence = new_sentence.lower()
new_sentence = new_sentence.split()

wordnet = WordNetLemmatizer()
# Build the stopword set once instead of once per word.
english_stopwords = set(stopwords.words('english'))
new_sentence = [
    wordnet.lemmatize(word)
    for word in new_sentence
    if word not in english_stopwords
]
new_sentence = ' '.join(new_sentence)

# Vectorise with the vocabulary learned at training time.
new_sentence_transformed = loaded_vectorizer.transform([new_sentence]).toarray()

prediction = loaded_model.predict(new_sentence_transformed)
# Confidence of the prediction = the highest class probability.
prediction_prob = loaded_model.predict_proba(new_sentence_transformed).max()

# At or below 50% confidence, report the prediction only as the closest match.
if prediction_prob <= 0.5:
    print("Cannot find the product you are searching for,the similar product that has similar features to your description is", prediction[0])
else:
    print(prediction[0])



871190
