In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import contractions
from bs4 import BeautifulSoup
from sklearn.metrics import precision_recall_fscore_support
from itertools import chain
from nltk.corpus import stopwords

## Approx run time - 2s

In [2]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

## Approx run time - 1s

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
product_dets = pd.read_csv("promapen-train_data.csv")

## Approx run time - 1 min 10 s

In [4]:
product_dets['text1'] = product_dets['name1'] + " " + product_dets['short_description1'] + " " + product_dets['long_description1'] + " " + product_dets['specification1']

In [5]:
product_dets['text2'] = product_dets['name2'] + " " + product_dets['short_description2'] + " " + product_dets['long_description2'] + " " + product_dets['specification2']

In [6]:
def data_clean(s:str):
    s = BeautifulSoup(str(s)).get_text() #removing HTML and XML tags
    s = re.split(" |\,|;|\.|:",s) #splitting on basis of spaces, commas, colon, semi-colon and periods
    s = [contractions.fix(x).lower().strip().split() for x in s] #expanding contractions and changing text to lower case, removing extra spaces
    s = list(chain(*s)) #Flattening out the array

    #Keeping the Alpha Numeric characters and spaces
    s = (" ").join([re.sub(r'[^A-Za-z0-9\s]+', ' ', word) for word in s])

    #removing extra spaces
    s = (" ").join(s.split()).strip()

    return s

In [7]:
product_dets['text1'] = product_dets['text1'].apply(data_clean)
product_dets['text2'] = product_dets['text2'].apply(data_clean)

  s = BeautifulSoup(str(s)).get_text() #removing HTML and XML tags


In [8]:
product_dets['text1'].apply(str).apply(len).mean()

1601.1093247588424

In [9]:
product_dets['text2'].apply(str).apply(len).mean()


1335.8609324758843

In [10]:
def remove_stop(s:str):
    stopw = set(stopwords.words("english")) #creating set of stop words from NLTK
    tokens = nltk.tokenize.word_tokenize(s) # generating tokens
    tokens = [x for x in tokens if x not in stopw] # removing the stop words
    return tokens

In [11]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()


def custom_lemmatize(s:list):
    pos_tagged = nltk.pos_tag(s) #pos tagging with nltk
    #lemmatizing by using the pos tags
    tokens = [wnl.lemmatize(x[0]) for x in pos_tagged] 
    return " ".join(tokens) #returning in sentence format

In [12]:
product_dets['text1'] = product_dets['text1'].apply(remove_stop)
product_dets['text2'] = product_dets['text2'].apply(remove_stop)

In [13]:
product_dets['text1'] = product_dets['text1'].apply(custom_lemmatize)
product_dets['text2'] = product_dets['text2'].apply(custom_lemmatize)

In [14]:
product_dets['text1'].head()

0    bagcraft p057012 12 x 12 grease resistant pape...
1    clorox 35420 128 oz clean disinfectant cleaner...
2    clorox 35420 128 oz clean disinfectant cleaner...
3    2 pack lysol disinfecting wipe lemon lime blos...
4    2 pack lysol disinfecting wipe lemon lime blos...
Name: text1, dtype: object

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer1 = TfidfVectorizer(stop_words='english')

In [16]:
# TF-IDF feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
"""
    1. reducing the size to float 32 to avoid memory issues - dtype = float32
    2. using ngram_range to consider 1 to 4 words together while extracting the features
"""
tfidf = TfidfVectorizer(dtype = np.float32, min_df=2,ngram_range=(1,4))
tfidf = tfidf.fit(product_dets['text1'])
tfidf = tfidf.fit(product_dets['text2'])

tfidf_vectors_text1 = tfidf.transform(product_dets['text1'])
tfidf_vectors_text2 = tfidf.transform(product_dets['text2'])
## Approx run time - 30s

In [17]:
tfidf_vectors_text2[0]

<1x74689 sparse matrix of type '<class 'numpy.float32'>'
	with 164 stored elements in Compressed Sparse Row format>

In [18]:
tfidf_vectors_text1[0]

<1x74689 sparse matrix of type '<class 'numpy.float32'>'
	with 77 stored elements in Compressed Sparse Row format>

In [116]:
np.dot(tfidf_vectors_text2[0].toarray(),tfidf_vectors_text1[0].toarray().T)[0][0]

0.1880286

In [117]:
np.dot(tfidf_vectors_text2[0].toarray(),tfidf_vectors_text1[9].toarray().T)[0][0]

0.09403747

In [110]:
tfidf_vectors_text2[0].toarray()[0]

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [20]:
from scipy.spatial import distance

distance.cosine(tfidf_vectors_text2[0].toarray()[0],tfidf_vectors_text1[3].toarray()[0])

0.9718383252620697

In [59]:
tfidf_vectors_text1.shape

(1244, 74689)

In [102]:
top1 = 0
top5 = 0
for i in range(tfidf_vectors_text1.shape[0]):
    list_dist = np.array([])
    for j in range(tfidf_vectors_text2.shape[0]):
        val = np.dot(tfidf_vectors_text1[i].toarray(),tfidf_vectors_text2[j].toarray().T)[0][0]
        list_dist =np.append(list_dist,val)

    maxI = list_dist.argsort()[::-1][:10]

    if maxI[0] == i:
        top1+=1
        top5+=1
    elif i in maxI:
        top5+=1

In [21]:
top1 = 0
top5 = 0
for i in range(tfidf_vectors_text1.shape[0]):
    list_dist = np.array([])
    for j in range(tfidf_vectors_text2.shape[0]):
        val = distance.cosine(tfidf_vectors_text1[i].toarray()[0],tfidf_vectors_text2[j].toarray()[0])
        list_dist =np.append(list_dist,val)

    maxI = list_dist.argsort()[:10]

    if maxI[0] == i:
        top1+=1
        top5+=1
    elif i in maxI:
        top5+=1

: 

In [103]:
l = tfidf_vectors_text1.shape[0]

print("Correct match - "+ str(top1/l))
print("Top  match 5 - "+ str(top5/l))

Correct match - 0.3215434083601286
Top 5 match - 0.6583601286173634


In [84]:
list_dist.argsort()[::-1][:5]

array([1186,  895,    0,   59,   58], dtype=int64)