# Feaure Engineering - Text Processing

* Process the textual data
* Combine the hash similarities of the images (Feature-Engineering-Part-1)
* Save the features to CSV files in the "features/ProMapEn/" path


## 1. Modules and Libraries

In [1]:
import re
import pandas as pd
import numpy as np
import contractions

import nltk
nltk.download('words')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import words

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Average runtime - 1s

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 2. Load data

* Combined the train and test for feature extraction

In [2]:
promapen_train = pd.read_csv("datasets\\ProMapEn\\promapen-train_data.csv")
promapen_test = pd.read_csv("datasets\\ProMapEn\\promapen-test_data.csv")

promapen = pd.concat([promapen_train, promapen_test], ignore_index=True)
promapen = promapen.fillna(" ")

print(promapen.shape)
promapen.head(3)

## Average runtime - 1s

(1555, 21)


Unnamed: 0,name1,short_description1,long_description1,specification1,image1,price1,id1,name2,short_description2,long_description2,...,image2,price2,id2,match,image_url1,image_url2,category,match_type,specification_text1,specification_text2
0,Bagcraft P057012 12 x 12 Grease-Resistant Pape...,"Excellent low-cost, low-waste alternative to p...","Wrap/liner is an excellent low-cost, low-waste...","[{""key"": ""Features"", ""value"": ""Excellent low-c...",3,131.59,https://walmart.com/ip/Bagcraft-P057012-12-x-1...,Bagcraft Papercon 012008 Interfolded Heavy Dry...,"Provides wet strength, improved moisture resis...",Bagcraft interfolded heavy dry wax deli paper....,...,1,135.1,https://www.amazon.com/dp/B00C7KTHHI,0,"[""https://i5.walmartimages.com/asr/8f9b23a7-f4...","[""https://m.media-amazon.com/images/I/51VDhs3N...",6_household,medium_nonmatch,"Features Excellent low-cost, low-waste alterna...",Brand Name Bagcraft Papercon Global Trade Iden...
1,Clorox 35420 128 oz. Clean-Up Disinfectant Cle...,Removes stains and disinfects to kill 99.9% of...,Clorox Clean-Up CloroxPro Disinfectant Cleaner...,"[{""key"": ""Assembled Product Weight"", ""value"": ...",5,61.38,https://walmart.com/ip/Clorox-35420-128-oz-Cle...,CloroxPro Anywhere Daily Disinfectant and Sani...,NO-RINSE FOOD CONTACT SANITIZER: Confidently s...,CloroxPro Anywhere Daily Disinfectant and Sani...,...,1,,https://www.amazon.com/dp/B07FQRB2XV,0,"[""https://i5.walmartimages.com/asr/3336afe6-d5...","[""https://m.media-amazon.com/images/I/71f6nNyY...",6_household,close_nonmatch,Assembled Product Weight 37.4 lb Brand Clorox ...,
2,Clorox 35420 128 oz. Clean-Up Disinfectant Cle...,Removes stains and disinfects to kill 99.9% of...,Clorox Clean-Up CloroxPro Disinfectant Cleaner...,"[{""key"": ""Assembled Product Weight"", ""value"": ...",5,61.38,https://walmart.com/ip/Clorox-35420-128-oz-Cle...,CLOROXPRO Commercial Solutions CLOROXPRO Clean...,DISINFECTANT SPRAY: Use this Clorox Clean-Up D...,Clorox Clean-Up Disinfectant Cleaner with Blea...,...,1,83.6,https://www.amazon.com/dp/B004EHZ7GW,0,"[""https://i5.walmartimages.com/asr/3336afe6-d5...","[""https://m.media-amazon.com/images/I/81+djgUF...",6_household,medium_nonmatch,Assembled Product Weight 37.4 lb Brand Clorox ...,


## 3. Data Cleaning

In [3]:
class Preprocess:
    """Preprocess the text"""
    
    def __init__(self, dataframe=None):
        """
        Preprocess the data to extract features
        Args:
            dataframe (pd.Dataframe): data to be processed 
        """
        self.dataframe = dataframe
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = stopwords.words('english')
    
    def process_brand_names(self, column):
        """
        Extract the brand names from the specified column
        Args:
            column (str): name of the column
        
        Return:
            list: brand names
        """
        return [
            item["value"].lower()
            for row in self.dataframe[column]
            for item in eval(row)
            if item["key"]=="Brand" 
        ]
    
    def process_text_column(self, column):
        """
        Process the text from the specified column
        Args:
            column (str): name of the column
        
        Return:
            list: cleaned data
        """
        
        processed_values = []
        
        for text in self.dataframe[column]:
            text = contractions.fix(text).lower() 
            text = re.sub(r"[^\w\s]", " ", text) # remove useless characters
            text = re.sub(r"(\d)([A-Za-z])", r"\1 \2", text) # separate units and values

            words = [word.strip() for word in word_tokenize(text)] 
            words = [word for word in words if word not in self.stopwords] # remove stopwords
            words = [self.lemmatizer.lemmatize(word.lower()) for word in words] # lemmatize
            
            processed_values.append(" ".join(words))
        
        return processed_values

    def process_specification(self, column):
        """
        Process the specification column of the products
        Args:
            column (str): name of the column
            
        Return:
            list: string format after evaluating the row
        """
        return [
            ' '.join([f"{item['key']} {item['value']}" for item in eval(row)])
            for row in self.dataframe[column]
        ]

## Average runtime - 1s

### 3.1 Process the dataframe

* Store only the necessary columns 

In [4]:
text_processor = Preprocess(promapen)
processed_df = pd.DataFrame()

# process name, short description and long description
processed_df["name1"] = text_processor.process_text_column("name1")
processed_df["short_description1"] = text_processor.process_text_column("short_description1")
processed_df["long_description1"] = text_processor.process_text_column("long_description1")

processed_df["name2"] = text_processor.process_text_column("name2")
processed_df["short_description2"] = text_processor.process_text_column("short_description2")
processed_df["long_description2"] = text_processor.process_text_column("long_description2")

# process the specifications column
processed_df["specification1"] = text_processor.process_specification("specification1")
processed_df["specification2"] = text_processor.process_specification("specification2")

# add all_texts column by combining values from name, short description and long description
processed_df["all_texts1"] = processed_df.apply(lambda row: " ".join([row['name1'], row['short_description1'], row["long_description2"], row["specification1"]]), axis=1)
processed_df["all_texts2"] = processed_df.apply(lambda row: " ".join([row['name2'], row['short_description2'], row["long_description2"], row["specification2"]]), axis=1)

# add original specifications
processed_df["orig_specification1"] = promapen["specification1"]
processed_df["orig_specification2"] = promapen["specification2"]

# attach the match_type column
processed_df["match"] = promapen["match"]

## Average runtime - 8s

In [5]:
print(processed_df.shape)
processed_df.head(3)

## Average runtime - 1s

(1555, 13)


Unnamed: 0,name1,short_description1,long_description1,name2,short_description2,long_description2,specification1,specification2,all_texts1,all_texts2,orig_specification1,orig_specification2,match
0,bagcraft p057012 12 x 12 grease resistant pape...,excellent low cost low waste alternative paper...,wrap liner excellent low cost low waste altern...,bagcraft papercon 012008 interfolded heavy dry...,provides wet strength improved moisture resist...,bagcraft interfolded heavy dry wax deli paper ...,"Features Excellent low-cost, low-waste alterna...",Brand Name Bagcraft Papercon Global Trade Iden...,bagcraft p057012 12 x 12 grease resistant pape...,bagcraft papercon 012008 interfolded heavy dry...,"[{""key"": ""Features"", ""value"": ""Excellent low-c...","[{""key"": ""Brand Name"", ""value"": ""Bagcraft Pape...",0
1,clorox 35420 128 oz clean disinfectant cleaner...,remove stain disinfects kill 99 9 virus bacter...,clorox clean cloroxpro disinfectant cleaner bl...,cloroxpro anywhere daily disinfectant sanitizi...,rinse food contact sanitizer confidently sanit...,cloroxpro anywhere daily disinfectant sanitizi...,Assembled Product Weight 37.4 lb Brand Clorox ...,,clorox 35420 128 oz clean disinfectant cleaner...,cloroxpro anywhere daily disinfectant sanitizi...,"[{""key"": ""Assembled Product Weight"", ""value"": ...",[],0
2,clorox 35420 128 oz clean disinfectant cleaner...,remove stain disinfects kill 99 9 virus bacter...,clorox clean cloroxpro disinfectant cleaner bl...,cloroxpro commercial solution cloroxpro clean ...,disinfectant spray use clorox clean disinfecta...,clorox clean disinfectant cleaner bleach power...,Assembled Product Weight 37.4 lb Brand Clorox ...,,clorox 35420 128 oz clean disinfectant cleaner...,cloroxpro commercial solution cloroxpro clean ...,"[{""key"": ""Assembled Product Weight"", ""value"": ...",[],0


In [6]:
processed_df.to_parquet('Processed.parquet')
## Read when required
# processed_df = pd.read_parquet('Processed.parquet')

## Average runtime - 1s

## 4. Feature Extraction

### 4.1 Text similarity computations

* Calculate cosine similarity between the textual information of 2 products
* Extracted 4 features: name_cos, short_description_cos, long_description_cos, all_texts_cos

In [7]:
class TFIDF_vectorizer:
    
    
    def __init__(self,dataframe_column) -> None:
        """
        Fitting the columns into TfidfVectorizer
        Args:
            series (pd.Series): data to be fitted 
        """
        self.vectorizer = TfidfVectorizer().fit(dataframe_column)
    
    def transformer(self, text1, text2) -> list:
        """
        Transform the text into tfidf vectors
        Args:
            text1 (str): column 1 data
            text2 (str): column 2 data
        """
        return [self.vectorizer.transform([text1]),self.vectorizer.transform([text2])]
    
    def calculate_cosine_similarity(self, text1, text2):
        """
        Calculates the cosine distance between the text vectors
        Args:
            text1 (str): column 1 data
            text2 (str): column 2 data
        """
        try:
            tfidf_matrix = self.transformer(text1, text2)
            return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
        except:
            return 0

## Average runtime - 1s

In [8]:
similarity_class = TFIDF_vectorizer(pd.concat([processed_df['name1'],
                    processed_df['name2'],
                    processed_df['short_description1'],
                    processed_df['short_description2'],
                    processed_df['long_description1'],
                    processed_df['long_description2']],ignore_index=True))

features_df = pd.DataFrame()

# Calculate cosine similarity between name, short and long description columns of 2 products
features_df['name_cos'] = processed_df.apply(lambda row: similarity_class.calculate_cosine_similarity(row['name1'], row['name2']), axis=1)
features_df['short_description_cos'] = processed_df.apply(lambda row: similarity_class.calculate_cosine_similarity(row['short_description1'], row['short_description2']), axis=1)
features_df['long_description_cos'] = processed_df.apply(lambda row: similarity_class.calculate_cosine_similarity(row['long_description1'], row['long_description2']), axis=1)
features_df['all_texts_cos'] = processed_df.apply(lambda row: similarity_class.calculate_cosine_similarity(row['all_texts1'], row['all_texts2']), axis=1)

## Average runtime - 35s

## 4.2 Keyword Detection and Similarity Calculation

In [9]:
def jaccard_sim(set1, set2):
    """
    Jaccard similarity between two sets of keywords
    Args:
        set1 (list): words/tokens
        set2 (list): words/tokens
    
    Return:
        float: jaccard similarity
    """
    
    if not isinstance(set1, set):
        set1 = set(set1)
    
    if not isinstance(set2, set):
        set2 = set(set2)
    
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    
    return intersection / union if union != 0 else 0

## Average runtime - 1s

### 4.2.1 ID Detection

* Selecting unique words longer than five characters that are not included in English vocab of ParaCrawl dataset
* Extracted 3 features: name_id, short_description_id, all_texts_id

Paracrawl:
* Data dimensions: 9 GB, 50,632,000 lines
* Created english vocab with 4,400,347 unique tokens of length more than 5 
* Used english vocab from NLTK to deal with resource shortage for computations 

In [10]:
# Vocab from paracrawl dataset
# Run paracrawl.py file separately to create list of tokens
english_vocab = open("features/english_words.txt", encoding='utf-8').read().splitlines()
english_vocab = [word.lower() for word in english_vocab if len(word)>5]
print(f"Number of tokens in Paracrawl English vocab: {len(english_vocab)}")

## Average runtime - 2s

Number of tokens in Paracrawl English vocab: 4400347


In [11]:
# NLTK english corpus for computational efficiency
english_vocab = [word for word in words.words() if len(word)>5]

def detect_unique_ids(text, vocab=english_vocab):
    """
    Detect unique words longer than five characters not in the vocabulary
    Args:
        text (str): string containing product information
        vocab (list): list of tokens
    
    Return:
        list: words that are not part of english vocab
    """

    words = re.findall(r'\b[\w\-.,\'!$&*]{6,}\b', text.lower())
    return [word for word in words if word not in vocab]

def calculate_id_detection(text1, text2, vocab=english_vocab):
    """
    Calculate ID detection keywords similarity between two products 
    Args:
        text1 (str): string containing product information
        text2 (str): string containing product information
        vocab (list): list of tokens
        
    Return:
        int: jaccard similarity between the IDs
    """
    set1 = detect_unique_ids(text1, vocab)
    set2 = detect_unique_ids(text2, vocab)
    
    return jaccard_sim(set1, set2)

def append_to_set(set_i,list_i):
    set_i.update(list_i)
    return set_i
## Average runtime - 1s

In [12]:
# Calculate cosine similarity between name, short and long description columns of 2 products
features_df['name_id'] = processed_df.apply(lambda row: calculate_id_detection(row['name1'], row['name2']), axis=1)
features_df['short_description_id'] = processed_df.apply(lambda row: calculate_id_detection(row['short_description1'], row['short_description2']), axis=1)
features_df['all_texts_id'] = processed_df.apply(lambda row: calculate_id_detection(row['all_texts1'], row['all_texts2']), axis=1)

## Average runtime - 15m

In [None]:
## Generating list of all ids for both the products from all the attributes
processed_df['ids_list1'] = processed_df.apply(lambda row: append_to_set(row.get('ids_list1',set()),detect_unique_ids(row['name1'])),axis=1)
processed_df['ids_list1'] = processed_df.apply(lambda row: append_to_set(row.get('ids_list1',set()),detect_unique_ids(row['short_description1'])),axis=1)
processed_df['ids_list1'] = processed_df.apply(lambda row: append_to_set(row.get('ids_list1',set()),detect_unique_ids(row['long_description1'])),axis=1)
processed_df['ids_list1'] = processed_df.apply(lambda row: append_to_set(row.get('ids_list1',set()),detect_unique_ids(row['all_texts1'])),axis=1)
processed_df['ids_list2'] = processed_df.apply(lambda row: append_to_set(row.get('ids_list2',set()),detect_unique_ids(row['name2'])),axis=1)
processed_df['ids_list2'] = processed_df.apply(lambda row: append_to_set(row.get('ids_list2',set()),detect_unique_ids(row['short_description2'])),axis=1)
processed_df['ids_list2'] = processed_df.apply(lambda row: append_to_set(row.get('ids_list2',set()),detect_unique_ids(row['long_description2'])),axis=1)
processed_df['ids_list2'] = processed_df.apply(lambda row: append_to_set(row.get('ids_list2',set()),detect_unique_ids(row['all_texts2'])),axis=1)

In [None]:
features_df['name_id']

### 4.2.2 Brand Detections

* Brand vocabulary created by processing the specification columns of source and target website
* Extracted 3 features: name_brand, short_description_brand, all_texts_brand

`REF` - https://stackoverflow.com/questions/5319922/check-if-a-word-is-in-a-string-in-python

In [None]:
brand_vocab = text_processor.process_brand_names("specification1")
brand_vocab += text_processor.process_brand_names("specification2")
brand_vocab = set(brand_vocab)

def findWholeWord(w):
    return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search

def detect_brands(text, brand_vocab=brand_vocab):
    """
    Detect brands in the given text
    Args:
        text (str): string from the product information
        brand_vocab (list): list of brands
    
    Returns:
        list: brands detected in the product info
    """

    return [word.lower() for word in brand_vocab if findWholeWord(word)(text)]


def calculate_brand_detection(text1, text2, vocab=brand_vocab):
    """
    Jaccard Similarity between the identified brands
    Args:
        text1 (str): string from the product information
        text2 (str): string from the product information
        brand_vocab (list): list of brands
    
    Returns:
        int: Jaccard similarity between the brands of 2 products
    """
    
    set1 = detect_brands(text1, vocab)
    set2 = detect_brands(text2, vocab)
    
    return jaccard_sim(set1, set2)

## Average runtime - 1s

In [None]:
features_df['name_brand'] = processed_df.apply(lambda row: calculate_brand_detection(row['name1'], row['name2']), axis=1)
features_df['short_description_brand'] = processed_df.apply(lambda row: calculate_brand_detection(row['short_description1'], row['short_description2']), axis=1)
features_df['all_texts_brand'] = processed_df.apply(lambda row: calculate_brand_detection(row['all_texts1'], row['all_texts2']), axis=1)

## Average runtime - 1m 30s


In [None]:
processed_df['brand_list1'] = processed_df.apply(lambda row: append_to_set(row.get('brand_list1',set()),detect_brands(row['name1'])),axis=1)
processed_df['brand_list1'] = processed_df.apply(lambda row: append_to_set(row.get('brand_list1',set()),detect_brands(row['short_description1'])),axis=1)
processed_df['brand_list1'] = processed_df.apply(lambda row: append_to_set(row.get('brand_list1',set()),detect_brands(row['all_texts1'])),axis=1)
processed_df['brand_list2'] = processed_df.apply(lambda row: append_to_set(row.get('brand_list2',set()),detect_brands(row['name2'])),axis=1)
processed_df['brand_list2'] = processed_df.apply(lambda row: append_to_set(row.get('brand_list2',set()),detect_brands(row['short_description2'])),axis=1)
processed_df['brand_list2'] = processed_df.apply(lambda row: append_to_set(row.get('brand_list2',set()),detect_brands(row['all_texts2'])),axis=1)

### 4.2.3 Number Detections

* If no units are found near the number, the number is detected as a free number
* Free numbers can contain model numbers or other crucial information
* Extracted 5 features: name_numbers, short_description_numbers, long_description_numbers, specification_text_numbers, all_texts_numbers

In [None]:
def detect_numbers(text):
    """
    Detect free numbers in the given text
    Args:
        text (str): string from the product information
    
    Returns:
        list: free numbers detected in the product info
    """
    
    return [float(match.group()) for match in re.finditer(r'\b\d+(\.\d+)?\b', text)]

def calculate_numbers_detection(text1, text2):
    """
    Jaccard Similarity between the identified free numbers 
    Args:
        text1 (str): string from the product information
        text2 (str): string from the product information
    
    Returns:
        int: Jaccard similarity between the free numbers of 2 products
    """
    
    set1 = detect_numbers(text1)
    set2 = detect_numbers(text2)
    
    return jaccard_sim(set1, set2)

## Average runtime - 1s


In [None]:
features_df['name_numbers'] = processed_df.apply(lambda row: calculate_numbers_detection(row['name1'], row['name2']), axis=1)
features_df['short_description_numbers'] = processed_df.apply(lambda row: calculate_numbers_detection(row['short_description1'], row['short_description2']), axis=1)
features_df['long_description_numbers'] = processed_df.apply(lambda row: calculate_numbers_detection(row['long_description1'], row['long_description2']), axis=1)
features_df['specification_text_numbers'] = processed_df.apply(lambda row: calculate_numbers_detection(row['specification1'], row['specification2']), axis=1)
features_df['all_texts_numbers'] = processed_df.apply(lambda row: calculate_numbers_detection(row['all_texts1'], row['all_texts2']), axis=1)

## Average runtime - 1s

In [None]:
processed_df['numbers_list1'] = processed_df.apply(lambda row: append_to_set(row.get('numbers_list1',set()),detect_numbers(row['name1'])),axis=1)
processed_df['numbers_list1'] = processed_df.apply(lambda row: append_to_set(row.get('numbers_list1',set()),detect_numbers(row['short_description1'])),axis=1)
processed_df['numbers_list1'] = processed_df.apply(lambda row: append_to_set(row.get('numbers_list1',set()),detect_numbers(row['long_description1'])),axis=1)
processed_df['numbers_list1'] = processed_df.apply(lambda row: append_to_set(row.get('numbers_list1',set()),detect_numbers(row['all_texts1'])),axis=1)
processed_df['numbers_list2'] = processed_df.apply(lambda row: append_to_set(row.get('numbers_list2',set()),detect_numbers(row['name2'])),axis=1)
processed_df['numbers_list2'] = processed_df.apply(lambda row: append_to_set(row.get('numbers_list2',set()),detect_numbers(row['short_description2'])),axis=1)
processed_df['numbers_list2'] = processed_df.apply(lambda row: append_to_set(row.get('numbers_list2',set()),detect_numbers(row['long_description2'])),axis=1)
processed_df['numbers_list2'] = processed_df.apply(lambda row: append_to_set(row.get('numbers_list2',set()),detect_numbers(row['all_texts2'])),axis=1)

### 4.2.4 Descriptive words

* Set of the most characterising words for each attribute of the product
* Extracted 4 features: name_descriptives, short_description_descriptives, long_description_descriptives, all_texts_descriptives

In [None]:
def calculate_word_document_frequency(word,documents):
    counter = 0
    for document in documents:
        if word in document:
            counter+=1
    return counter

def create_word_frequency_dict(dataframe, column):
    words = dict()
    for row in dataframe[column]:
        for word in word_tokenize(row):
            words[word] = words.get(word,calculate_word_document_frequency(word,dataframe[column]))
    return words

## Average runtime - 1s

In [None]:
name1_words_frequency_dict = create_word_frequency_dict(processed_df,'name1')
name2_words_frequency_dict = create_word_frequency_dict(processed_df,'name2')
short_description1_words_frequency_dict = create_word_frequency_dict(processed_df,'short_description1')
short_description2_words_frequency_dict = create_word_frequency_dict(processed_df,'short_description2')
long_description1_words_frequency_dict = create_word_frequency_dict(processed_df,'long_description1')
long_description2_words_frequency_dict = create_word_frequency_dict(processed_df,'long_description2')
all_texts1_words_frequency_dict = create_word_frequency_dict(processed_df,'all_texts1')
all_texts2_words_frequency_dict = create_word_frequency_dict(processed_df,'all_texts2')

## Average runtime - 23min

In [None]:
def detect_descriptive_words(text,words_frequency_dict,documents_len,top_k = 50, maximum_p = 0.5):
    """
    Detect descriptive words in the given text
    Args:
        text (str): string from the product information
    
    Returns:
        list: descriptive words detected in the product info
    """
    all_words =  [(word,words_frequency_dict[word]) for word in word_tokenize(text) if words_frequency_dict[word] < maximum_p*documents_len]
    all_words.sort(key = lambda row: row[0])
    return all_words[-top_k:]

def calculate_descriptive_words(text1, text2,words_frequency_dict1,words_frequency_dict2,documents_len):
    """
    Jaccard Similarity between the identified descriptive words 
    Args:
        text1 (str): string from the product information
        text2 (str): string from the product information
    
    Returns:
        int: Jaccard similarity between the descriptive words of 2 products
    """
    
    set1 = detect_descriptive_words(text1,words_frequency_dict1,documents_len)
    set2 = detect_descriptive_words(text2,words_frequency_dict2,documents_len)
    
    return jaccard_sim(set1, set2)


In [None]:
documents_len = len(processed_df)
features_df['name_descriptives'] = processed_df.apply(lambda row: calculate_descriptive_words(row['name1'], row['name2'],name1_words_frequency_dict,name2_words_frequency_dict,documents_len), axis=1)

features_df['short_description_descriptives'] = processed_df.apply(lambda row: calculate_descriptive_words(row['short_description1'], row['short_description2'],short_description1_words_frequency_dict,short_description2_words_frequency_dict,documents_len), axis=1)

features_df['long_description_descriptives'] = processed_df.apply(lambda row: calculate_descriptive_words(row['long_description1'], row['long_description2'],long_description1_words_frequency_dict,long_description2_words_frequency_dict,documents_len), axis=1)

features_df['all_texts_descriptives'] = processed_df.apply(lambda row: calculate_descriptive_words(row['all_texts1'], row['all_texts2'],all_texts1_words_frequency_dict,all_texts2_words_frequency_dict,documents_len), axis=1)

## Average runtime - 4s

### 4.2.5 Unit Detection

* Extraction of numbers followed by units from each attribute 
* Extracted 5 features: name_units, short_description_units, long_description_units, specification_text_units, all_texts_units

In [None]:
def detect_units(text):
    """
    Detect numbers which are accompanied by units
    Args:
        text (str): string from the product information
    
    Returns:
        list: numbers around units detected in the product info
    """
    matches = re.findall(r'\b(\d+(\.\d+)?)\s*([a-zA-Z]+)\b', text)

    return [match[0] for match in matches]

def calculate_unit_detection(text1, text2):
    """
    Jaccard Similarity between the identified descriptive words 
    Args:
        text1 (str): string from the product information
        text2 (str): string from the product information
    
    Returns:
        int: Jaccard similarity between the detected numbers around units of 2 products
    """
    set1 = detect_units(text1)
    set2 = detect_units(text2)
    
    return jaccard_sim(set1, set2)

## Average runtime - 1s


In [None]:
features_df['name_units'] = processed_df.apply(lambda row: calculate_unit_detection(row['name1'], row['name2']), axis=1)
features_df['short_description_units'] = processed_df.apply(lambda row: calculate_unit_detection(row['short_description1'], row['short_description2']), axis=1)
features_df['long_description_units'] = processed_df.apply(lambda row: calculate_unit_detection(row['long_description1'], row['long_description2']), axis=1)
features_df['specification_text_units'] = processed_df.apply(lambda row: calculate_unit_detection(row['specification1'], row['specification2']), axis=1)
features_df['all_texts_units'] = processed_df.apply(lambda row: calculate_unit_detection(row['all_texts1'], row['all_texts2']), axis=1)

## Average runtime - 1s

In [None]:
## Generating list of all units for both the products from all the attributes
processed_df['units_list1'] = processed_df.apply(lambda row: append_to_set(row.get('units_list1',set()),detect_units(row['name1'])),axis=1)
processed_df['units_list1'] = processed_df.apply(lambda row: append_to_set(row.get('units_list1',set()),detect_units(row['short_description1'])),axis=1)
processed_df['units_list1'] = processed_df.apply(lambda row: append_to_set(row.get('units_list1',set()),detect_units(row['long_description1'])),axis=1)
processed_df['units_list1'] = processed_df.apply(lambda row: append_to_set(row.get('units_list1',set()),detect_units(row['all_texts1'])),axis=1)
processed_df['units_list2'] = processed_df.apply(lambda row: append_to_set(row.get('units_list2',set()),detect_units(row['name2'])),axis=1)
processed_df['units_list2'] = processed_df.apply(lambda row: append_to_set(row.get('units_list2',set()),detect_units(row['short_description2'])),axis=1)
processed_df['units_list2'] = processed_df.apply(lambda row: append_to_set(row.get('units_list2',set()),detect_units(row['long_description2'])),axis=1)
processed_df['units_list2'] = processed_df.apply(lambda row: append_to_set(row.get('units_list2',set()),detect_units(row['all_texts2'])),axis=1)

In [None]:
processed_df.columns

Index(['name1', 'short_description1', 'long_description1', 'name2',
       'short_description2', 'long_description2', 'specification1',
       'specification2', 'all_texts1', 'all_texts2', 'orig_specification1',
       'orig_specification2', 'match', 'brand_list1', 'brand_list2',
       'numbers_list1', 'numbers_list2', 'units_list1', 'units_list2'],
      dtype='object')

### 4.2.6 Words

* Ratio of the same words taking all words from corresponding attributes of two products 
* Extracted 3 features: name_words, short_description_words, all_texts_words

In [None]:
def calculate_words(text1, text2):
    """
    Ratio of common words to all words 
    Args:
        text1 (str): string from the product information
        text2 (str): string from the product information
    
    Returns:
        int: Jaccard similarity between the words of 2 products
    """
    
    set1 = set(word for word in text1.lower())
    set2 = set(word for word in text2.lower())
    
    return jaccard_sim(set1, set2)

## Average runtime - 1s

In [None]:
features_df['name_words'] = processed_df.apply(lambda row: calculate_words(row['name1'], row['name2']), axis=1)
features_df['short_description_words'] = processed_df.apply(lambda row: calculate_words(row['short_description1'], row['short_description2']), axis=1)
features_df['all_texts_words'] = processed_df.apply(lambda row: calculate_words(row['all_texts1'], row['all_texts2']), axis=1)

## Average runtime - 1s

### 4.3 All Detected Keywords Comparisons

* Ratio of matching values in those lists between two compared products
* Extracted 4 features: all_units_list, all_ids_list, all_numbers_list, all_brands_list

In [None]:
def ratio(set1, set2):
    """
    Ratio of common elements to total (can contain repetitions)
    Args:
        list1 (list): result from each type of detection
        list2 (list): result from each type of detection
    
    Return:
        int: common/total
    """
    
    total = len(set1.union(set2))
    
    return len(set1)/total if total != 0 else 0

In [None]:
features_df['all_units_list'] = processed_df.apply(lambda row: ratio(row['units_list1'], row['units_list2']), axis=1)
features_df['all_ids_list'] = processed_df.apply(lambda row: ratio(row['ids_list1'], row['ids_list2']), axis=1)
features_df['all_numbers_list'] = processed_df.apply(lambda row: ratio(row['numbers_list1'], row['numbers_list2']), axis=1)
features_df['all_brands_list'] = processed_df.apply(lambda row: ratio(row['brand_list1'], row['brand_list2']), axis=1)

### 4.4 Specification preprocessing

* Ratio of corresponding parameter names as specification_key
* Ratio of corresponding parameter names and values as specification_key_value
* Extracted 2 features: specification_key, specification_key_value

In [None]:
def calculate_key_value_match(text1, text2):
    """
    Common key value pairs in the specification column
    Args:
        text1 (str): string from the product information
        text2 (str): string from the product information
    
    Returns:
        int: Ratio between common pairs and total pairs of the 2 products
    """
    set1 = set([(d["key"],d["value"]) for d in eval(text1.lower())])
    set2 = set([(d["key"],d["value"]) for d in eval(text2.lower())])

    return jaccard_sim(set1,set2)

def calculate_key_match(list1, list2):
    """
    Common keys in the specification column
    Args:
        text1 (str): string from the product information
        text2 (str): string from the product information
    
    Returns:
        int: Ratio between common keys and total keys of the 2 products
    """
    set1 = [d["key"] for d in eval(list1.lower())]
    set2 = [d["key"] for d in eval(list2.lower())]

    return jaccard_sim(set1, set2)

## Average runtime - 1s

In [None]:
features_df['specification_key'] = processed_df.apply(lambda row: calculate_key_match(row['orig_specification1'], row['orig_specification2']), axis=1)
features_df['specification_key_value'] = processed_df.apply(lambda row: calculate_key_value_match(row['orig_specification1'], row['orig_specification2']), axis=1)

## Average runtime - 1s

### 4.5 Add Image Hash Similarities

* Join the image hash similarity with text processing features 

In [None]:
image_hash_train = pd.read_csv("features/ProMapEn/images_train_similarties.csv")
image_hash_test = pd.read_csv("features/ProMapEn/images_test_similarties.csv")

image_hashes = pd.concat([image_hash_train, image_hash_test], ignore_index=True)

print(image_hashes.shape)
image_hashes.head(3)

(1555, 1)


Unnamed: 0,hash_similarity
0,0.0
1,0.0
2,0.0


In [None]:
features_df["hash_similarity"] = image_hashes["hash_similarity"]

### 4.6 Label Stats


In [None]:
features_df['match'] = processed_df["match"]
features_df['match'].value_counts()

match
0    1046
1     509
Name: count, dtype: int64

In [None]:
print(features_df.shape)
features_df.head()

(1555, 19)


Unnamed: 0,name_cos,short_description_cos,long_description_cos,all_texts_cos,name_brand,short_description_brand,all_texts_brand,name_numbers,short_description_numbers,long_description_numbers,specification_text_numbers,all_texts_numbers,all_units_list,all_numbers_list,all_brands_list,specification_key,specification_key_value,hash_similarity,match
0,0.230623,0.096354,0.194154,0.567405,1.0,0.0,1.0,0.111111,0.0,0.0,0.0,0.3125,0.571429,0.625,1.0,0.0,0.0,0.0,0
1,0.178245,0.299044,0.377834,0.71701,0.0,0.0,1.0,0.25,0.666667,0.333333,0.0,0.4,0.916667,0.933333,1.0,0.0,0.0,0.0,0
2,0.07973,0.549188,0.747834,0.787388,0.0,1.0,1.0,0.0,0.5,0.0,0.0,0.166667,0.75,0.846154,1.0,0.0,0.0,0.0,0
3,0.706158,0.385939,0.272149,0.71522,1.0,1.0,1.0,1.0,0.1,0.0,0.0,0.307692,0.818182,0.846154,1.0,0.0,0.0,0.0,1
4,0.912291,0.0,0.34668,0.919648,1.0,0.0,1.0,1.0,0.0,0.2,0.0,0.75,1.0,1.0,1.0,0.0,0.0,0.0,0


## 5.0 Save the features into CSV file

* Train set: 1244 rows
* Test set: 311 rows

In [None]:
train_df = features_df.iloc[:1244, :]
test_df = features_df.iloc[1244:, :]

print("Shape of the training data", train_df.shape)
print("Shape of the testing data", test_df.shape)

Shape of the training data (1244, 19)
Shape of the testing data (311, 19)


In [None]:
train_df.to_csv("features/ProMapEn/promapen_train_similarities.csv", header=True, index=False)
test_df.to_csv("features/ProMapEn/promapen_test_similarities.csv", header=True, index=False)

In [None]:
# ## Reading the features again id 

# train_df = pd.read_csv("features/ProMapEn/promapen_train_similarities.csv")
# test_df = pd.read_csv("features/ProMapEn/promapen_test_similarities.csv")
# features_df = pd.concat([train_df, test_df], ignore_index=True)

In [None]:
columnslist = ['name_cos','long_description_descriptives','all_texts_cos','all_numbers_list','name_id','long_description_units','all_texts_brand','all_ids_list','name_brand','short_description_cos','all_texts_id','all_units_list','name_numbers','short_description_id','all_texts_numbers','specification_text_numbers','name_descriptives','short_description_brand','all_texts_descriptives','specification_text_units','name_units','short_description_numbers','all_texts_units','specification_key','name_words','short_description_descriptives','all_texts_words','specification_key_value','long_description_cos','short_description_units','all_brands_list','hash_similarity','long_description_numbers','short_description_words']

set(columnslist) - set(features_df.columns)

{'all_ids_list',
 'all_texts_descriptives',
 'all_texts_units',
 'all_texts_words',
 'long_description_descriptives',
 'long_description_units',
 'name_descriptives',
 'name_units',
 'name_words',
 'short_description_descriptives',
 'short_description_units',
 'short_description_words',
 'specification_text_units'}