## Import library 

In [1]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from scipy.sparse import coo_matrix
from spacy.cli.train import train

In [2]:
# read dataset , tsv file meaning that value separated by \t
train_data = pd.read_csv('rawdata.tsv', sep='\t')
train_data.columns

Index(['Uniq Id', 'Crawl Timestamp', 'Dataset Origin', 'Product Id',
       'Product Barcode', 'Product Company Type Source',
       'Product Brand Source', 'Product Brand Normalised Source',
       'Product Name Source', 'Match Rank', 'Match Score', 'Match Type',
       'Retailer', 'Product Category', 'Product Brand', 'Product Name',
       'Product Price', 'Sku', 'Upc', 'Product Url', 'Market',
       'Product Description', 'Product Currency',
       'Product Available Inventory', 'Product Image Url',
       'Product Model Number', 'Product Tags', 'Product Contents',
       'Product Rating', 'Product Reviews Count', 'Bsr', 'Joining Key'],
      dtype='object')

In [3]:
train_data.shape

(5000, 32)

## Remove unnecessary column

In [4]:
#Filtering of data as per our need choosing necessary columns
train_data = train_data[['Uniq Id','Product Id', 'Product Rating', 'Product Reviews Count', 'Product Category', 'Product Brand', 'Product Name', 'Product Price', 'Product Image Url', 'Product Description', 'Product Tags']]
train_data.head(3)

Unnamed: 0,Uniq Id,Product Id,Product Rating,Product Reviews Count,Product Category,Product Brand,Product Name,Product Price,Product Image Url,Product Description,Product Tags
0,1705736792d82aa2f2d3caf1c07c53f4,2e17bf4acecdece67fc00f07ad62c910,,,Premium Beauty > Premium Makeup > Premium Nail...,OPI,"OPI Infinite Shine, Nail Lacquer Nail Polish, ...",8.95,https://i5.walmartimages.com/asr/0e1f4c51-c1a4...,,"OPI Infinite Shine, Nail Lacquer Nail Polish, ..."
1,95a9fe6f4810fcfc7ff244fd06784f11,076e5854a62dd283c253d6bae415af1f,,,Beauty > Hair Care > Hair Color > Auburn Hair ...,Nice'n Easy,"Nice n Easy Permanent Color, 111 Natural Mediu...",29.86,https://i5.walmartimages.com/asr/9c8e42e4-13a5...,Pack of 3 Pack of 3 for the UPC: 381519000201 ...,"Nice 'n Easy Permanent Color, 111 Natural Medi..."
2,8d4d0330178d3ed181b15a4102b287f2,8a4fe5d9c7a6ed26cc44d785a454b124,4.5,29221.0,Beauty > Hair Care > Hair Color > Permanent Ha...,Clairol,Clairol Nice N Easy Permanent Color 7/106A Nat...,7.99,https://i5.walmartimages.com/asr/e3a601c2-6a2b...,This Clairol Nice N Easy Permanent Color gives...,Clairol Nice 'N Easy Permanent Color 7/106A Na...


In [5]:
train_data['Product Tags']

0       OPI Infinite Shine, Nail Lacquer Nail Polish, ...
1       Nice 'n Easy Permanent Color, 111 Natural Medi...
2       Clairol Nice 'N Easy Permanent Color 7/106A Na...
3       Kokie Professional Matte Lipstick, Hot Berry, ...
4       Gillette TRAC II Plus Razor Blade Refills, Fit...
                              ...                        
4995    Garden Mint Room Spray (Double Strength), 4 ou...
4996    Garnier Nutrisse Nourishing Hair Color Creme (...
4997    Nail File Electric Drill, 6 in 1 Professional ...
4998    Creed Love In Black Hair And Body Wash 6.8oz/2...
4999                    Foundation, Wal-mart, Walmart.com
Name: Product Tags, Length: 5000, dtype: object

In [6]:
train_data.shape

(5000, 11)

In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Uniq Id                5000 non-null   object 
 1   Product Id             5000 non-null   object 
 2   Product Rating         2194 non-null   float64
 3   Product Reviews Count  3346 non-null   float64
 4   Product Category       4990 non-null   object 
 5   Product Brand          4987 non-null   object 
 6   Product Name           5000 non-null   object 
 7   Product Price          4958 non-null   float64
 8   Product Image Url      5000 non-null   object 
 9   Product Description    3873 non-null   object 
 10  Product Tags           5000 non-null   object 
dtypes: float64(3), object(8)
memory usage: 429.8+ KB


In [8]:
# Handling with missing value
train_data.isnull().sum()

Uniq Id                     0
Product Id                  0
Product Rating           2806
Product Reviews Count    1654
Product Category           10
Product Brand              13
Product Name                0
Product Price              42
Product Image Url           0
Product Description      1127
Product Tags                0
dtype: int64

## Fill null value

In [9]:
train_data = train_data.dropna(subset=['Product Price'])
# Fill missing values in 'Product Rating' with a default value (e.g., 0)
train_data['Product Rating'] = train_data['Product Rating'].fillna(0)

# Fill missing values in 'Product Reviews Count' with a default value (e.g., 0)
train_data['Product Reviews Count'] = train_data['Product Reviews Count'].fillna(0)

# Fill missing values in 'Product Category' with a default value (e.g., 'Unknown')
train_data['Product Category'] = train_data['Product Category'].fillna('')

# Fill missing values in 'Product Brand' with a default value (e.g., 'Unknown')
train_data['Product Brand'] = train_data['Product Brand'].fillna('')

# Fill missing values in 'Product Description' with an empty string
train_data['Product Description'] = train_data['Product Description'].fillna('')

In [10]:
train_data.isnull().sum()

Uniq Id                  0
Product Id               0
Product Rating           0
Product Reviews Count    0
Product Category         0
Product Brand            0
Product Name             0
Product Price            0
Product Image Url        0
Product Description      0
Product Tags             0
dtype: int64

In [11]:
train_data.duplicated().sum()

np.int64(0)

## Rename column

In [12]:
# make columns shorter
# Define the mapping of current column names to shorter names
column_name_mapping = {
    'Uniq Id': 'ID',
    'Product Id': 'ProdID',
    'Product Rating': 'Rating',
    'Product Reviews Count': 'ReviewCount',
    'Product Category': 'Category',
    'Product Brand': 'Brand',
    'Product Name': 'Name',
    'Product Price': 'Price',
    'Product Image Url': 'ImageURL',
    'Product Description': 'Description',
    'Product Tags': 'Tags',
    'Product Contents': 'Contents'    
}
# Rename the columns using the mapping
train_data.rename(columns=column_name_mapping, inplace=True)

In [13]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4958 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ID           4958 non-null   object 
 1   ProdID       4958 non-null   object 
 2   Rating       4958 non-null   float64
 3   ReviewCount  4958 non-null   float64
 4   Category     4958 non-null   object 
 5   Brand        4958 non-null   object 
 6   Name         4958 non-null   object 
 7   Price        4958 non-null   float64
 8   ImageURL     4958 non-null   object 
 9   Description  4958 non-null   object 
 10  Tags         4958 non-null   object 
dtypes: float64(3), object(8)
memory usage: 464.8+ KB


In [14]:
#change value to numeric 
train_data['ID'] = train_data['ID'].str.extract(r'(\d+)').astype(float)
train_data['ProdID'] = train_data['ProdID'].str.extract(r'(\d+)').astype(float)

## Creating tags 

In [15]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 330.3 kB/s eta 0:00:39
     ---------------------------------------- 0.1/12.8 MB 1.2 MB/s eta 0:00:11
      --------------------------------------- 0.3/12.8 MB 2.0 MB/s eta 0:00:07
     - -------------------------------------- 0.6/12.8 MB 3.5 MB/s eta 0:00:04
     -- ------------------------------------- 0.8/12.8 MB 3.8 MB/s eta 0:00:04
     --- ------------------------------------ 1.1/12.8 MB 4.1 MB/s eta 0:00:03
     ---- ----------------------------------- 1.4/12.8 MB 4.5 MB/s eta 0:00:03
     ---- ----------------------------------- 1.6/12.8 MB 4.2 MB/s eta 0:00:03
     ----- ---------------------------------- 1.9/12.8 MB 4.6 MB/s eta 0:00:03
     ------ ----------------------------


[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load("en_core_web_sm")

def clean_and_extract_tags(text):
    doc = nlp(text.lower())
    tags = [token.text for token in doc if token.text.isalnum() and token.text not in STOP_WORDS]
    return ', '.join(tags)

# List of columns to extract tags from, including 'Name'
columns_to_extract_tags_from = ['Name', 'Category', 'Brand', 'Description']

# Create new columns for cleaned tags
for column in columns_to_extract_tags_from:
    train_data[f'{column}_Tags'] = train_data[column].apply(clean_and_extract_tags)

# Concatenate the cleaned tags from all relevant columns into a 'Tags' column
train_data['Tags'] = train_data[[f'{col}_Tags' for col in columns_to_extract_tags_from]].apply(lambda row: ', '.join(row), axis=1)

# Example of how the DataFrame looks after adding the new columns
train_data.head()


Unnamed: 0,ID,ProdID,Rating,ReviewCount,Category,Brand,Name,Price,ImageURL,Description,Tags,Name_Tags,Category_Tags,Brand_Tags,Description_Tags
0,1705737000.0,2.0,0.0,0.0,Premium Beauty > Premium Makeup > Premium Nail...,OPI,"OPI Infinite Shine, Nail Lacquer Nail Polish, ...",8.95,https://i5.walmartimages.com/asr/0e1f4c51-c1a4...,,"opi, infinite, shine, nail, lacquer, nail, pol...","opi, infinite, shine, nail, lacquer, nail, pol...","premium, beauty, premium, makeup, premium, nai...",opi,
1,95.0,76.0,0.0,0.0,Beauty > Hair Care > Hair Color > Auburn Hair ...,Nice'n Easy,"Nice n Easy Permanent Color, 111 Natural Mediu...",29.86,https://i5.walmartimages.com/asr/9c8e42e4-13a5...,Pack of 3 Pack of 3 for the UPC: 381519000201 ...,"nice, n, easy, permanent, color, 111, natural,...","nice, n, easy, permanent, color, 111, natural,...","beauty, hair, care, hair, color, auburn, hair,...",easy,"pack, 3, pack, 3, upc, 381519000201, beautiful..."
2,8.0,8.0,4.5,29221.0,Beauty > Hair Care > Hair Color > Permanent Ha...,Clairol,Clairol Nice N Easy Permanent Color 7/106A Nat...,7.99,https://i5.walmartimages.com/asr/e3a601c2-6a2b...,This Clairol Nice N Easy Permanent Color gives...,"clairol, nice, n, easy, permanent, color, natu...","clairol, nice, n, easy, permanent, color, natu...","beauty, hair, care, hair, color, permanent, ha...",clairol,"clairol, nice, n, easy, permanent, color, give..."
3,4.0,3.0,0.0,0.0,Beauty > Makeup > Lip,Kokie Cosmetics,"Kokie Professional Matte Lipstick, Hot Berry, ...",5.16,https://i5.walmartimages.com/asr/25b4b467-bc61...,Calling all matte lip lovers! Indulge in our r...,"kokie, professional, matte, lipstick, hot, ber...","kokie, professional, matte, lipstick, hot, ber...","beauty, makeup, lip","kokie, cosmetics","calling, matte, lip, lovers, indulge, rich, cr..."
4,990.0,3.0,0.0,131.0,Seasonal > Stock Up Essentials > Personal Care...,Gillette,"Gillette TRAC II Plus Razor Blade Refills, Fit...",19.97,https://i5.walmartimages.com/asr/1a2ebb06-cd01...,"In 1971, Gillette introduced the Trac II razor...","gillette, trac, ii, plus, razor, blade, refill...","gillette, trac, ii, plus, razor, blade, refill...","seasonal, stock, essentials, personal, care, s...",gillette,"1971, gillette, introduced, trac, ii, razor, s..."


In [17]:
train_data.columns

Index(['ID', 'ProdID', 'Rating', 'ReviewCount', 'Category', 'Brand', 'Name',
       'Price', 'ImageURL', 'Description', 'Tags', 'Name_Tags',
       'Category_Tags', 'Brand_Tags', 'Description_Tags'],
      dtype='object')

In [18]:
train_data=train_data[['ID', 'ProdID', 'Rating', 'ReviewCount', 'Category', 'Brand', 'Name',
       'Price', 'ImageURL', 'Description', 'Tags']]


train_data.to_csv('data/clean_data.csv', index=True)

