In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
from PIL import Image
import matplotlib.pyplot as plt
from tqdm import tqdm

In [14]:
transactions = pd.read_csv("data/transactions_train.csv")
customers = pd.read_csv("data/customers.csv")
articles = pd.read_csv("data/articles.csv")

In [15]:
articles.isnull().any()
missing_count = articles['detail_desc'].isnull().sum()
print(missing_count)

416


In [16]:
# Removing missing data

articles = articles.dropna(subset=['detail_desc'])
articles = articles.reset_index(drop=True)
articles.shape

(105126, 25)

In [17]:
articles['product_group_name'].value_counts()

product_group_name
Garment Upper body       42593
Garment Lower body       19763
Garment Full body        13245
Accessories              11141
Underwear                 5449
Shoes                     5196
Swimwear                  3104
Socks & Tights            2438
Nightwear                 1899
Unknown                    121
Underwear/nightwear         54
Cosmetic                    49
Bags                        25
Items                       17
Furniture                   13
Garment and Shoe care        9
Stationery                   5
Interior textile             3
Fun                          2
Name: count, dtype: int64

In [18]:
# Preparation for TF-IDF matrix

articles_str = articles.select_dtypes(include=['object'])
articles_str

Unnamed: 0,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_code,index_name,index_group_name,section_name,garment_group_name,detail_desc
0,Strap top,Vest top,Garment Upper body,Solid,Black,Dark,Black,Jersey Basic,A,Ladieswear,Ladieswear,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.
1,Strap top,Vest top,Garment Upper body,Solid,White,Light,White,Jersey Basic,A,Ladieswear,Ladieswear,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.
2,Strap top (1),Vest top,Garment Upper body,Stripe,Off White,Dusty Light,White,Jersey Basic,A,Ladieswear,Ladieswear,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.
3,OP T-shirt (Idro),Bra,Underwear,Solid,Black,Dark,Black,Clean Lingerie,B,Lingeries/Tights,Ladieswear,Womens Lingerie,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,OP T-shirt (Idro),Bra,Underwear,Solid,White,Light,White,Clean Lingerie,B,Lingeries/Tights,Ladieswear,Womens Lingerie,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105121,5pk regular Placement1,Socks,Socks & Tights,Placement print,Black,Dark,Black,Socks Bin,F,Menswear,Menswear,Men Underwear,Socks and Tights,Socks in a fine-knit cotton blend with a small...
105122,SPORT Malaga tank,Vest top,Garment Upper body,Solid,Black,Dark,Black,Jersey,A,Ladieswear,Ladieswear,H&M+,Jersey Fancy,Loose-fitting sports vest top in ribbed fast-d...
105123,Cartwheel dress,Dress,Garment Full body,Solid,Black,Dark,Black,Jersey,A,Ladieswear,Ladieswear,Womens Trend,Jersey Fancy,"Short, A-line dress in jersey with a round nec..."
105124,CLAIRE HAIR CLAW,Hair clip,Accessories,Solid,Black,Dark,Black,Small Accessories,D,Divided,Divided,Divided Accessories,Accessories,Large plastic hair claw.


In [19]:
# Concatenate all text columns to create a document collection

documents = articles_str.apply(' '.join, axis=1)
print(documents)

0         Strap top Vest top Garment Upper body Solid Bl...
1         Strap top Vest top Garment Upper body Solid Wh...
2         Strap top (1) Vest top Garment Upper body Stri...
3         OP T-shirt (Idro) Bra Underwear Solid Black Da...
4         OP T-shirt (Idro) Bra Underwear Solid White Li...
                                ...                        
105121    5pk regular Placement1 Socks Socks & Tights Pl...
105122    SPORT Malaga tank Vest top Garment Upper body ...
105123    Cartwheel dress Dress Garment Full body Solid ...
105124    CLAIRE HAIR CLAW Hair clip Accessories Solid B...
105125    Lounge dress Dress Garment Full body Solid Off...
Length: 105126, dtype: object


In [20]:
import re
from tqdm import tqdm
from nltk.corpus import stopwords

def preprocess_text_with_progress(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    tokens = text.split()
    filtered_tokens = [word for word in tokens if word not in stopwords.words('english')]
    return " ".join(filtered_tokens)

# Apply preprocess_text_with_progress to each document
documents_processed = []

# Use tqdm to track the progress
for document in tqdm(documents):
    documents_processed.append(preprocess_text_with_progress(document))

print(documents_processed)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 105126/105126 [19:31<00:00, 89.77it/s]
IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [28]:
print(documents_processed[0])

strap top vest top garment upper body solid black dark black jersey basic ladieswear ladieswear womens everyday basics jersey basic jersey top narrow shoulder straps


In [29]:
# TF-IDF vectorization

tfidf = TfidfVectorizer()


In [30]:
# Convert the document collection to TF-IDF vectors

tfidf_matrix = tfidf.fit_transform(documents_processed)
# print(tfidf_matrix.toarray())

In [24]:
# Examining the TF-IDF matrix

tfidf_matrix.shape
tfidf.get_feature_names_out()

array(['0241590', '0241602', '0391750', ..., 'öppen', 'örhängen', 'ﬁbres'],
      dtype=object)

In [31]:
def most_similar_tfidf_vectors(tfidf_matrix, tfidf_vector, n=6):
    # Compute cosine similarity between tfidf_vector and all vectors in tfidf_matrix
    similarities = cosine_similarity(tfidf_matrix, tfidf_vector.reshape(1, -1))
    
    # Flatten similarities array and get indexes of top n most similar vectors
    similar_indexes = np.argsort(similarities.flatten())[::-1][:n]
    
    similar_article_ids = articles.iloc[similar_indexes]
    
    return similar_article_ids

In [32]:
import random 

In [33]:
# The code snippet that generates recommendations for a random article using your content-based recommender
id = random.randint(0,tfidf_matrix.shape[0]-1)
tfidf_vector = tfidf_matrix[id].toarray()
print(articles[['article_id', 'prod_name', 'product_type_name']].iloc[id])
recommended_articles = most_similar_tfidf_vectors(tfidf_matrix, tfidf_vector, 6)

recommended_articles[['article_id','prod_name','product_type_name','product_group_name','department_name','index_name','detail_desc']]

article_id                            584200005
prod_name            Bianca off shoulder blouse
product_type_name                        Blouse
Name: 19282, dtype: object


Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,department_name,index_name,detail_desc
19282,584200005,Bianca off shoulder blouse,Blouse,Garment Upper body,Tops Woven,Divided,Off-the-shoulder blouse in a cotton weave with...
19283,584200006,Bianca off shoulder blouse,Blouse,Garment Upper body,Tops Woven,Divided,Off-the-shoulder blouse in a cotton weave with...
93610,860646001,VIOLET LS BLOUSE,Blouse,Garment Upper body,Tops Woven,Divided,Cold shoulder blouse in a crêpe weave with nar...
55871,713699004,MOLLY OFF SHOULDER,Blouse,Garment Upper body,Tops Woven,Divided,"Short, off-the-shoulder blouse in a crêpe weav..."
55870,713699002,MOLLY OFF SHOULDER,Blouse,Garment Upper body,Tops Woven,Divided,"Short, off-the-shoulder blouse in a crêpe weav..."
55872,713699005,MOLLY OFF SHOULDER,Blouse,Garment Upper body,Tops Woven,Divided,"Short, off-the-shoulder blouse in a crêpe weav..."
