In [40]:
import pandas as pd

In [41]:
articles = pd.read_csv('articles.csv')

In [42]:
articles.head()

Unnamed: 0.1,Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [43]:
# Taking only 10000 items :-
articles = articles[:10000]

In [44]:
# Combine all info from descriptive columns to a single column separated by space :-
cols = ['prod_name', 'product_type_name', 'product_group_name',
        'graphical_appearance_name', 'colour_group_name',
       
        'department_name', 'index_name', 'index_group_name', 'section_name',
        'garment_group_name', 'detail_desc']

articles['combined_cols'] = articles[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

In [45]:
articles = articles[['article_id', 'combined_cols']]

In [46]:
articles.shape

(5, 2)

In [47]:
articles.head()

Unnamed: 0,article_id,combined_cols
0,108775015,Strap top Vest top Garment Upper body Solid Bl...
1,108775044,Strap top Vest top Garment Upper body Solid Wh...
2,108775051,Strap top (1) Vest top Garment Upper body Stri...
3,110065001,OP T-shirt (Idro) Bra Underwear Solid Black Cl...
4,110065002,OP T-shirt (Idro) Bra Underwear Solid White Cl...


In [48]:
# Download the stopwords corpus and Porter stemming algorithm :-

# import nltk

# nltk.download('stopwords')
# nltk.download('punkt')

In [49]:
# Data cleaning :-
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
def text_process(desc):
    articles['combined_cols'].fillna(value='', inplace=True) # Fill the null values with empty string
    # Remove punctuation :-
    noPunc = [c for c in desc if c not in string.punctuation]
    noPunc = ''.join(noPunc)
    noPunc = noPunc.split()
    # Remove stopwords :-
    stopword = stopwords.words('english')
    desc_stopwords = [word.lower() for word in noPunc if word.lower() not in stopword]
    # Replace words with their respective stems :-
    stemmer = PorterStemmer()
    desc_cleaned = [stemmer.stem(word) for word in desc_stopwords]
    return desc_cleaned

In [50]:
# Vectorizing the data :-
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(analyzer=text_process)
tfidf_matrix = tfidf.fit_transform(articles['combined_cols'])

In [51]:
# Recommender system :-
from sklearn.metrics.pairwise import cosine_similarity
cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [52]:
# Map the article_id to their indices :-
indices = pd.Series(articles.index, index=articles['article_id']).drop_duplicates()

In [53]:
# Method to predict similar articles :-
def recommendations(article_id):
    i = indices[article_id] # Index of the articles that match the given article
    sim_scores = list(enumerate(cos_sim[i])) # Similarity scores of all articles w.r.t. to the given article
    sim_scores.sort(key=lambda x: x[1], reverse=True) # Sort the similarity scores in descending order
    # Get the scores of the 10 most similar articles
    sim_scores = sim_scores[:10]
    # Get the article indices
    article_indices = [score[0] for score in sim_scores]
    return articles['article_id'].iloc[article_indices].values

In [54]:
# Get the recommendations for a sample product :-
# predict_recom = recommendations(118458003)
# print(predict_recom)

In [55]:
# Save the recommender as a file :-
import pickle

# Save the similarity matrix to a file using pickle :-
with open('recommender.pkl', 'wb') as file:
    pickle.dump(cos_sim, file)

In [56]:
# Method for showing search results with a given description of the product :-
def search_result(desc):
    search_tfidf = tfidf.transform([desc])
    cos_sim = cosine_similarity(search_tfidf, tfidf_matrix)
    sim_scores = list(enumerate(cos_sim[0]))
    sim_scores.sort(key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[:50]
    article_indices = [score[0] for score in sim_scores]
    return articles['article_id'].iloc[article_indices].values

In [57]:
# Sample search :-
search_result("jogger women")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  articles['combined_cols'].fillna(value='', inplace=True) # Fill the null values with empty string


array([110065002, 110065001, 108775044, 108775015, 108775051])

In [58]:
# Save both the TfidfVectorizer and tfidf_matrix as files for searching products :-
with open('tfidf.pkl', 'wb') as file:
    pickle.dump(tfidf, file)
with open('tfidf_matrix.pkl', 'wb') as file:
    pickle.dump(tfidf_matrix, file)