## 1.) Set Up

In [6]:
# 1.1 Module Imports
import re 
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [7]:
# 1.2 Defining the TFIDF
def preprocess(text):
    text = re.sub(r'[^a-zA-Z]',' ', text.lower())
    tokens = word_tokenize(text)
    lemmer = WordNetLemmatizer()
    stop_words = stopwords.words("english")
    return " ".join([lemmer.lemmatize(word) for word 
                     in tokens if len(word) > 1 and not word in stop_words])

tfidf = TfidfVectorizer(analyzer='word', 
                     #ngram_range=(1, 3), 
                     preprocessor = preprocess,
                     min_df=2,  # May take out for overfit
                     stop_words='english')

In [9]:
# 1.3 Import data
df = pd.read_csv('./data/Macys_dresses_4_4_2020_cleaned.csv')
df.head(3)

Unnamed: 0,url,brand,title,price,colors,Web_ID,description,all_words
0,https://www.macys.com/shop/product/teeze-me-ju...,Teeze Me,Juniors' 2-Pc. Off-The-Shoulder Bodycon Gown,139.0,Red/White,10500778,Slay your next event in this two-piece gown fr...,Teeze Me Juniors' 2-Pc. Off-The-Shoulder Bodyc...
1,https://www.macys.com/shop/product/sadie-sage-...,Sadie & Sage,Long-Sleeve Ruffled Mini Dress,69.0,Navy,10560007,"So sweet and chic with a tiered ruffled skirt,...",Sadie & Sage Long-Sleeve Ruffled Mini Dress Na...
2,https://www.macys.com/shop/product/karen-kane-...,Karen Kane,Tie-Dyed Handkerchief-Hem Midi Dress,128.0,Tie Dye,10538864,A throwback look gets a chic update in this mi...,Karen Kane Tie-Dyed Handkerchief-Hem Midi Dres...


## 2.) Custom Recommendation System

In [39]:
# Input: df & list of indexs for items of confirmed customer interest
# Output: Series prepped with the items of interest at the end.

list_ids_interst = [2, 25, 405, 5098, 7000]
df.loc[list_ids_interst,['brand', 'title', 'Web_ID']]

Unnamed: 0,brand,title,Web_ID
2,Karen Kane,Tie-Dyed Handkerchief-Hem Midi Dress,10538864
25,CeCe,Mesh Ruffle-Trim Dress,10538764
405,Sequin Hearts,Juniors' Strappy-Back Glitter Gown,10500760
5098,Tommy Hilfiger,Ruched Metallic Sheath Dress,10280828
7000,Tommy Hilfiger,Textured Sheath Dress,8947748


In [40]:
df_sans_interest = df.drop(index = list_ids_interst)

# Adding all the words in the list of interested items into one doc
str_confirmed_interest = ' '.join(df.loc[list_ids_interst, 'all_words'])

# Reattaching the doc to the end of the appropriate series
temp_series = df_sans_interest['all_words'
    ].append(pd.Series(str_confirmed_interest, name = 'test_target'))
temp_series.tail(3)

7905    J Kara Beaded Gown with Shawl Silver/White A p...
7906    Alex Evenings Skirt, Tiered Chiffon Midi Black...
0       Karen Kane Tie-Dyed Handkerchief-Hem Midi Dres...
dtype: object

In [41]:
# Doing the maths
tfidf = tf.fit_transform(temp_series)
cosine_similarities = linear_kernel(tfidf, tfidf)
#cosine_similarities[0:3]

In [42]:
# Organizing the data for ONE SPECIFIC ROW
#row = temp_series.iloc[-1]
idx = temp_series.shape[0] - 1
similar_indices = cosine_similarities[idx].argsort()[:-100:-1] # Returns values in order of most -> least.
similar_items = [(cosine_similarities[idx][i], 
                  df.index[i]) for i in similar_indices]  # CHECK THIS 'Index'-based code
closest = similar_items[1:]
closest[0:3]

[(0.5321020749866268, 6396),
 (0.47775396835683764, 5089),
 (0.4473857529241537, 3813)]

In [43]:
print("Recommended Dresses")
list_o_dicts = []
for i in closest[:5]:
    list_o_dicts.append({
        'Brand'      : df.loc[i[1],'brand'],
        'Title'      : df.loc[i[1],'title'],
        'Web_ID'     : df.loc[i[1],'Web_ID'],
        'Similarity' : np.round(i[0],3) 
    })
pd.DataFrame(list_o_dicts, index=range(1,6))

Recommended Dresses


Unnamed: 0,Brand,Title,Web_ID,Similarity
1,Blondie Nites,Juniors' Strapless Sweetheart Dress,9486220,0.532
2,Calvin Klein,Plus Size Embellished Scuba Sheath Dress,10198892,0.478
3,Lauren Ralph Lauren,Crepe Fit-and-Flare Dress,10404311,0.447
4,B Darlin,Juniors' Lace & Scuba Fit & Flare Dress,10226691,0.431
5,BCBGeneration,Glossy Stretch Mini Dress,10287647,0.414


# 3.) Example Reccomemders

## 3.1 Copied via Heartbeat Code
The following code (while minorly editd) was originally copied from the below URL.  Much of it has influenced my above code because understanding this code is how I've learned how to make a reccomendation system.

https://heartbeat.fritz.ai/recommender-systems-with-python-part-i-content-based-filtering-5df4940bd831

In [114]:
def preprocess(text):
    text = re.sub(r'[^a-zA-Z]',' ', text.lower())
    tokens = word_tokenize(text)
    lemmer = WordNetLemmatizer()
    stop_words = stopwords.words("english")
    return " ".join([lemmer.lemmatize(word) for word 
                     in tokens if len(word) > 1 and not word in stop_words])

tf = TfidfVectorizer(analyzer='word', 
                     #ngram_range=(1, 3), 
                     preprocessor = preprocess,
                     min_df=2,  # May take out for overfit
                     stop_words='english')

# Her original TF-IDF
#tf = TfidfVectorizer(analyzer='word', 
#                     ngram_range=(1, 3),
#                     min_df=0, 
#                     stop_words='english')

tfidf_matrix = tf.fit_transform(ds['description'])

In [116]:
def fun_matrix_to_results(tfidf_matrix):
    """Here we’ve calculated the cosine similarity of each item with every other 
    item in the dataset, and then arranged them according to their similarity with 
    item i, and stored the values in results."""
    
    cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix) # Compare All.
    results = {}
    for idx, row in ds.iterrows():
        similar_indices = cosine_similarities[idx
            ].argsort()[:-100:-1] 
        similar_items = [(cosine_similarities[idx
            ][i], ds['id'][i]) for i in similar_indices] # 
        results[row['id']] = similar_items[1:]
    return results
results = fun_matrix_to_results(tfidf_matrix)

In [117]:
# Read and returns info from Original Dataframe
def item(id): 
    return ds.loc[ds['id'] == id]['description'
                                 ].tolist()[0].split(' - ')[0] 

def fun_org_recommend(item_id, num):
    item_id = ds.iloc[(item_id - 1), 1].split(' - ')[0]
    print("Recommending " + str(num) + " products similar to " 
          + item(item_id) + "...")   
    print("-------")    
    recs = results[item_id][:num]   
    for rec in recs: 
        print("Recommended: " + item(rec[1]) + 
              " (score:" + str(rec[0]) + ")")
    return

## 3.2 Copied Datacamp Code 
The following code (while minorly editd) was originally copied from the below URL.  This code has influenced very little of my own custom code, but is left here for referense.

https://www.datacamp.com/community/tutorials/recommender-systems-python

In [10]:
#df = ps.read_csv('')
#df['overview'] == df['all_words']
from sklearn.feature_extraction.text import TfidfVectorizer
tiffy = TfidfVectorizer(stop_words = 'english')
tiffy_matrix = tiffy.fit_transform(df['all_words'])

In [22]:
#Import linear_kernal to compute the dot product
from sklearn.metrics.pairwise import linear_kernel
#Compute cosine similarity matrix
cosine_sim = linear_kernel(tiffy_matrix, tiffy_matrix) 
#constructreverse mapping indices & [movie titles] drop duplicate [titles], if any.
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [33]:
#Function that takes in movei title as input and give reccommendations
def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices):
    #obtainthe index of the movie that matches the title
    idx = indices[title]
    #Getthe pairwise similarity scores of all movies with that movie
    #Convertit into a list of tuples as described above
    sim_scores = list(enumerate(cosine_sim[idx]))
    #sortthe movies based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key = lambda x:  x[1], reverse = True)
    #Get the scores of the 10 most similar movies. Ignore the first movie.
    sim_scores = sim_scores[1:11]
    #Get movie indices
    movie_indices = [i[0] for i in sim_scores]
    #Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

content_recommender("Scuba Pencil Skirt, Created for Macy's")

##Metadata Based Recommender
#Load the keywords and credit files

685               Lace Pencil Skirt, Created For Macy's
118             Printed Scuba Skirt, Created For Macy's
93       Printed Scuba Pencil Skirt, Created For Macy's
28                             Scuba Crepe Pencil Skirt
139       Zipper Pencil Scuba Skirt, Created for Macy's
3      INC Solid Scuba Pencil Skirt, Created for Macy's
147       Zipper Pencil Scuba Skirt, Created for Macy's
356       Zipper Pencil Scuba Skirt, Created for Macy's
42       Zipper Printed Scuba Skirt, Created for Macy's
9                Denim Pencil Skirt, Created for Macy's
Name: title, dtype: object