In [45]:
import pandas as pd
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process 
import re
from unidecode import unidecode
import spacy
import en_core_web_md
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer





In [46]:
# Load the outfit combinations table; the path is resolved relative to the
# notebook's working directory.
d = pd.read_csv('outfit_combinations.csv', encoding='UTF-8')


In [47]:
 # Cast every column to str and lower-case it. Note that astype(str) also turns
 # missing values into the literal string 'nan'.
 d = d.apply(lambda column: column.astype(str).str.lower())

In [48]:
#Generating brand tokens

# Transliterate accented characters to plain ASCII first; otherwise the regex
# below would strip them and distort the similarity calculation.
d['brand'] = d['brand'].apply(unidecode)

# Keep only ASCII letters and whitespace. Raw string so that `\s` is a regex
# class and not an invalid string escape. This pattern is reused by the
# product_full_name cell.
regex = re.compile(r'[^a-zA-Z\s]')

# regex=True must be explicit: pandas >= 2.0 defaults to regex=False and raises
# ValueError when handed a compiled pattern.
d['brand'] = d['brand'].str.replace(regex, '', regex=True)

# Unique lower-case word tokens across all brand names.
brands = {
    token.lower()
    for brand_name in d.brand
    for token in re.findall(r"[\w']+", brand_name)
}


In [49]:
#Generating outfit type tokens

# Drop digits from the item type. regex=True must be explicit: pandas >= 2.0
# treats the pattern as a literal string by default, which would silently
# leave the digits in place.
d['outfit_item_type'] = d['outfit_item_type'].str.replace(r'\d+', '', regex=True)

# Unique lower-case word tokens across all outfit item types.
outfit_types = {
    token.lower()
    for item_type in d.outfit_item_type
    for token in re.findall(r"[\w']+", item_type)
}


In [50]:
#Generating product_full_name tokens

# Transliterate accented characters to plain ASCII first; otherwise the regex
# below would strip them and distort the similarity calculation.
d['product_full_name'] = d['product_full_name'].apply(unidecode)

# `regex` is the compiled letters-and-whitespace pattern defined in the brand
# cell. regex=True must be explicit: pandas >= 2.0 defaults to regex=False and
# raises ValueError when handed a compiled pattern.
d['product_full_name'] = d['product_full_name'].str.replace(regex, '', regex=True)

# Unique lower-case word tokens across all product names.
product_full_names = {
    token.lower()
    for full_name in d.product_full_name
    for token in re.findall(r"[\w']+", full_name)
}


In [51]:
# Character length of every outfit_id, appended to `d` as a helper column.
d['oi_len'] = d['outfit_id'].map(len)
# The length distribution shown below motivates the ID heuristic used later:
# a query that is an alphanumeric string of 26 or more characters is assumed
# to be an ID (the notebook applies this to product IDs) rather than free text.
d['oi_len'].value_counts()

26    5291
Name: oi_len, dtype: int64

In [52]:
#Using basic FuzzyWuzzy matching for product ID
def returnFuzzyMatches(query, choices):
    """Return the entries of `choices` that best match `query`.

    Parameters
    ----------
    query : str
        Text to look up.
    choices : sequence of str
        Candidate strings to match against.

    Returns
    -------
    list of str
        A one-element list when the best candidate scores exactly 100 under
        `fuzz.token_sort_ratio`; otherwise up to five candidates as ranked by
        `process.extract` with its default scorer. An empty list when there
        is nothing to match against.
    """
    best = process.extractOne(query, choices, scorer=fuzz.token_sort_ratio)
    if best is None:
        # extractOne yields None when no candidate could be scored (e.g. empty
        # `choices`); indexing it would raise TypeError.
        return []
    if best[1] == 100:  # perfect match: no need for alternatives
        return [best[0]]
    return [match[0] for match in process.extract(query, choices, limit=5)]
                

In [53]:
#Collapse the query to its letters and digits only: special characters,
#punctuation and spaces are dropped and the remaining characters are
#concatenated, since only letters and digits matter for the ID lookup.
def refinedQueryString(query):
    """Return `query` with every non-alphanumeric character removed."""
    return ''.join(filter(str.isalnum, query))
    

In [54]:
def getCombinationsByOutfitIds(ids, d):
    """Map each outfit ID in `ids` to the rows of `d` that carry it.

    Returns a dict keyed by outfit ID; each value is the sub-frame of `d`
    whose 'outfit_id' column equals that ID (empty when nothing matches).
    """
    return {
        outfit_id: d.loc[d['outfit_id'] == outfit_id]
        for outfit_id in ids
    }



In [55]:
def printCombinations(combinations):
    """Print every outfit in `combinations`, one line per item.

    `combinations` maps an outfit ID to a DataFrame whose rows expose
    `outfit_item_type`, `product_full_name` and `product_id`.
    """
    for outfit_id in combinations:
        print('For Outfit ID: ', outfit_id)
        for _, item in combinations[outfit_id].iterrows():
            print(item.outfit_item_type, ':', item.product_full_name, '(', item.product_id, ')')
        print('\n')

In [56]:
def getMatchedOutfitsByProductIds(ids, d):
    """Return, for each product ID in `ids`, the outfit ID of its first row in `d`.

    The result is a list of str in the same order as `ids`. Only the first
    matching row is used; an ID with no matching row raises IndexError.
    """
    return [
        str(d.loc[d.product_id == product_id]['outfit_id'].values[0])
        for product_id in ids
    ]

In [57]:
def getDfByBrand(query_tokens, filtered_df):
    """Return the rows of `filtered_df` whose brand contains a queried brand token.

    Parameters
    ----------
    query_tokens : iterable of str
        Lower-case tokens taken from the user's query.
    filtered_df : pandas.DataFrame
        Frame with a string 'brand' column.

    Returns
    -------
    pandas.DataFrame
        Rows whose 'brand' contains, as a whole word, any query token that is
        also in the module-level `brands` token set. An empty frame (same
        columns) when the query mentions no known brand token.
    """
    brand_match = brands.intersection(query_tokens)
    if not brand_match:
        # Without this guard the pattern degenerates to r'\b(?:)\b', which
        # matches at any word boundary and therefore selects almost every row.
        return filtered_df.iloc[0:0]

    # Escape tokens so they are always matched literally; sort for a stable pattern.
    alternatives = '|'.join(re.escape(token) for token in sorted(brand_match))
    pattern = r'\b(?:{})\b'.format(alternatives)
    return filtered_df[filtered_df['brand'].str.contains(pattern)]

In [58]:
#this function tokenizes the query. Checks if any token matches any brand tokens,
#if yes, then filter the dataset by brand.
#The outfit_type filter described in the original notes is not implemented yet
#(see the TODO at the end of the function).
def findSimilarProducts(query, d):
    """Return the product_id of the row in `d` most similar to the free-text `query`.

    Steps: tokenize the query, narrow `d` to rows of a mentioned brand when
    `getDfByBrand` finds any, join the text columns of each row into one
    string, embed rows and query with `getVectors`, and pick the row with the
    highest cosine similarity (first row wins on ties).

    Parameters
    ----------
    query : str
        Free-text product description.
    d : pandas.DataFrame
        Product table; must contain 'product_id' and the columns that
        `getDfByBrand` reads. `d` itself is not modified.

    Raises
    ------
    ValueError
        If there is no row to search.
    """
    query_tokens = [token.lower() for token in re.findall(r"[\w']+", query)]

    # Narrow to the mentioned brand only when that leaves something to search.
    filtered_df = d
    dfByBrand = getDfByBrand(query_tokens, d)
    if len(dfByBrand) > 0:
        filtered_df = dfByBrand
    if len(filtered_df) == 0:
        raise ValueError("findSimilarProducts needs at least one product row to search")

    # astype(str) returns a new frame, so the caller's `d` is never touched;
    # reset_index gives a 0..n-1 index so a positional argmax can be used with .loc.
    filtered_df = filtered_df.astype(str).reset_index()

    ##concatenate all the columns
    # After reset_index column 0 is the old index, so 2:-1 skips it, skips the
    # first original column, and drops the last column.
    # NOTE(review): this assumes the first column is an ID not wanted in the
    # text and the last is the helper `oi_len` column -- confirm against the CSV.
    dataset_list = filtered_df.iloc[:, 2:-1].apply(' '.join, axis=1).tolist()

    ## vectorize the dataset and the query with the same fitted vectorizer
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit(dataset_list)
    dataset_vectors = getVectors(dataset_list, X, vectorizer)
    query_vectors = getVectors([query], X, vectorizer)

    ##find the similarity
    # One (n_rows, 1) call replaces a per-row loop, and avoids float() on a
    # 2-D array, which NumPy has deprecated.
    sim_score = cosine_similarity(dataset_vectors, query_vectors)[:, 0]
    max_row = int(sim_score.argmax())
    return filtered_df.loc[max_row, "product_id"]
    #TODO: add outfit_type filter
    
    

In [59]:
_NLP_MODEL = None  # spaCy pipeline, loaded lazily by _load_nlp()


def _load_nlp():
    """Return the en_core_web_md pipeline, loading it only on first use."""
    global _NLP_MODEL
    if _NLP_MODEL is None:
        _NLP_MODEL = en_core_web_md.load()
    return _NLP_MODEL


def getVectors(data_list, X, vectorizer):
    """Embed each document as the tf-idf-weighted mean of its word vectors.

    Parameters
    ----------
    data_list : list of str
        Documents to embed.
    X : fitted vectorizer
        Object whose `transform(data_list)` returns the tf-idf matrix
        (anything exposing `.toarray()`).
    vectorizer : fitted vectorizer
        Object whose `vocabulary_` maps each term to its tf-idf column.

    Returns
    -------
    list of numpy.ndarray
        One vector of length 300 per document, in input order. A document in
        which no token has both a word vector and a tf-idf weight gets an
        all-zero vector (instead of the NaN vector a 0/0 division would give,
        which makes cosine_similarity raise).
    """
    import numpy as np

    # Width of the accumulated embedding. The original notes pick 300 as the
    # word2vec embedding size; it must match the width of `token.vector`.
    EMBEDDING_SIZE = 300

    # Loading the model is expensive and this function is called twice per
    # query, so the pipeline is cached after the first load.
    nlp = _load_nlp()

    tf_idf_matrix = X.transform(data_list).toarray()
    # `vocabulary_` (term -> column index) is used instead of
    # get_feature_names(), which newer scikit-learn releases no longer provide.
    vocabulary = vectorizer.vocabulary_

    vectors = []
    for idx, doc in enumerate(data_list):
        weighted_sum = np.zeros(EMBEDDING_SIZE)
        total_weight = 0
        for token in nlp(doc):  # spaCy tokenization of the document text
            column = vocabulary.get(token.text.lower())
            # Only tokens with both a pretrained vector and a tf-idf column count.
            if token.has_vector and column is not None:
                weight = tf_idf_matrix[idx, column]
                weighted_sum += weight * token.vector
                total_weight += weight

        # Normalize by the total weight; skip when nothing contributed so the
        # result stays a finite zero vector.
        if total_weight > 0:
            weighted_sum = weighted_sum / total_weight
        vectors.append(weighted_sum)
    return vectors
    




In [60]:
def recommendOutfit(query, d):
    """Print the outfit combination(s) recommended for `query`.

    A query is treated as a product ID when it is a single whitespace-free
    token with at least 26 alphanumeric characters; it is then fuzzy-matched
    against the 'product_id' column. Any other query is treated as free text
    and resolved with `findSimilarProducts`. The outfits containing the
    resulting product(s) are printed with `printCombinations`.

    Parameters
    ----------
    query : str
        Product ID or free-text description.
    d : pandas.DataFrame
        Product table with at least 'product_id' and 'outfit_id' columns.

    Returns
    -------
    None
    """
    refineQueryString = refinedQueryString(query)

    # The refined string contains only alphanumerics, so testing isalnum() on
    # it says nothing; without the single-token check every long free-text
    # query (>= 26 letters/digits in total) would be mistaken for an ID.
    looks_like_id = len(query.split()) == 1 and len(refineQueryString) >= 26

    if looks_like_id:
        #Look for product Id
        # Matches are looked up by 'product_id' below, so the candidates are
        # read from that column by name.
        choices = d['product_id'].tolist()
        # Match on the refined string so stray punctuation in a pasted ID
        # cannot split it into several tokens.
        productIDs = returnFuzzyMatches(refineQueryString, choices)
    else:
        productIDs = [findSimilarProducts(query, d)]

    matchedOutfitIds = set(getMatchedOutfitsByProductIds(productIDs, d))
    combinations = getCombinationsByOutfitIds(matchedOutfitIds, d)
    printCombinations(combinations)
        

In [42]:
# Ask the user for a product ID or a free-text product description.
query=input('input Product ID or any text')

input Product ID or any textfenimore double buckle shoes


In [43]:
# Display the captured query string.
query

'fenimore double buckle shoes'

In [44]:
# Print the outfit combination(s) recommended for the captured query.
recommendOutfit(query,d)

For Outfit ID:  01e6mc52dnja6twpz2v4pb4dps
shoe : fenimore triple buckle boot ( 01e2p0sjskfknqj5svq8md1jzt )
onepiece : darcy dress ( 01e4ehhmc6yp74e9j8qv3fr4cw )
accessory : leather circle crossbody bag ( 01e5zs3r9jd696ywgk9nsg56e1 )
accessory : mm gradient oversize square sunglasses ( 01e5zyhza7186dvwej99q4d2pm )


