In [6]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
originalData = pd.read_csv("C:\\Users\\sures\\Desktop\\clothingData.csv")

In [8]:
originalData.shape

(44446, 10)

## setting up the product data

In [9]:
df = originalData.dropna()

In [10]:
df= df.drop(['gender','year','usage','season'],axis=1)

In [11]:
df.head()

Unnamed: 0,id,masterCategory,subCategory,articleType,baseColour,productName
0,15970,Apparel,Topwear,Shirts,Navy Blue,Turtle Check Men Navy Blue Shirt
1,39386,Apparel,Bottomwear,Jeans,Blue,Peter England Men Party Blue Jeans
2,59263,Accessories,Watches,Watches,Silver,Titan Women Silver Watch
3,21379,Apparel,Bottomwear,Track Pants,Black,Manchester United Men Solid Black Track Pants
4,53759,Apparel,Topwear,Tshirts,Grey,Puma Men Grey T-shirt


In [12]:
duplicate = df[df['productName'].duplicated()]
duplicate

Unnamed: 0,id,masterCategory,subCategory,articleType,baseColour,productName
186,21948,Accessories,Bags,Handbags,Brown,Murcia Women Casual Brown Handbag
230,45603,Footwear,Shoes,Formal Shoes,Black,Arrow Men Black Formal Shoes
349,41861,Footwear,Sandal,Sandals,Black,Estd. 1977 Men Black Sandals
373,58726,Apparel,Topwear,Tshirts,Grey,Puma Men Grey T-shirt
392,16950,Accessories,Eyewear,Sunglasses,Brown,Image Men Sunglasses
...,...,...,...,...,...,...
44427,37431,Accessories,Bags,Handbags,Black,Murcia Women Black Handbag
44432,38479,Apparel,Topwear,Tshirts,Green,Gini and Jony Boys Core Green T-shirt
44433,33091,Footwear,Shoes,Heels,Gold,Catwalk Women Gold Flats
44440,42234,Apparel,Topwear,Tops,Blue,Sepia Women Blue Printed Top


In [13]:
df.shape

(44099, 6)

In [14]:
# finding duplicate products
duplicates = df['productName'].duplicated(keep='first')
duplicates.sum()

13279

In [15]:
# selecting on those rows which has no duplicate productNames
result = df[~duplicates]
result.shape

(30820, 6)

In [16]:
df = result.head(10000)

## setting up the image data

In [18]:
imgData = pd.read_csv("C:\\Users\\sures\\Desktop\\images.csv")
imgData.head(2)

Unnamed: 0,filename,productName,link
0,15970.jpg,Turtle Check Men Navy Blue Shirt,http://assets.myntassets.com/v1/images/style/p...
1,39386.jpg,Peter England Men Party Blue Jeans,http://assets.myntassets.com/v1/images/style/p...


In [19]:
imgData = imgData.drop_duplicates(subset='productName')

In [20]:
imgData = imgData[['link', 'productName']]
result = result[['productName']]

In [21]:
finalImageData = imgData.merge(result, on='productName', how='inner')
finalImageData

Unnamed: 0,link,productName
0,http://assets.myntassets.com/v1/images/style/p...,Turtle Check Men Navy Blue Shirt
1,http://assets.myntassets.com/v1/images/style/p...,Peter England Men Party Blue Jeans
2,http://assets.myntassets.com/v1/images/style/p...,Titan Women Silver Watch
3,http://assets.myntassets.com/v1/images/style/p...,Manchester United Men Solid Black Track Pants
4,http://assets.myntassets.com/v1/images/style/p...,Puma Men Grey T-shirt
...,...,...
30815,http://assets.myntassets.com/v1/images/style/p...,Tantra Women Printed Peach T-shirt
30816,http://assets.myntassets.com/v1/images/style/p...,Gas Men Caddy Casual Shoe
30817,http://assets.myntassets.com/v1/images/style/p...,Lotto Men's Soccer Track Flip Flop
30818,http://assets.myntassets.com/v1/images/style/p...,Puma Men Graphic Stellar Blue Tshirt


In [72]:
df.head(2)

Unnamed: 0,id,masterCategory,subCategory,articleType,baseColour,productName,text
0,15970,Apparel,Topwear,Shirts,Navy Blue,Turtle Check Men Navy Blue Shirt,Navy Blue Shirts
1,39386,Apparel,Bottomwear,Jeans,Blue,Peter England Men Party Blue Jeans,Blue Jeans


In [64]:
# Create a text feature by combining multiple columns
df["text"] = df['baseColour'] + ' ' + df['articleType']

In [70]:
# TF-IDF vectorization 
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['text'])

# Calculate cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [71]:
tfidf_matrix

<10000x196 sparse matrix of type '<class 'numpy.float64'>'
	with 23130 stored elements in Compressed Sparse Row format>

In [66]:
cosine_sim[0]

array([1.        , 0.21559707, 0.        , ..., 0.        , 0.        ,
       0.        ])

## content based filtering

In [67]:
def get_recommendations(product_name, article_type, base_color, cosine_sim=cosine_sim):
    # Create a text combining product name, article type, and base color
    text = f'{base_color} {article_type} {product_name}' # blue shirt puma-shirt
                                                        # blue shirt

    # Get the indices for items matching the text
    matching_indices = df[df["text"] == text].index  

    # Initialize an empty DataFrame to store similar items
    similar_items = pd.DataFrame(columns=['productName'])

    # Check for similar items based on article type
    for index in matching_indices:
        similar_items = similar_items.append(
            df[df['articleType'] == df.at[index, 'articleType']][['productName']], ignore_index=True)

    # Check for similar items based on base color
    for index in matching_indices:
        similar_items = similar_items.append(
            df[df['baseColour'] == df.at[index, 'baseColour']][['productName']], ignore_index=True)

    # If no matches are found based on article type or base color, search for similar product names
    if similar_items.empty:
        # Use text similarity or other matching techniques to find similar items
        product_tokens = product_name.lower().split()

        # Initialize a list to store matching product names
        matching_product_names = []

        # Loop through each product name in the dataset
        for index, row in df.iterrows():
            name_tokens = row['productName'].lower().split()

            # Calculate Jaccard similarity or other similarity metrics
            intersection = len(set(product_tokens) & set(name_tokens))
            union = len(set(product_tokens) | set(name_tokens))

            # Adjust the similarity threshold as needed
            similarity = intersection / union

            # If the similarity is above a threshold and it's not the input product, consider it a match
            if similarity > 0.5 and row['productName'] != product_name:  # Adjust the threshold as needed
                matching_product_names.append(row['productName'])

        # If matching products are found, return them as recommendations
        if matching_product_names:
            return matching_product_names

        # If no similar items are found, return a message indicating no recommendations
        return ["No recommendations found."]

    # Remove the input product name from the list of similar items
    similar_items = similar_items[similar_items['productName'] != product_name]

    # Return the top 10 unique product names as recommendations
    return similar_items['productName'].unique()[:10]


In [68]:
product_name = "Titan Women Silver Watch"
article_type = "Watches"
base_colour = "Silver"

recommended_items = get_recommendations(product_name, article_type, base_colour)

# Remove duplicates by converting the list to a set and back to a list
recommended_items = list(set(recommended_items))

# Print the recommended items
print("Recommended Items:")
for i, item in enumerate(recommended_items):
    if i >= 10:
        break
    print(i,item)

Recommended Items:
0 Titan Men Silver Watch
1 Titan Women White Watch
2 Titan Women Raga Silver Dial Watch
3 Titan Women Gold Watch
4 Titan Women Silver Dial Watch


## KNN based filtering

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Function to get k-NN recommendations
def knn_recommender(user_product_name, article_type, base_colour, k=10):
    # Create a feature vector for the input product
    input_product = f'{base_colour} {article_type} {user_product_name}'
    
    # Fit a k-NN model on the TF-IDF matrix
    knn = NearestNeighbors(n_neighbors=k, metric='cosine', algorithm='brute')
    knn.fit(tfidf_matrix)
    
    # Transform the input product to TF-IDF representation
    input_tfidf = tfidf.transform([input_product])
    
    # Find the k-NN for the input product
    distances, indices = knn.kneighbors(input_tfidf, n_neighbors=k)
    
    # Get the recommended product indices
    recommended_indices = indices[0]
    
    # Get the recommended product names
    recommended_products = df['productName'].iloc[recommended_indices]
    
    return recommended_products


In [28]:
product_name = "Titan Women Silver Watch"
article_type = "Watches"
base_colour = "Silver"


recommended_items = knn_recommender(product_name, article_type, base_colour)

# Print the recommended items
print("Recommended Items:")
for item in recommended_items:
    print(item)

Recommended Items:
Fossil Men Silver-Toned Dial Chronograph Watch CH2592
Fastrack Women Silver Dial Watch
Nautica Men Silver Dial Watch
Fastrack Women Silver Dial Casual Watch
Aspen Women Silver Dial Watch
Fossil Men Silver-Toned Dial Chronograph Watch FS4643
Tommy Hilfiger Women Silver-Toned Dial  Watch TH1780453-D
CASIO SHEEN Women Silver-Toned Dial Watch SX042
Maxima Women Silver Dial Watch
Casio Enticer Men Silver-Gold Analogue Watch MTP-1235SG-7ADF(A359)


In [73]:
def get_combined_recommendations(user_product_name, article_type, base_color, k=10):
    # Get recommendations using content-based filtering
    content_recommendations = get_recommendations(user_product_name, article_type, base_color, cosine_sim=cosine_sim)
    
    # Calculate how many recommendations are needed from k-NN to reach the total of 10
    k_nn_needed = max(k - len(content_recommendations), 1)  # Ensure k_nn_needed is at least 1
    
    # Get recommendations using k-NN
    knn_recommendations = knn_recommender(user_product_name, article_type, base_color, k=k_nn_needed)
    
    # Concatenate the results while ensuring a total of 10 recommendations
    combined_recommendations = list(content_recommendations) + list(knn_recommendations)
    
    return combined_recommendations


In [74]:
from IPython.display import Image, display

product_name = "Jealous 21 Women Purple Shirt"
article_type = "Shirts"
base_color = "Purple"

recommended_items = get_combined_recommendations(product_name, article_type, base_color)


for item in recommended_items:
    link_value = finalImageData.loc[finalImageData['productName'] == item, 'link'].values[0]

    print("Product Name:", item)

    display(Image(url=link_value, width=100, height=100))



Product Name: Jealous 21 Women Uaine Purple Tops


Product Name: Jealous 21 Women Teal Shirt


Product Name: Jealous 21 Women Check Purple Top


Product Name: Jealous 21 Women Check Purple Shirt


Product Name: Jealous 21 Women Pink Shirt


Product Name: Jealous 21 Women Check Blue Shirt


Product Name: Jealous 21 Women Blue Shirt


Product Name: Jealous 21 Women Check Red Shirt


Product Name: Jealous 21 Women Jealous Macrae Purple Top


Product Name: Jealous 21 Women Black Shirt


Product Name: Jealous 21 Women Stripes Purple Sweater


Product Name: Jealous 21 Women Checks Pink Shirt


Product Name: Jealous 21 Women Purple Shirt


In [32]:
# import pickle

In [33]:
# pickle.dump(finalImageData.to_dict(),open('productAndImagesDict.pkl','wb'))

In [34]:
# pickle.dump(finalImageData.to_dict(),open('productAndImagesDict.pkl','wb'))

In [35]:
# articleType_uniqueValues = df['articleType'].unique()
# baseColour_uniqueValues = df['baseColour'].unique()

In [36]:
# articleType_df = pd.DataFrame({'articleType': articleType_uniqueValues})
# baseColour_df = pd.DataFrame({'baseColour': baseColour_uniqueValues})

In [37]:
# pickle.dump(articleType_df.to_dict(),open('articleTypeDict.pkl','wb'))
# pickle.dump(baseColour_df.to_dict(),open('baseColourDict.pkl','wb'))