In [4]:
import pandas as pd

In [5]:
images_csv_path = '/content/drive/MyDrive/datasets/images.csv'
styles_csv_path = '/content/drive/MyDrive/datasets/styles.csv'

In [6]:
# Read the CSV files into pandas DataFrames and handle lines with more than ten columns
images_df = pd.read_csv(images_csv_path)
styles_df = pd.read_csv(styles_csv_path, warn_bad_lines=True, error_bad_lines=False)

# Remove rows from both DataFrames when styles_df has more than ten columns
styles_df = styles_df.dropna(thresh=10)

# Ensure 'filename' column in images_df is of the same data type as 'id' column in styles_df
images_df['filename'] = images_df['filename'].str.replace('.jpg', '').astype(int)

# Find the common IDs between images_df and styles_df based on the 'filename' and 'id' columns
common_ids = set(images_df['filename']) & set(styles_df['id'])

# Filter images_df to only keep rows with common IDs
images_df = images_df[images_df['filename'].isin(common_ids)]

# Filter styles_df to only keep rows with common IDs
styles_df = styles_df[styles_df['id'].isin(common_ids)]

# Reset the index of both DataFrames after removing rows
images_df = images_df.reset_index(drop=True)
styles_df = styles_df.reset_index(drop=True)

# Display the first few rows of both DataFrames to check the data
print("\nImages DataFrame:")
print(images_df.head())

print("\nStyles DataFrame:")
print(styles_df.head())




  styles_df = pd.read_csv(styles_csv_path, warn_bad_lines=True, error_bad_lines=False)


  styles_df = pd.read_csv(styles_csv_path, warn_bad_lines=True, error_bad_lines=False)
Skipping line 6044: expected 10 fields, saw 11
Skipping line 6569: expected 10 fields, saw 11
Skipping line 7399: expected 10 fields, saw 11
Skipping line 7939: expected 10 fields, saw 11
Skipping line 9026: expected 10 fields, saw 11
Skipping line 10264: expected 10 fields, saw 11
Skipping line 10427: expected 10 fields, saw 11
Skipping line 10905: expected 10 fields, saw 11
Skipping line 11373: expected 10 fields, saw 11
Skipping line 11945: expected 10 fields, saw 11
Skipping line 14112: expected 10 fields, saw 11
Skipping line 14532: expected 10 fields, saw 11
Skipping line 15076: expected 10 fields, saw 12
Skipping line 29906: expected 10 fields, saw 11
Skipping line 31625: expected 10 fields, saw 11
Skipping line 33020: expected 10 fields, saw 11
Skipping line 35748: expected 10 fields, saw 11
Skipping li


Images DataFrame:
   filename                                               link
0     15970  http://assets.myntassets.com/v1/images/style/p...
1     39386  http://assets.myntassets.com/v1/images/style/p...
2     59263  http://assets.myntassets.com/v1/images/style/p...
3     21379  http://assets.myntassets.com/v1/images/style/p...
4     53759  http://assets.myntassets.com/v1/images/style/p...

Styles DataFrame:
      id gender masterCategory subCategory  articleType baseColour  season  \
0  15970    Men        Apparel     Topwear       Shirts  Navy Blue    Fall   
1  39386    Men        Apparel  Bottomwear        Jeans       Blue  Summer   
2  59263  Women    Accessories     Watches      Watches     Silver  Winter   
3  21379    Men        Apparel  Bottomwear  Track Pants      Black    Fall   
4  53759    Men        Apparel     Topwear      Tshirts       Grey  Summer   

     year   usage                             productDisplayName  
0  2011.0  Casual               Turtle Check Men

In [7]:
# Merge the two DataFrames based on 'filename' and 'id' columns and make it the new 'id' column
merged_df = pd.merge(images_df, styles_df, left_on='filename', right_on='id')

# Drop the redundant 'filename' column from the merged DataFrame
merged_df = merged_df.drop(columns=['filename'])

# Rename the merged column as 'id'
merged_df = merged_df.rename(columns={'id_x': 'id'})

# Display the first few rows of the merged DataFrame to check the data
print("\nMerged DataFrame:")
print(merged_df.head())


Merged DataFrame:
                                                link     id gender  \
0  http://assets.myntassets.com/v1/images/style/p...  15970    Men   
1  http://assets.myntassets.com/v1/images/style/p...  39386    Men   
2  http://assets.myntassets.com/v1/images/style/p...  59263  Women   
3  http://assets.myntassets.com/v1/images/style/p...  21379    Men   
4  http://assets.myntassets.com/v1/images/style/p...  53759    Men   

  masterCategory subCategory  articleType baseColour  season    year   usage  \
0        Apparel     Topwear       Shirts  Navy Blue    Fall  2011.0  Casual   
1        Apparel  Bottomwear        Jeans       Blue  Summer  2012.0  Casual   
2    Accessories     Watches      Watches     Silver  Winter  2016.0  Casual   
3        Apparel  Bottomwear  Track Pants      Black    Fall  2011.0  Casual   
4        Apparel     Topwear      Tshirts       Grey  Summer  2012.0  Casual   

                              productDisplayName  
0               Turtle Check

In [8]:
from sklearn.impute import SimpleImputer

In [9]:
missing_values = merged_df.isnull().sum()
print(missing_values)

link                  0
id                    0
gender                0
masterCategory        0
subCategory           0
articleType           0
baseColour            0
season                0
year                  0
usage                 0
productDisplayName    0
dtype: int64


In [10]:
# Assuming you have already read the data into the 'merged_df' DataFrame after merging

# Check the number of unique productDisplayName
num_unique_product_display_names = merged_df['productDisplayName'].nunique()

# Display the result
print("Number of unique id:", num_unique_product_display_names)


Number of unique id: 30806


In [11]:
# Assuming you have already read the data into the 'merged_df' DataFrame after merging

# Check the number of rows in the DataFrame
num_rows = merged_df.shape[0]

# Display the result
print("Number of rows in the DataFrame:", num_rows)


Number of rows in the DataFrame: 44077


In [12]:
# Assuming you have already read the data into the 'merged_df' DataFrame after merging

# Check the number of unique 'link' values
num_unique_links = merged_df['link'].nunique()

# Check the number of rows in the DataFrame
num_rows = merged_df.shape[0]

# Remove rows with repeated 'link' values and keep only the first occurrence
merged_df = merged_df.drop_duplicates(subset='link', keep='first')

# Check the number of rows in the DataFrame after removing duplicates
num_rows_after_removal = merged_df.shape[0]

# Display the updated number of unique 'link' values and rows in the DataFrame
print("Number of unique 'link' values before removal:", num_unique_links)
print("Number of rows in the DataFrame before removal:", num_rows)

print("Number of unique 'link' values after removal:", merged_df['link'].nunique())
print("Number of rows in the DataFrame after removal:", num_rows_after_removal)


Number of unique 'link' values before removal: 44071
Number of rows in the DataFrame before removal: 44077
Number of unique 'link' values after removal: 44071
Number of rows in the DataFrame after removal: 44071


In [13]:
num_rows = merged_df.shape[0]

# Display the result
print("Number of rows in the DataFrame:", num_rows)

Number of rows in the DataFrame: 44071


In [14]:
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4


In [43]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import faiss

# Assuming merged_df is your DataFrame with the provided columns
merged_df['companyName'] = merged_df['productDisplayName'].str.split().str[0]

# Combine text features into a single string for TF-IDF vectorization
merged_df['combined_features'] = merged_df['articleType'] + ' ' + merged_df['baseColour'] + ' ' + merged_df['season'] + ' ' + merged_df['year'].astype(str) + ' ' + merged_df['usage'] + ' ' + merged_df['companyName']

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(merged_df['combined_features'])

# Dimensionality Reduction with Truncated SVD
num_components = 100  # Adjust the number of components based on your memory availability
svd = TruncatedSVD(n_components=num_components)
reduced_tfidf_matrix = svd.fit_transform(tfidf_matrix)

# Indexing with Faiss
index = faiss.IndexFlatIP(num_components)  # Index with Inner Product (cosine similarity)
index.add(reduced_tfidf_matrix)

def get_recommendations(product_name, index, k=6):
    # Get the index of the product
    product_idx = merged_df.index[merged_df['productDisplayName'] == product_name].tolist()[0]

    # Get the query vector
    query_vector = reduced_tfidf_matrix[product_idx]

    # Find the approximate nearest neighbors (including the query itself)
    _, nn_indices = index.search(query_vector.reshape(1, -1), k)

    # Get the product display names and IDs of the recommended products
    recommended_products = [{'name': merged_df.iloc[idx]['productDisplayName'], 'id': merged_df.iloc[idx]['id']} for idx in nn_indices[0][1:]]

    return recommended_products

# Test and Evaluate
# Example usage:
input_product_name = "Titan Women Silver Watch"  # Assuming you want recommendations for this product
recommendations = get_recommendations(input_product_name, index)

# Print the recommended product display names
print(recommendations)


[{'name': 'Titan Men White Dial Watch', 'id': 31835}, {'name': 'Titan Women Silver Dial Watch', 'id': 40543}, {'name': 'Titan Men Silver Watch', 'id': 59236}, {'name': 'Titan Women Silver Watch', 'id': 59262}, {'name': 'Titan Women Silver Watch', 'id': 59263}]


In [44]:
input_product_name = "Turtle Check Men Navy Blue Shirt"  # Assuming you want recommendations for this product
recommendations = get_recommendations(input_product_name, index)

# Print the recommended product display names
print(recommendations)

[{'name': 'U.S. Polo Assn. Men Checks Navy Blue Shirt', 'id': 17158}, {'name': 'U.S. Polo Assn. Men Checks Navy Blue Shirt', 'id': 17170}, {'name': 'U.S. Polo Assn. Men Check Navy Blue Shirt', 'id': 19824}, {'name': 'U.S. Polo Assn. Men Striped Navy Blue Shirt', 'id': 19846}, {'name': 'U.S. Polo Assn. Men Check Navy Blue Shirt', 'id': 19830}]


In [48]:
def get_unique_recommendations(product_name, index, k=10):
    # Get more recommendations using the get_recommendations function
    recommendations = get_recommendations(product_name, index, k)

    # Filter out the input product itself (if it's in the recommendations)
    recommendations = [rec for rec in recommendations if rec['name'] != product_name]

    # Initialize a set to store unique recommendations
    unique_recommendations_set = set()
    unique_recommendations = []

    for rec in recommendations:
        # Check if the recommendation is not the same as the input product and not already in the unique recommendations
        if rec['name'] != product_name and rec['name'] not in unique_recommendations_set:
            unique_recommendations_set.add(rec['name'])
            unique_recommendations.append(rec)

        # Check if we have enough unique recommendations (at least 5)
        if len(unique_recommendations) >= 5:
            break

    return unique_recommendations[:5]

unique_recommendations = get_unique_recommendations(input_product_name, index)
unique_recommendations1 = get_unique_recommendations("Titan Women Silver Watch", index)
print("\nUnique Recommendations (Up to 5):")
print(unique_recommendations)
print("\nUnique Recommendations (Up to 5):")
print(unique_recommendations1)


Unique Recommendations (Up to 5):
[{'name': 'U.S. Polo Assn. Men Checks Navy Blue Shirt', 'id': 17158}, {'name': 'U.S. Polo Assn. Men Check Navy Blue Shirt', 'id': 19824}, {'name': 'U.S. Polo Assn. Men Striped Navy Blue Shirt', 'id': 19846}, {'name': 'Puma Men Motorsport Navy Blue Shirts', 'id': 10051}, {'name': 'United Colors of Benetton Women Check Navy Blue Shirts', 'id': 16250}]

Unique Recommendations (Up to 5):
[{'name': 'Titan Men White Dial Watch', 'id': 31826}, {'name': 'Titan Men Chronograph Silver-Toned Dial Watch NA9322SL02', 'id': 31860}, {'name': 'Titan Women Raga Silver Dial Watch', 'id': 31866}, {'name': 'Titan Women Silver Dial Watch', 'id': 40543}, {'name': 'Titan Men Silver Watch', 'id': 59236}]


In [37]:
def recommend_by_category(category, index, k=6):
    # Get the index of products with the specified category
    product_indices = merged_df[merged_df['articleType'] == category].index.tolist()

    # Find the approximate nearest neighbors for products in the specified category
    _, nn_indices = index.search(reduced_tfidf_matrix[product_indices], k)

    # Flatten the results and remove duplicates
    recommended_product_indices = set(nn_indices.flatten())

    # Get the product display names of the recommended products
    recommended_product_names = [merged_df.iloc[idx]['productDisplayName'] for idx in recommended_product_indices]

    # Filter out the products that are not of the specified category
    recommended_product_names = [name for name in recommended_product_names if merged_df.loc[merged_df['productDisplayName'] == name, 'articleType'].values[0] == category]

    return recommended_product_names[:5]

catrec = recommend_by_category("Jeans", index)
print(catrec)

['Peter England Men Party Blue Jeans', 'Wrangler Men Blue Texas Jeans', 'Jealous 21 Women Black Jeans', 'Puma Women Navy Blue Jeans', 'Wrangler Men Blue Millard Jeans']


In [40]:
import random
def recommend_random_products(index, k=6):
    # Get a random sample of product indices
    random_indices = random.sample(range(len(merged_df)), k)

    # Find the approximate nearest neighbors for the randomly selected products
    _, nn_indices = index.search(reduced_tfidf_matrix[random_indices], k)

    # Flatten the results and remove duplicates
    recommended_product_indices = set(nn_indices.flatten())

    # Get the product display names of the recommended products
    recommended_product_names = [merged_df.iloc[idx]['productDisplayName'] for idx in recommended_product_indices]

    return recommended_product_names

In [42]:
random_recommendations = recommend_random_products(index)
print(random_recommendations)

['ADIDAS Originals Men Vespa PK LO Brown Casual Shoes', 'SDL by Sweet Dreams Men Grey & Red Pyjama Set', 'Wrangler Women Blue Molly Jeans', 'Wrangler Women Blue Giselle Jeans', 'United Colors of Benetton Men Solid DK.Grey Shirts', 'U.S. Polo Assn. Men Striped Blue Shirt', 'ADIDAS Men Daroga Two 11 Lea Brown Casual Shoes', 'ADIDAS Originals Men Court Lounge Brown Casual Shoes', 'U.S. Polo Assn. Men Checks Blue Shirt', 'ADIDAS Men Brown Shoes', 'Tantra Unisex Printed Red Tshirts', 'Wrangler Women Blue Molly Jeans', 'SDL by Sweet Dreams Men Grey & Navy Blue Pyjama Set', 'United Colors of Benetton Men Grey Shirt', 'SDL by Sweet Dreams Men Grey Pyjama Set', 'Wrangler Women Blue Molly Jeans', 'ADIDAS Brown Agora Lea Casual Shoe', 'United Colors of Benetton Men Stripes Grey Shirts', 'SDL by Sweet Dreams Men Grey & Blue Pyjama Set', 'United Colors of Benetton Men Stripes Grey Shirts', 'United Colors of Benetton Men Solid Grey Shirts', 'SDL by Sweet Dreams Men Grey & Navy Blue Pyjama Set', 'U.S