# Using Myntra fashion product details to recommend products

In [None]:
import os
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import pandas as pd 
import spacy 
import requests 
from bs4 import BeautifulSoup
from spacy import displacy
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Path of the Myntra fashion product CSV inside the Kaggle input directory.
db_path = "/kaggle/input/myntra-fashion-product-dataset/Fashion Dataset.csv"
# Load the whole CSV into the DataFrame that the following cells work on.
db = pd.read_csv(db_path)

In [None]:
# Quick look at the column labels and the (rows, columns) shape of the data.
print(f"Columns: {db.columns}")
print(f"Size of database: {db.shape}")

In [None]:
# The first CSV column is loaded as 'Unnamed: 0'; give it a readable name and
# use it as the DataFrame index.
db = db.rename(columns={'Unnamed: 0': 'Index'}).set_index('Index')
db.head()

In [None]:
# Null handling: rows missing a product id or a colour are discarded, every
# other missing value is replaced by 0.
# NOTE(review): fillna(0) also writes the integer 0 into text columns, which
# later string operations may not expect - confirm this is intended.

db.isna().sum()
db = db.dropna(subset=['p_id', 'colour'])
db = db.fillna(0)
db['p_id'] = db['p_id'].astype(int)
db.isna().sum()

In [None]:
# Load spaCy's small English pipeline; later cells use it for stop-word detection.
nlp = spacy.load("en_core_web_sm")
# Let pandas display up to 200 rows before truncating output.
pd.set_option("display.max_rows", 200)

In [None]:
# Remove stop words from item name 

def remove_stop_words(text):
    """Return ``text`` with spaCy stop-word tokens removed.

    Args:
        text: Either a raw string, which is parsed with the module-level
            ``nlp`` pipeline, or an already-parsed document (any iterable of
            tokens exposing ``.text`` and ``.is_stop``), which is used as-is.

    Returns:
        The texts of the remaining tokens joined by single spaces; an empty
        string when nothing remains.
    """
    # Documents streamed through ``nlp.pipe`` are already parsed; sending them
    # through ``nlp`` again would repeat the whole pipeline for every item, so
    # only raw strings are parsed here.
    doc = nlp(text) if isinstance(text, str) else text
    return ' '.join(token.text for token in doc if not token.is_stop)

# Stream the product names through the spaCy pipeline in batches of 50.
docs = nlp.pipe(db['name'], batch_size=50)
# Every document yielded by the pipeline is handed to remove_stop_words.
mod_item_name = [remove_stop_words(doc) for doc in docs]

# Overwrite the original names with their stop-word-free versions.
db['name'] = mod_item_name
db.head()

In [None]:
# Recommend from cosine similarity matrix 

def recommend_cosine(df, item_id, top_n=5, similarity=None):
    """Return the products most similar to ``item_id``.

    Args:
        df: Product DataFrame with a ``'p_id'`` column. Its row order must
            match the row order of the similarity matrix.
        item_id: ``p_id`` of the reference product.
        top_n: Maximum number of similar products to return (default 5).
        similarity: Square pairwise similarity matrix. Defaults to the
            module-level ``similarity_matrix``.

    Returns:
        A DataFrame with up to ``top_n`` rows of ``df`` ordered from most to
        least similar, or ``None`` (after printing a message) when ``item_id``
        is not present in ``df``.
    """
    # Work with the row *position*, not the index label: the similarity
    # matrix and ``DataFrame.iloc`` are both positional, while the index
    # labels stop matching positions once rows are dropped or re-sorted.
    matches = np.flatnonzero(df['p_id'].to_numpy() == item_id)
    if matches.size == 0:
        print(f"Item with p_id {item_id} not found in the DataFrame.")
        return None
    item_pos = matches[0]

    if similarity is None:
        similarity = similarity_matrix

    similarity_scores = np.asarray(similarity[item_pos])
    ranked = similarity_scores.argsort()[::-1]
    # Exclude the query item explicitly instead of assuming it ranks first;
    # an identical product can tie with it at the top of the ranking.
    ranked = ranked[ranked != item_pos]
    return df.iloc[ranked[:max(top_n, 0)]]

In [None]:
# Sanity checks on the cleaned product-name column.
item_name = db['name']
print(f"Number of null values: {item_name.isna().sum()}")
print(f"Shape of matrix {item_name.shape}")
print(f"Number of unique product names: {item_name.nunique()}")

In [None]:
# Remove brand name from item name 

def remove_brand_from_product(row):
    """Strip the brand name out of a product's name.

    Args:
        row: Mapping-like record (for example a DataFrame row) with
            ``'brand'`` and ``'name'`` entries.

    Returns:
        The product name with every case-insensitive occurrence of the brand
        removed and surrounding whitespace trimmed. The name is returned
        unchanged when the brand does not occur in it, or when either field
        is not a string.
    """
    import re  # local import so the function does not rely on cell order

    brand_name = row['brand']
    product_name = row['name']

    # Non-string values (such as a numeric fill value) cannot be matched.
    if not isinstance(brand_name, str) or not isinstance(product_name, str):
        return product_name

    # Detect and remove with the same case-insensitive rule; a case-sensitive
    # replace would leave "NIKE ..." untouched for the brand "Nike".
    brand_pattern = re.compile(re.escape(brand_name), flags=re.IGNORECASE)
    if brand_pattern.search(product_name):
        product_name = brand_pattern.sub('', product_name).strip()
    return product_name

# Apply the brand-stripping function row by row (axis=1) and store the result
# back in the 'name' column.
db['name'] = db.apply(remove_brand_from_product, axis=1)
db.head()

In [None]:
# Create new columns to store colors from each item 

import spacy
import webcolors
import pandas as pd

# Rebind `nlp` to a freshly loaded small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")

# Vocabulary of CSS3 colour names used to spot colour words in product names.
# Older webcolors releases expose the CSS3_NAMES_TO_HEX mapping; newer releases
# no longer provide that constant and offer webcolors.names() instead, so fall
# back to it when the attribute is missing.
try:
    color_names = set(webcolors.CSS3_NAMES_TO_HEX.keys())
except AttributeError:
    color_names = set(webcolors.names("css3"))
def extract_colors(text):
    """Collect the words of ``text`` that are known colour names.

    The text is split on whitespace and each word is compared, lower-cased,
    against the module-level ``color_names`` set. Matching words are returned
    in their original casing and order.

    Returns:
        A list of the matching words, or ``['None']`` (the string, not the
        ``None`` object) when no word matches.
    """
    found = []
    for word in text.split():
        if word.lower() in color_names:
            found.append(word)
    if not found:
        return ['None']
    return found

# Detect the colour words in every product name (one list per row) ...
db['color'] = db['name'].apply(extract_colors)
# ... then flatten each list into a single comma-separated string. str.join is
# used directly because no list_to_comma_separated helper is defined in this
# notebook.
db['color'] = db['color'].apply(', '.join)
db.head()

In [None]:
# Order the catalogue by average rating, breaking ties by rating count, both
# from highest to lowest.
db = db.sort_values(by=['avg_rating', 'ratingCount'], ascending=False)
db.head()

In [None]:
# Columns to discard. The list only holds an empty-string placeholder, which
# is not a real column; errors='ignore' makes drop() skip unknown labels
# instead of raising KeyError.
columns = ['']
db = db.drop(columns=columns, errors='ignore')

In [None]:
# Build the feature matrix for similarity search: one-hot encoded text columns
# placed to the right of the raw numeric columns.
# NOTE(review): price and avg_rating enter unscaled next to 0/1 indicator
# columns, so they may dominate the cosine similarity - confirm this is wanted.
# NOTE(review): the dense one-hot output has one column per distinct value of
# each encoded field; check memory use on the full dataset.
encoder = OneHotEncoder(sparse_output=False)
encoder_value = encoder.fit_transform(
    db[['brand', 'color', 'name', 'p_attributes', 'description']]
)
encoder_numeric = db[['price', 'avg_rating']].values
features_net = np.concatenate((encoder_numeric, encoder_value), axis=1)
# Pairwise cosine similarity between all rows of the feature matrix.
similarity_matrix = cosine_similarity(features_net)

In [None]:
# Recommend products similar to a reference product. The variable keeps its
# historical name `top3`; recommend_cosine decides how many rows come back.

top3 = recommend_cosine(db,16200858)
# recommend_cosine returns None (after printing a message) for an unknown
# product id, so only index into the result when there is one.
if top3 is not None:
    print(top3['p_id'],top3['name'])

Drawing inferences from the above data

In [None]:
# Top Rated Products: the ten rows with the highest average rating.

top_rated_products = db.sort_values(by='avg_rating', ascending=False).head(10)
print("Top Rated Products:\n", top_rated_products[['name', 'brand', 'avg_rating', 'price']])

# Horizontal bar chart of the average rating per product name.
# NOTE(review): `plt` is not imported in any visible cell; this cell needs
# `import matplotlib.pyplot as plt` to have been run beforehand.
plt.figure(figsize=(10, 6))
plt.barh(top_rated_products['name'], top_rated_products['avg_rating'], color='skyblue')
plt.xlabel('Average Rating')
plt.title('Top Rated Products')
plt.gca().invert_yaxis()  # Invert the y-axis so the first (highest-rated) row is drawn at the top
plt.show()

In [None]:
# High sale products: the ten rows with the largest ratingCount, i.e. the
# number of ratings is used here as the stand-in for sales volume.

high_sale_products = db.sort_values(by='ratingCount', ascending=False).head(10)
print("High Sale Products:\n", high_sale_products[['name', 'brand', 'ratingCount', 'price']])

In [None]:
# Average Rating by Brand: mean avg_rating per brand, ten highest means.
# The product DataFrame in this notebook is named `db`; no `data` variable is
# defined in the visible cells.

avg_rating_by_brand = db.groupby('brand')['avg_rating'].mean().sort_values(ascending=False).head(10)
print("Average Rating by Brand:\n", avg_rating_by_brand)

# Instagram dataset

- Data scraped from popular Myntra-run Instagram pages
- Extraction of the number of likes per product to get a general trend

In [None]:
import os
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
# Path of the Excel workbook with the scraped Instagram posts (Kaggle input directory).
insta_path = "/kaggle/input/instagram-posts/myntra_instagram_dataset.xlsx"

In [None]:
# Open the workbook inside a context manager so its file handle is closed as
# soon as the needed sheet has been read.
with pd.ExcelFile(insta_path) as insta_df:
    # Names of all sheets in the workbook, kept for inspection.
    sheet_names = insta_df.sheet_names
    # Only the sheet called "Data" is loaded.
    insta_db = pd.read_excel(insta_df, sheet_name="Data")
insta_db.head()
insta_db.columns

In [None]:
# Data cleaning: discard the columns listed below from the Instagram data.

columns = ['locationName','commentsCount','ownerUsername','productType']
insta_db = insta_db.drop(columns=columns)
insta_db.head()

In [None]:
# Remove null values: drop every row that has a missing value in any column,
# then show the per-column null counts as a check.

insta_db = insta_db.dropna()
insta_db.isna().sum()

In [None]:
# Add an empty 'product_id' column; a later cell fills it from the post text.
insta_db['product_id'] = None 
insta_db.head()

In [None]:
# Extract product_id per record from caption

import re

# A product code is taken to be a standalone run of six or more digits.
product_code_pattern = re.compile(r'\b\d{6,}\b')

# Columns are addressed by name rather than by position so the extraction keeps
# working if the column order changes.
# NOTE(review): assumes the text column is called 'caption' (the column that is
# dropped in a later cell) - confirm against the workbook.
# All codes found in one caption are stored as a single ', '-separated string;
# captions without a code yield an empty string.
insta_db['product_id'] = insta_db['caption'].apply(
    lambda caption: ', '.join(product_code_pattern.findall(caption))
)

In [None]:
# Remove the 'url' and 'caption' columns from the Instagram data.
columns = ['url','caption']
insta_db = insta_db.drop(columns=columns)
insta_db.head()
insta_db.tail()

In [None]:
# Keep only the posts whose product_id is not the empty string.
insta_db = insta_db[insta_db['product_id'] != '']

In [None]:
# Final Instagram table: one row per (post, product id) with the post's likes.

# Split the ', '-separated id strings into lists and give every id its own row.
insta_updated = (
    insta_db
    .assign(product_id=insta_db['product_id'].str.split(', '))
    .explode('product_id')
    .reset_index(drop=True)
)
insta_updated['likesCount'] = insta_updated['likesCount'].astype(int)
# Most-liked rows first; afterwards every -1 in the frame is replaced by 0.
insta_updated = (
    insta_updated
    .sort_values(by='likesCount', ascending=False)
    .replace(-1, 0)
)
insta_updated.head()