# Requirements

In [1]:
import zipfile
import numpy as np
import pandas as pd
import os
import glob
import requests
import gzip
from IPython.display import Image
from VGG import get_image_embeddings, get_similarity_score, load_image_bytes

# Unzipping the dataset

In [2]:
# Unpack every member of the downloaded archive into the Datasets folder.
# The archive handle is closed automatically when the `with` block exits.
dataset_archive = zipfile.ZipFile("Datasets/archive.zip", 'r')
with dataset_archive:
    dataset_archive.extractall("Datasets")

# Reading the csv

In [2]:
def load_csv_frames(csv_paths):
    """Read each CSV file into a DataFrame keyed by its bare file name.

    Args:
        csv_paths: Iterable of paths to ``.csv`` files.

    Returns:
        dict mapping the file name without directory or extension
        (e.g. ``"Amazon Fashion"``) to the DataFrame read from that file.
    """
    frames = {}
    for csv_path in csv_paths:
        # basename/splitext work with both "/" and "\\" separators; splitting
        # on "\\" alone leaves the whole path in the key on Linux/macOS.
        file_name = os.path.splitext(os.path.basename(csv_path))[0]
        frames[file_name] = pd.read_csv(csv_path)
    return frames


path = os.path.join(os.getcwd(), "Datasets")
# Sorted so the dict order does not depend on the file system's listing order.
csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))
csv_dfs = load_csv_frames(csv_files)

## Usage

In [6]:
# Frames are keyed by CSV file name without the ".csv" extension;
# render the "Amazon Fashion" table as a usage example.
display(csv_dfs["Amazon Fashion"])

Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price
0,Aqualogica Glow+ Dewy Sunscreen SPF 50 PA+++ F...,stores,Amazon Fashion,https://m.media-amazon.com/images/I/51TSC6Uogx...,https://www.amazon.in/Aqualogica-Sunscreen-Pro...,4.2,3628,₹351,₹399
1,MARVIK Soft Silicone Adjustable Band Strap Com...,stores,Amazon Fashion,https://m.media-amazon.com/images/I/51vYDop04S...,https://www.amazon.in/MARVIK-Silicone-Adjustab...,3.9,291,₹249,₹999
2,108 Panchatantra Stories for Children (Illustr...,stores,Amazon Fashion,https://m.media-amazon.com/images/I/81VJ+MAc7Y...,https://www.amazon.in/108-Panchatantra-Stories...,4.4,2950,₹125,₹160
3,Black and White and Blue: Adult Cinema From th...,stores,Amazon Fashion,https://m.media-amazon.com/images/I/51RIhaW3t1...,https://www.amazon.in/Black-White-Blue-Cinema-...,4.2,8,,₹584.16
4,SAFARI 15 Ltrs Sea Blue Casual/School/College ...,stores,Amazon Fashion,https://m.media-amazon.com/images/I/61kmCas5OC...,https://www.amazon.in/SAFARI-Ltrs-Casual-Backp...,4.0,17985,₹299,₹799
...,...,...,...,...,...,...,...,...,...
2347,"Lakme Lip Love Gelato Lip Balm - Pink, Bubbleg...",stores,Amazon Fashion,https://m.media-amazon.com/images/I/61lc+uAfF0...,https://www.amazon.in/LAKM%C3%89-Love-Gelato-C...,4.1,515,₹148,₹199
2348,SATTVA Classy.Elegant.Stylish Classic XXXL Bea...,stores,Amazon Fashion,https://m.media-amazon.com/images/I/71zaq78pO8...,https://www.amazon.in/Sattva-Classic-Filled-Be...,4.2,10,"₹2,170","₹4,299"
2349,Fastrack Glitch Analog Rose Gold Dial Women's ...,stores,Amazon Fashion,https://m.media-amazon.com/images/I/61B+fqeyzj...,https://www.amazon.in/Fastrack-Glitch-Analog-W...,3.5,16,,"₹5,595"
2350,NAINVISH Women/Girl's Pure Cotton Paisley Prin...,stores,Amazon Fashion,https://m.media-amazon.com/images/I/81qATlsYzk...,https://www.amazon.in/NAINVISH-Cotton-Paisley-...,3.4,318,₹499,"₹2,599"


---
# Saving the image embeddings

### embed the images

In [3]:
def download_image(url, timeout=10):
    """Download the image at ``url`` and return its raw bytes.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.exceptions.RequestException: On a connection failure, a
            timeout, or an HTTP error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of returning the body of an error page.
    response.raise_for_status()
    return response.content

In [4]:
def embed_image_from_url(url):
    """Return the embedding of the image located at ``url``.

    The bytes fetched by ``download_image`` are passed to ``load_image_bytes``
    and the result is passed to ``get_image_embeddings``.
    """
    return get_image_embeddings(load_image_bytes(download_image(url)))

In [17]:
# Make sure the output directory exists (no error if it is already there).
os.makedirs("images", exist_ok=True)

# .keep file to keep the directory in git
with open("images/.keep", 'w') as f:
    pass

# Embed every image of one category and append the arrays, one np.save call
# per image, to a single gzip stream. Opening with 'wb' truncates any file
# left over from a previous run.
with gzip.open('images/images.npy.gz', 'wb') as f:
    file = "Air Conditioners"
    urls = csv_dfs[file]["image"]
    # Not used in this cell; kept in case a later cell references it.
    images_names = [f"{i}.jpg" for i in range(1, len(urls) + 1)]

    # Report progress roughly every 5% of the way. Clamp the step to 1 so that
    # fewer than 20 URLs cannot produce a modulo-by-zero error.
    progress_step = max(1, len(urls) // 20)

    for i, url in enumerate(urls):
        embedding = embed_image_from_url(url)
        np.save(f, embedding)
        if i % progress_step == 0:
            print(f"{i} / {len(urls)}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 424ms/step
0 / 720
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 285ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 366ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 309ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 417ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 363ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 270ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 243ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 273ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 244ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 243ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 280ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 291ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

KeyboardInterrupt: 

### Usage

In [16]:
# Read back the first ten arrays from the gzip stream and print each shape.
# Every np.load call consumes exactly one array written by np.save.
embeddings_stream = gzip.open('images/images.npy.gz', 'rb')
with embeddings_stream:
    loaded_count = 0
    while loaded_count < 10:
        arr = np.load(embeddings_stream)
        print(arr.shape)
        loaded_count += 1

(1, 512)
(1, 512)
(1, 512)
(1, 512)
(1, 512)
(1, 512)
(1, 512)
(1, 512)
(1, 512)
(1, 512)


---
# Preprocessing

### Amazon Fashion

In [8]:
# Clean the "Amazon Fashion" listing. Built as a single chain from the raw
# frame so re-running the cell always starts from csv_dfs and never mutates it.
amazon_fashion_df = (
    csv_dfs["Amazon Fashion"]
    # drop rows with missing values  # TODO: can discount be Nan?
    .dropna()
    # drop duplicates
    .drop_duplicates()
    # keep only rows whose link points at an Amazon domain. Plain substring
    # match (regex=False) on "amazon." so that amazon.in links are kept; the
    # previous "amazon.com" pattern matched none of the amazon.in links.
    # TODO: check if this is necessary
    .loc[lambda frame: frame["link"].str.contains("amazon.", regex=False)]
    # FIXME: dead links cannot be detected from the URL text. The previous
    # filter kept ONLY links containing "404", which is the opposite of the
    # intent, so it is removed; a real check needs an HTTP request per link.
    # drop columns that are not needed
    .drop(columns=["main_category"])
    # normalize the ratings column to [0, 1] (the column is "ratings", not
    # "rating"). to_numeric turns any non-numeric entry into NaN.
    # NOTE(review): assumes ratings are on a 0-5 scale - confirm.
    .assign(
        ratings=lambda frame: pd.to_numeric(frame["ratings"], errors="coerce") / 5.0
    )
)

KeyboardInterrupt: 

---
# Visualizing the data

## Reality Check

## Exploration