## Overview

### Gemini

Gemini is a family of generative AI models developed by Google DeepMind that is designed for multimodal use cases. The Gemini API gives you access to the Gemini Pro Vision and Gemini Pro models.

### Vertex AI Gemini API

- **Gemini 1.0 Pro Vision model** (`gemini-1.0-pro-vision`): Supports multimodal prompts. You can include text, images, and video in your prompt requests and get text or code responses.

### Install Vertex AI SDK for Python


In [1]:
!pip install ray



In [2]:
!pip install google-cloud-aiplatform
!pip install boto3
!pip install s3fs

Collecting botocore<1.36.0,>=1.35.31
  Using cached botocore-1.35.33-py3-none-any.whl (12.6 MB)
Installing collected packages: botocore
  Attempting uninstall: botocore
    Found existing installation: botocore 1.35.23
    Uninstalling botocore-1.35.23:
      Successfully uninstalled botocore-1.35.23
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.15.1 requires botocore<1.35.24,>=1.35.16, but you have botocore 1.35.33 which is incompatible.[0m[31m
[0mSuccessfully installed botocore-1.35.33
Collecting botocore<1.35.24,>=1.35.16
  Using cached botocore-1.35.23-py3-none-any.whl (12.6 MB)
Installing collected packages: botocore
  Attempting uninstall: botocore
    Found existing installation: botocore 1.35.33
    Uninstalling botocore-1.35.33:
      Successfully uninstalled botocore-1.35.33
[31mERROR: pip's dependency resolver does not curren

In [3]:
import ray
import os
import pandas as pd
import s3fs
import sys
import json
import boto3
from io import StringIO
import datetime
import time
import vertexai
from vertexai.generative_models import GenerativeModel, Image, Part

  from .autonotebook import tqdm as notebook_tqdm
2024-10-04 08:40:21,791	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


# Parallelisation

# Authentication

In [4]:

def set_gcloud_adc_env():
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "abg-intelact-genai-platform-0c51beb9d67c.json"
    print("Successfully set environment credentials.")


In [5]:
with open('s3connection.json', 'r') as openfile:
    pwd = json.load(openfile)

# Storage Utility

In [6]:

def search_file(filename, search_path):
    """Searches for a file in the specified path and its subdirectories."""

    for root, dirs, files in os.walk(search_path):
        if filename in files:
            return os.path.join(root, filename)
    return None


def S3_setup(pwd):
    s3 = s3fs.S3FileSystem(anon=False,key=pwd['key'],secret=pwd['secret'])
    s3bucket="oab-aw-data-insights-s3"
    return s3bucket, pwd


def storeS3(df, s3bucket, s3pwd, s3path, s3filename):

    session = boto3.Session(aws_access_key_id = pwd['key'], aws_secret_access_key = pwd['secret'])
    s3_resource = session.resource('s3')
    csv_buffer = StringIO()
    bk  = s3bucket
    obj = s3path+s3filename

    df.to_csv(csv_buffer, index = False)
    s3_resource.Object(bk , obj).put(Body=csv_buffer.getvalue())

    print('Success!')
    pathstring1= 'https://ap-south-1.console.aws.amazon.com/s3/object/oab-aw-data-insights-s3?region=ap-south-1&bucketType=general&prefix='
    pathstring2= obj
    pathstring3= '&showversions=false'

    link = pathstring1+pathstring2+pathstring3
    return link


# Cleaning Utility

In [7]:


def dataCleaning(df):

    """
    ------------------------------------------------------------
    FILL NA WITH 0 VALUE FOR ALL COLUMNS
    ------------------------------------------------------------
    """
    df.fillna(0, inplace=True)


    """
    ------------------------------------------------------------
    GET DATA TYPE FOR COLUMNS
    ------------------------------------------------------------
    """
    dtype_df=pd.DataFrame(columns=['column_name','data_type'])
    for c in df.columns:
        _tempdf = pd.DataFrame({'column_name':[c],'data_type':[df[c].dtypes] })
        dtype_df = pd.concat([dtype_df, _tempdf], ignore_index=True)


    """
    ------------------------------------------------------------
    GET LIST OF STRING COLUMNS FROM THE DATA
    ------------------------------------------------------------
    """

    DIM_COLS = list(dtype_df[dtype_df.data_type=='object']['column_name'].unique())


    """
    ------------------------------------------------------------
    CLEAN THE STRING COLUMNS, REPLACE COMMA BY PIPE OPERATOR
    ------------------------------------------------------------
    """

    CLEAN_COLS = []

    for c in df.columns:
        cnt=0
        if c in DIM_COLS:
            df[c] = df[c].astype(str)
            for v in df[c].unique():
                if v.find(",")>-1:
                    cnt=cnt+1

        if cnt>0:
            CLEAN_COLS.append(c)

    for c in CLEAN_COLS:
        df[c] = df[c].str.replace(",","|")

    return df


# GCP project Credentials

In [8]:
def project_setup():

    # Set Google Cloud project information and initialize Vertex AI SDK

    PROJECT_ID = "abg-intelact-genai-platform"
    # LOCATION = "us-central1"

    # vertexai.init(project=PROJECT_ID, location=LOCATION)
    vertexai.init(project=PROJECT_ID)
    multimodal_model = GenerativeModel("gemini-1.0-pro-vision")

    return multimodal_model

# Image data Utility functions

In [9]:
import http.client
import typing
import urllib.request

import IPython.display
from PIL import Image as PIL_Image
from PIL import ImageOps as PIL_ImageOps


def display_images(
    images: typing.Iterable[Image],
    max_width: int = 600,
    max_height: int = 350,
) -> None:
    for image in images:
        pil_image = typing.cast(PIL_Image.Image, image._pil_image)
        if pil_image.mode != "RGB":
            # RGB is supported by all Jupyter environments (e.g. RGBA is not yet)
            pil_image = pil_image.convert("RGB")
        image_width, image_height = pil_image.size
        if max_width < image_width or max_height < image_height:
            # Resize to display a smaller notebook image
            pil_image = PIL_ImageOps.contain(pil_image, (max_width, max_height))
        IPython.display.display(pil_image)


def get_image_bytes_from_url(image_url: str) -> bytes:
    with urllib.request.urlopen(image_url) as response:
        response = typing.cast(http.client.HTTPResponse, response)
        image_bytes = response.read()
    return image_bytes


def load_image_from_url(image_url: str) -> Image:
    image_bytes = get_image_bytes_from_url(image_url)
    return Image.from_bytes(image_bytes)


def load_image_bytes_from_url(image_url: str) -> Image:
    image_bytes = get_image_bytes_from_url(image_url)
    return image_bytes

def get_url_from_gcs(gcs_uri: str) -> str:
    # converts GCS uri to url for image display.
    url = "https://storage.googleapis.com/" + gcs_uri.replace("gs://", "").replace(
        " ", "%20"
    )
    return url


def print_multimodal_prompt(contents: list):
    """
    Given contents that would be sent to Gemini,
    output the full multimodal prompt for ease of readability.
    """
    for content in contents:
        if isinstance(content, Image):
            display_images([content])
        elif isinstance(content, Part):
            url = get_url_from_gcs(content.file_data.file_uri)
            IPython.display.display(load_image_from_url(url))
        else:
            print(content)



In [10]:
import signal
import time


class TimeoutExpired(Exception):
    pass

def alarm_handler(signum, frame):
    raise TimeoutExpired



In [11]:
def get_gemini_response(image_url, prompt_string1):
    image = load_image_from_url(image_url)
    prompt = "Give probability score on these eight parameteres - " + prompt_string1 +" in json format."
    contents = [image,prompt]
    responses = multimodal_model.generate_content(contents, stream=True)
    # print_multimodal_prompt(contents)
    jsonData=response_to_json(responses)
    return jsonData

def get_gemini_organic_response(image_url, prompt_string2):

    image = load_image_from_url(image_url)
    prompt = prompt_string2
    contents = [image,prompt]
    responses = multimodal_model.generate_content(contents, stream=True)

    _sentence = ""
    for r in responses:
        _sentence = _sentence+ " " + str(r.text)
    return _sentence


def get_gemini_product_type_response(image_url, prompt_string3):
    image = load_image_from_url(image_url)
    prompt = prompt_string3
    contents = [image,prompt]
    responses = multimodal_model.generate_content(contents, stream=True)

    _sentence = ""
    for r in responses:
        _sentence = _sentence+ " " + str(r.text)
    return _sentence

def get_gemini_product_type_response(image_url, prompt_string3):
    image = load_image_from_url(image_url)
    prompt = prompt_string3
    contents = [image, prompt]
    responses = multimodal_model.generate_content(contents, stream=True)
    # print_multimodal_prompt(contents)
    _sentence = ""
    for r in responses:
        _sentence = _sentence+ " " + str(r.text)
    return _sentence



def get_gemini_multiPrompt_response(image, _prompt):
    contents = [image, _prompt]
    responses = multimodal_model.generate_content(contents, stream=True)
    _sentence = ""
    for r in responses:
        _sentence = _sentence+ " " + str(r.text)
    return _sentence



def get_gemini_flash_1_5_multiPrompt_response(image, _prompt):
    response = multimodal_model.generate_content(image +[_prompt], stream=False)
    return response.text


## Read Data

In [12]:
def load_and_prepare_data(filename, pwd):

    s3 = s3fs.S3FileSystem(anon=False,key=pwd['key'],secret=pwd['secret'])
    s3bucket="oab-aw-data-insights-s3"

    try:
        df = pd.read_csv(filename)
    except:
        df = pd.read_csv(s3.open(os.path.join(filename),'rb'), encoding='Latin-1')

    print(df.Brand.value_counts().reset_index())
    _brands = list(df.Brand.unique())

    # _brands = ['Golden Goose', 'Autry']
    # df0 = df[df.Brand == _brands[0]].head(20)
    # df1 = df[df.Brand == _brands[1]].head(20)
    # df = pd.concat([df0, df1])

    # df0 = df[df.Brand == _brands[0]].head(100)
    # df1 = df[df.Brand == _brands[1]].head(100)
    # df2 = df[df.Brand == _brands[2]].head(100)
    # df3 = df[df.Brand == _brands[3]].head(100)
    # df4 = df[df.Brand == _brands[4]].head(100)
    # df = pd.concat([df0, df1, df2, df3, df4])

    df = df.reset_index()
    df.drop(columns='index', inplace=True)

    print(df.shape)
    print(df.Brand.value_counts().reset_index())

    print(df.head(3))

    return df

# Transformation and storage

In [13]:
def transform_and_store_output(dfmain, pwd, all_product_output_list, s3path, filename ):

    df_ = pd.json_normalize(all_product_output_list).T.reset_index()
    df_.columns=['Product Image', 'attr']
    df_.head(3)


    newcolumns = ['Brand',
                'Collaborations',
                'Limited_Edition',

                'Product_details',

                'Colors',
                'Design_Elements',
                'Influence',

                'Price_Range',
                'Positioning',

                'Consumer_Demographics',
                'Consumer_Lifestyle_Preference',
                'Consumer_Fashion_Sense',

                'Occassion_or_Context',

                'Heritage',
                'Exclusivity'
              ]

    df_[newcolumns] = df_['attr'].to_list()
    df_.drop(columns='attr', inplace=True)

    dfmain = pd.merge(dfmain, df_, on='Product Image', how='left')

    print(filename)
    dfmain.to_csv(filename, index=False)

    dfmain = dataCleaning(dfmain)

    s3bucket, pwd = S3_setup(pwd)

    s3DirectAccessLink = storeS3(dfmain, s3bucket=s3bucket, s3pwd = pwd, s3path=s3path, s3filename= filename)
    print("Link:", s3DirectAccessLink)



In [14]:
def store_ImageByte(dfmain, pwd, s3path, filename ):

    s3bucket, pwd = S3_setup(pwd)
    s3DirectAccessLink = storeS3(dfmain, s3bucket=s3bucket, s3pwd = pwd, s3path=s3path, s3filename= filename)
    print("Link:", s3DirectAccessLink)


In [15]:


def get_gemini_output(u):

    one_product_output_list = []
    to=0

    try:
        timeout = 200
        signal.signal(signal.SIGALRM, alarm_handler)
        signal.alarm(timeout)

        _image = load_image_from_url(u)

        if i%10 == 0 and to==0:
            time.sleep(10)

        for _prompt in prompt_string_list:
            time.sleep(5)
            try:
                model_output = get_gemini_multiPrompt_response(_image, _prompt)
                print("\n",model_output)
                one_product_output_list.append(model_output)
                to=0

            except:
                model_output = 'ResourceExhausted'
                one_product_output_list.append(model_output)
                print('ResourceExhausted Error')
                time.sleep(5)

    except TimeoutExpired:
        print("Timeout occurred")
        to=1
        one_product_output_list = ['TimeOut']*len(prompt_string_list)

    finally:
        signal.alarm(0)


    return one_product_output_list


In [16]:
set_gcloud_adc_env()

Successfully set environment credentials.


# Prompts for Gemini

In [17]:




# prompt_string_list = [
#     "Analyze the product image and describe from a fashion expert perspective in less than 100 words.",
#     "Analyze the product image and describe it in exactly five keywords on design of this product from fashion expert perspective, seperated by semicolon",
#     "Analyze the product image and return category (for example : shoes), sub_category (for example : sneakers), product_type (for example : high tops), gender (e.g. Male or Female), all colours ( e.g. red, white, blue), dominant colour (e.g. red), recent selling price range of the product in GBP currency (e.g. 100-130 GBP),  best season to use this product, material composition of this product,  seperated by semi colon.",
#     "Analyze the product image and describe what majority of customers like about this product",
#     "Analyze the product image and explain if this product is in a declining trend or increasing trend",
#     "Analyze the product image and tell who is the target customer for this"
# ]

# prompt_string_list = [
#     "Analyze the product image and describe from a fashion expert perspective in less than 100 words.",
#     "Analyze the product image and describe it in exactly five keywords on design of this product from fashion expert perspective, seperated by semicolon",
#     "Analyze the product image and return category, sub_category, product_type, gender, all colours, dominant colour, recent selling price range of the product in GBP currency,  best season to use this product, material composition of this product, is an occasion or casual wear, first ever release year month, seperated by semi colon.",
#     "Please analyse the product image and just provide if there were any collaboration associated to the brand of this product with any sports, celebrity or organisation in one sentence",
#     "Please analyze the product image, demographics of the target customer of this product and return Gender, Age group, profession, Income group, Region, fashion preference, Social media preference, Sports preference, seperated by semi colon."
# ]

prompt_string_list = [

    "Identify the brand",
    "Highlight any known heritage, including significant collaborations (e.g., with designers or other brands) or endorsements (celebrity or influencer affiliations) tied to the product",
    "Mention if the product is part of any special collections or limited editions",

    "Determine the product category (e.g., apparel, footwear, accessories), specify the sub-category within the category (e.g., sneakers, handbags, jackets), define the exact product type (e.g., high-top sneakers, leather backpack, bomber jacket)",

    "Describe the product’s dominant color and any secondary colors",
    "Highlight key design elements, such as patterns, materials, textures, or unique features (e.g., embellishments, stitching, logos)",
    "Assess whether the product's design reflects any cultural, regional, or symbolic influence (e.g., traditional patterns, local craftsmanship, or region-specific designs)",

    "Based on the brand, product type, and potential collaborations or endorsements, suggest a tentative price range in USD",
    "Evaluate the product’s market positioning (e.g., luxury, premium, mass-market, niche)",

    "Suggest the likely age group, gender, and socioeconomic background of the target consumer",
    "Describe the typical lifestyle preferences of the consumer (e.g., urban, rural, professional, creative, athletic)",
    "Define the consumer’s fashion sense (e.g., casual, formal, trendy, avant-garde, luxury)",

    "Suggest specific occasions or contexts where this product could be worn or used by the target consumer like Social events: (e.g., parties, weddings, or dinners) or Professional settings: (e.g., work meetings, corporate events) or Casual outings: (e.g., coffee dates, shopping trips) or Seasonal use: (e.g., summer vacations, winter getaways) or Sports or activities: (e.g., gym, hiking, outdoor events) or Special or cultural occasions: (e.g., festivals, religious ceremonies) or Travel or leisure: (e.g., holidays, weekend trips, casual city strolls)",

    "Assess whether the product has any historical significance or represents part of a heritage collection",
    "Identify if the product is associated with exclusivity through limited editions, collaborations, or rare releases that might elevate its appeal to collectors or fashion enthusiasts"

    ]


prompt_string_list = [

    "Determine the product category (e.g., apparel, footwear, accessories), specify the sub-category within the category (e.g., sneakers, handbags, jackets), define the exact product type (e.g., high-top sneakers, leather backpack, bomber jacket)",
    "Describe the product’s dominant color and any secondary colors",
    "Evaluate the product’s market positioning (e.g., luxury, premium, mass-market, niche)",
    "Suggest the likely age group, gender, and socioeconomic background of the target consumer"
    ]


# Main 1

In [28]:
multimodal_model = project_setup()
url = 'Product Image'

filename = 's3://oab-aw-data-insights-s3/datarepo/transformed/testing/brand_d2c/brandD2C_marketplace_gemini_ai_2024-10-03.csv'
# filename = 'ecommerce_marketplace_subset_gemini_ai_2024-09-29.csv'
df = load_and_prepare_data(filename, pwd)

                 index  Brand
0                  APL   1425
1  Christian Louboutin    949
2                Sorel    875
3            Represent    820
4                Amiri    734
5                Autry    641
6            Cole Haan    567
7                 Veja    560
(6571, 33)
                 index  Brand
0                  APL   1425
1  Christian Louboutin    949
2                Sorel    875
3            Represent    820
4                Amiri    734
5                Autry    641
6            Cole Haan    567
7                 Veja    560
  Country                        Website  \
0      UK  https://www.autry-usa.com/en/   
1      UK  https://www.autry-usa.com/en/   
2      UK  https://www.autry-usa.com/en/   

                                         Product URL   Product Code  \
0  https://www.autry-usa.com/en/woman/apparel/bot...  8056713266648   
1  https://www.autry-usa.com/en/woman/apparel/top...  8056713265498   
2  https://www.autry-usa.com/en/man/apparel/top/a...  80567

In [29]:
footwear_category_list = ['footwear', 'sneakers', 'shoes', 'new arrivals woman', 'new arrivals man']

In [30]:
df['Product Category'].str.lower().value_counts(dropna=False)

df.shape

df = df[df['Product Category'].str.lower().isin(footwear_category_list)]

df.shape

df['Product Category'].str.lower().value_counts(dropna=False)

df.columns

Index(['Country', 'Website', 'Product URL', 'Product Code', 'Product Name',
       'Product Category', 'Product sub Category', 'Product type',
       'Product Image', 'Brand', 'Colorway', 'Availability', 'Gender',
       'Price Currency', 'MRP', 'Price', 'Discount', 'Size', 'Release Date',
       'Tags', 'Description', 'attributes', 'Material', 'Country of origin',
       'Ratings', 'Reviews', 'Vendor', 'Images', 'rank', 'Crawl_Date',
       'data_name', 'Release_Month', 'datestored'],
      dtype='object')

# Main 2

In [31]:
df.Brand.value_counts(dropna=False)

APL                    1423
Sorel                   875
Veja                    560
Christian Louboutin     489
Cole Haan               402
Autry                   327
Amiri                   133
Represent                95
Name: Brand, dtype: int64

In [32]:
BR = list(df.Brand.unique())
BR

['Autry',
 'Amiri',
 'Christian Louboutin',
 'Veja',
 'APL',
 'Cole Haan',
 'Represent',
 'Sorel']

In [33]:
df_URL = df[~(df.Brand.isin(['Autry', 'APL']))][['Product Image']]

In [34]:
df_URL.head(3)

Unnamed: 0,Product Image
650,https://amiri.com/cdn/shop/files/d7dc9e74-a124...
676,https://amiri.com/cdn/shop/products/6_Pre-SS24...
677,https://amiri.com/cdn/shop/products/9_Pre-SS24...


In [35]:
df_URL = df_URL.reset_index()
df_URL.drop(columns='index', inplace=True)

In [36]:
df_URL.shape

(2554, 1)

In [37]:
def timeout_for_loading_Image(u):

    for i in range(3):

        try:
            timeout = 15
            signal.signal(signal.SIGALRM, alarm_handler)
            signal.alarm(timeout)
            _imagebytes = load_image_bytes_from_url(u)
            break;

        except TimeoutExpired:
            print('T'+str(i))
            _imagebytes = ''

        finally:
            signal.alarm(0)

    return _imagebytes


In [38]:
URL = list(df_URL[url].unique())
len(URL)

2554

In [39]:
ray.shutdown()
ray.init()
print(ray.available_resources()['CPU'])
ray.init(num_cpus=6, ignore_reinit_error=True)
print(ray.available_resources()['CPU'])

2024-10-04 08:42:34,708	INFO worker.py:1786 -- Started a local Ray instance.
2024-10-04 08:42:35,832	INFO worker.py:1619 -- Calling ray.init() again after it has already been called.


8.0
8.0


In [40]:
@ray.remote
def get_data_parallel(u):
    try:
        _imagebytes = timeout_for_loading_Image(u)
    except:
        _imagebytes = 'Forbidden_403_error'

    return _imagebytes

In [41]:
IMAGE_byte = [get_data_parallel.remote(u) for u in URL]
IMAGE_byte_data = ray.get(IMAGE_byte)

# Gemini Response

In [227]:
ray.shutdown()
ray.init()
print(ray.available_resources()['CPU'])
ray.init(num_cpus=6, ignore_reinit_error=True)
print(ray.available_resources()['CPU'])

2024-10-04 15:18:48,653	INFO worker.py:1786 -- Started a local Ray instance.
2024-10-04 15:18:49,936	INFO worker.py:1619 -- Calling ray.init() again after it has already been called.


8.0
8.0


In [43]:
@ray.remote
def get_Gemini_response_using_ImageByte(_imageByte):

    _image = Image.from_bytes(_imageByte)
    one_product_output_list = []

    for _prompt in prompt_string_list:
        time.sleep(4)
        try:
            model_output = get_gemini_multiPrompt_response(_image, _prompt)
            one_product_output_list.append(model_output)
        except:
            one_product_output_list.append('unknown_error')

    return one_product_output_list


In [44]:
all_product_output_list = [get_Gemini_response_using_ImageByte.remote(ib) for ib in IMAGE_byte_data]

In [45]:
GEMINI_response = ray.get(all_product_output_list)

# S3 Storage

In [46]:
df_result = pd.DataFrame({'URL':URL, 'GeminiResponse':GEMINI_response})

In [47]:
df_result.shape

(2554, 2)

In [48]:
df_result.tail(3)

Unnamed: 0,URL,GeminiResponse
2551,https://columbia.scene7.com/is/image/ColumbiaS...,"[ The product is footwear, more specifically ..."
2552,https://columbia.scene7.com/is/image/ColumbiaS...,"[ The product is footwear, and the sub-catego..."
2553,https://columbia.scene7.com/is/image/ColumbiaS...,[ - Category: Footwear\n- Sub-category: Clogs...


In [49]:

newcolumns = ['Brand',
            'Collaborations',
            'Limited_Edition',

            'Product_details',

            'Colors',
            'Design_Elements',
            'Influence',

            'Price_Range',
            'Positioning',

            'Consumer_Demographics',
            'Consumer_Lifestyle_Preference',
            'Consumer_Fashion_Sense',

            'Occassion_or_Context',

            'Heritage',
            'Exclusivity'
          ]


newcolumns = [
            'Product_details',
            'Colors',
            'Positioning',
            'Consumer_Demographics'
          ]

df_result[newcolumns] = df_result['GeminiResponse'].to_list()
df_result.drop(columns='GeminiResponse', inplace=True)

In [50]:

dt = '2024-10-04'
filename = 'Brand_LinePlan_Gemini_Prompt_Response_1'+dt+'.csv'
s3path = "datarepo/gemini_vision_pro_output/testing/ecommerce/"

s3bucket, pwd = S3_setup(pwd)
s3DirectAccessLink = storeS3(df_result, s3bucket=s3bucket, s3pwd = pwd, s3path=s3path, s3filename= filename)
print("Link:", s3DirectAccessLink)


Success!
Link: https://ap-south-1.console.aws.amazon.com/s3/object/oab-aw-data-insights-s3?region=ap-south-1&bucketType=general&prefix=datarepo/gemini_vision_pro_output/testing/ecommerce/Brand_LinePlan_Gemini_Prompt_Response_12024-10-04.csv&showversions=false


In [51]:
ray.shutdown()

# Code Ends

In [54]:
df_result.shape

(2554, 5)

In [53]:
df_result[df_result.Product_details=='unknown_error'].shape

(515, 5)

In [55]:
df.shape

(4304, 33)

In [56]:
df = pd.merge(df, df_result, left_on='Product Image', right_on='URL', how='inner')
df.head(3)

Unnamed: 0,Country,Website,Product URL,Product Code,Product Name,Product Category,Product sub Category,Product type,Product Image,Brand,...,rank,Crawl_Date,data_name,Release_Month,datestored,URL,Product_details,Colors,Positioning,Consumer_Demographics
0,UK,https://amiri.com/en-gb/,https://amiri.com/en-gb/products/skel-top-low-...,6896740073537,SKEL-TOP LOW - BLACK/WHITE,Footwear,Footwear,MENS CORE COLLECTION FOOTWEAR,https://amiri.com/cdn/shop/files/d7dc9e74-a124...,Amiri,...,10,20240926,amiri_uk_report_09282024.csv,2024-05,2024-10-03,https://amiri.com/cdn/shop/files/d7dc9e74-a124...,Product category: Footwear\nProduct sub-cate...,The dominant color is white. The secondary c...,The product is a pair of sneakers. It is a l...,The target consumer is likely to be a young ...
1,UK,https://amiri.com/en-gb/,https://amiri.com/en-gb/products/women-stars-l...,7028769259585,WOMEN - WOMEN'S STARS LOW - Birch,Footwear,Footwear,PS24WFS004-699,https://amiri.com/cdn/shop/products/6_Pre-SS24...,Amiri,...,36,20240926,amiri_uk_report_09282024.csv,2024-05,2024-10-03,https://amiri.com/cdn/shop/products/6_Pre-SS24...,"The product is footwear, more specifically s...",The dominant color is off-white and the seco...,The product is a pair of sneakers. It is a l...,The target consumer for these shoes is likel...
2,UK,https://amiri.com/en-gb/,https://amiri.com/en-gb/products/women-ma-runn...,7028769423425,WOMEN - WOMEN'S MA RUNNER - Brown,Footwear,Footwear,Pre-Spring 2024,https://amiri.com/cdn/shop/products/9_Pre-SS24...,Amiri,...,37,20240926,amiri_uk_report_09282024.csv,2024-05,2024-10-03,https://amiri.com/cdn/shop/products/9_Pre-SS24...,"Footwear, Sneakers, Running Shoes",The dominant color is beige and the secondar...,The product appears to be a mid-range runnin...,- The target consumer is likely to be betwee...


In [57]:
df.to_csv(filename, index=False)

In [247]:
df_main = pd.concat([
    pd.read_csv('./GeminiOutput_LinePlan/Brand_LinePlan_Gemini_Prompt_Response_12024-10-04.csv'),
    pd.read_csv('./GeminiOutput_LinePlan/Brand_LinePlan_Gemini_Prompt_Response_22024-10-04.csv')
])

In [248]:
df_main.shape

(3977, 38)

In [250]:
df_main = df_main[~df_main.Product_details.isin(['Forbidden_403_error', 'unknown_error'])]

In [251]:
df_main.shape

(2997, 38)

In [253]:
df_main.head(3)

Unnamed: 0,Country,Website,Product URL,Product Code,Product Name,Product Category,Product sub Category,Product type,Product Image,Brand,...,rank,Crawl_Date,data_name,Release_Month,datestored,URL,Product_details,Colors,Positioning,Consumer_Demographics
0,UK,https://amiri.com/en-gb/,https://amiri.com/en-gb/products/skel-top-low-...,6896740073537,SKEL-TOP LOW - BLACK/WHITE,Footwear,Footwear,MENS CORE COLLECTION FOOTWEAR,https://amiri.com/cdn/shop/files/d7dc9e74-a124...,Amiri,...,10,20240926,amiri_uk_report_09282024.csv,2024-05,2024-10-03,https://amiri.com/cdn/shop/files/d7dc9e74-a124...,Product category: Footwear\nProduct sub-cate...,The dominant color is white. The secondary c...,The product is a pair of sneakers. It is a l...,The target consumer is likely to be a young ...
1,UK,https://amiri.com/en-gb/,https://amiri.com/en-gb/products/women-stars-l...,7028769259585,WOMEN - WOMEN'S STARS LOW - Birch,Footwear,Footwear,PS24WFS004-699,https://amiri.com/cdn/shop/products/6_Pre-SS24...,Amiri,...,36,20240926,amiri_uk_report_09282024.csv,2024-05,2024-10-03,https://amiri.com/cdn/shop/products/6_Pre-SS24...,"The product is footwear, more specifically s...",The dominant color is off-white and the seco...,The product is a pair of sneakers. It is a l...,The target consumer for these shoes is likel...
2,UK,https://amiri.com/en-gb/,https://amiri.com/en-gb/products/women-ma-runn...,7028769423425,WOMEN - WOMEN'S MA RUNNER - Brown,Footwear,Footwear,Pre-Spring 2024,https://amiri.com/cdn/shop/products/9_Pre-SS24...,Amiri,...,37,20240926,amiri_uk_report_09282024.csv,2024-05,2024-10-03,https://amiri.com/cdn/shop/products/9_Pre-SS24...,"Footwear, Sneakers, Running Shoes",The dominant color is beige and the secondar...,The product appears to be a mid-range runnin...,- The target consumer is likely to be betwee...


In [252]:
df_main.to_csv('./GeminiOutput_LinePlan/Brand_LinePlan_Gemini_Prompt_Response_2024-10-04.csv', index=False)

In [256]:
df_main['Product_details'] = df_main['Product_details'].str.lower().str.replace('\n',' ')

In [257]:
df_main = df_main.reset_index()
df_main.drop(columns='index', inplace=True)

In [255]:
# document_Product_details = ' '.join(df_main['Product_details'].unique())

# df_main.head(3)

# import nltk
# from nltk.corpus import stopwords

# nltk.download('stopwords')
# print(stopwords.words('english'))

# stop_words = set(stopwords.words('english'))

# def count(elements):
#     if elements[-1] == '.':
#         elements = elements[0:len(elements) - 1]
#     if elements in dictionary:
#         dictionary[elements] += 1
#     else:
#         dictionary.update({elements: 1})

# Sentence = document_Product_details
 
# dictionary = {}
 
# lst = Sentence.split()
# filtered_sentence = []

# for w in lst:
#     if w not in stop_words:
#         filtered_sentence.append(w)

# for elements in filtered_sentence:
#     count(elements)


# df_pd = pd.json_normalize(dictionary).T.reset_index()
# df_pd.columns=['Word', 'count']
# df_pd.sort_values('count', ascending=False, inplace=True)
# df_pd.to_csv('WordFrequency.csv', index=False)
# df_pd.head(10)

# df_pd.head(50)['Word'].unique()


# # material_ = ['leather','lace-up', 'rubber', 'high-top', 'suede', 'winter', 'running', 'mesh',
# #                'waterproof', 'low-top', 'casual', 'flats']



# df_main.shape

In [260]:
details = list(df_main['Product_details'])[:100]

In [261]:
model1 = GenerativeModel("gemini-1.0-pro")

In [276]:

def get_Gemini_response_using_Text(_text):

    model_output_text = ""
    # time.sleep(1)
    try:
        responses = model1.generate_content(_text, stream=True)
        for response in responses:
            model_output_text = model_output_text +response.text
        
    except:
        model_output_text = 'unknown_error'

    return model_output_text


In [286]:
_text = "Provide a complete list of all sub category within footwear category in json format"
all_product_taxonomy = get_Gemini_response_using_Text(_text)

In [287]:
print(all_product_taxonomy)

## Footwear Subcategories in JSON format

```json
{
  "footwear": {
    "men": {
      "shoes": {
        "athletic": ["Running Shoes", "Training Shoes", "Walking Shoes", "Basketball Shoes", "Soccer Shoes", "Tennis Shoes", "Golf Shoes", "Baseball/Softball Shoes"],
        "casual": ["Sneakers", "Boat Shoes", "Chukkas", "Loafers", "Boots (Ankle, Chelsea, Chukka, Timberland)", "Sandals", "Flip Flops"],
        "dress": ["Oxfords", "Monk Strap Shoes", "Loafers", "Derbies", "Wingtips"],
        "work": ["Work Boots", "Safety Shoes", "Steel Toe Shoes", "Slip Resistant Shoes"],
        "special occasion": ["Formal Shoes", "Wedding Shoes"],
      },
      "socks": ["Dress Socks", "Casual Socks", "Athletic Socks", "Specialty Socks (Compression, No Show)"]
    },
    "women": {
      "shoes": {
        "athletic": ["Running Shoes", "Training Shoes", "Walking Shoes", "Basketball Shoes", "Tennis Shoes", "Dance Shoes", "Yoga Shoes", "Hiking Boots"],
        "casual": ["Sneakers", "Sandals", "Flats

In [266]:
# GEMINI_text_response

In [308]:
def SUB_CATEGORY(row):
    text = ''
    # subcat = ['sneakers', 'boots', 'sandals', 'slip-on', 'slippers', 'loafers', 'clogs', 'hiking', 'slides', 'pumps',
    #           'boat', 'golf', 'flats', 'heels', 'dress', 'ankle'
    #          ]
    
    subcat = [
    'sneaker','athletic','boat','boot','clog','formal','heel','mary jane','loafer','mule',
    'pumps','running','sandal','slide','slip-on','slipper','walking','wedge'
    ]
    
    
    for sc in subcat:
        if row['Product_details'].find(sc)>-1:
            text = sc
    
    if text=='':
        return 'unknown'
    else:
        return text

        
def PRODUCT_TYPE(row):
    text = ''
    # producttype = ['lace-up', 'high-top', 'running', 'low-top', 'heels', 'athletic',  'derby', 'athleisure']
    producttype = ['ankle-high','basketball','brogue','chunky','court','derby','dress','flats','golf','high-top','hiking','indoors',
    'low-top','mid-top','open-toe','oxford','peep-toe','platform','slingback','trail','strap','canvas'
    ]
    
    
    for pt in producttype:
        if row['Product_details'].find(pt)>-1:
            text = text + ' ' + pt
    
    if text=='':
        return 'unknown'
    else:
        return text

In [309]:
df_main['CATEGORY'] = 'Footwear'
df_main['SUB_CATEGORY'] = df_main.apply(SUB_CATEGORY, axis=1)
df_main['PRODUCT_TYPE'] = df_main.apply(PRODUCT_TYPE, axis=1)

In [313]:
df_main.SUB_CATEGORY.value_counts(dropna=False)

sneaker      884
boot         632
running      476
slip-on      323
sandal       139
unknown      124
slide        107
loafer        74
slipper       69
heel          55
clog          32
wedge         22
mary jane     20
pumps         15
athletic      11
boat           6
formal         3
mule           3
walking        2
Name: SUB_CATEGORY, dtype: int64

In [314]:
df_main.PRODUCT_TYPE.value_counts(dropna=False)

unknown                   1912
 low-top                   400
 high-top                  298
 strap                      69
 dress                      40
 platform                   25
 indoors                    24
 flats                      24
 dress oxford               20
 basketball high-top        19
 trail                      16
 canvas                     16
 slingback                  16
 high-top hiking            15
 low-top canvas             14
 chunky                     10
 open-toe                    8
 derby dress                 8
 oxford                      7
 mid-top                     6
 high-top trail              5
 hiking mid-top              5
 golf                        5
 high-top strap              4
 peep-toe                    4
 court                       3
 flats slingback             3
 hiking                      2
 high-top canvas             2
 dress strap                 2
 brogue                      1
 platform strap              1
 chunky 

In [312]:
df_main[df_main.PRODUCT_TYPE=='unknown'][df_main.SUB_CATEGORY=='sneaker']['Product_details'][398]

  df_main[df_main.PRODUCT_TYPE=='unknown'][df_main.SUB_CATEGORY=='sneaker']['Product_details'][398]


'  product category: footwear product sub-category: sneakers product type: casual sneakers '

In [291]:
df_main[df_main.PRODUCT_TYPE=='unknown'].SUB_CATEGORY.value_counts(dropna=False)

sneaker       901
boot          505
unknown       128
slide         122
sandal        110
loafer         53
slipper        47
heel           42
clog           33
wedge          22
pumps          11
athletic        7
boat            6
mary jane       5
mule            3
walking         1
athleisure      1
Name: SUB_CATEGORY, dtype: int64

In [271]:
df_main.SUB_CATEGORY.value_counts(dropna=False)

sneaker       1552
boot           627
unknown        216
sandal         155
slide          123
loafer          81
slipper         69
heel            51
clog            36
wedge           22
mary jane       20
pumps           15
athletic        15
boat             6
formal           3
mule             3
walking          2
athleisure       1
Name: SUB_CATEGORY, dtype: int64

In [315]:
df_main.shape

(2997, 41)

In [316]:
df_main.to_csv('./GeminiOutput_LinePlan/Brand_LinePlan_Gemini_Prompt_Response_2024-10-04_SUBCAT_PT.csv', index=False)