<a href="https://colab.research.google.com/github/Shankjbs571/Valora/blob/main/AMZN_ML_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!sudo apt install tesseract-ocr
!pip install pytesseract


In [None]:
!pip install paddlepaddle paddleocr opencv-python

Collecting paddlepaddle
  Downloading paddlepaddle-2.6.2-cp310-cp310-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting paddleocr
  Downloading paddleocr-2.8.1-py3-none-any.whl.metadata (19 kB)
Collecting httpx (from paddlepaddle)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting astor (from paddlepaddle)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting pyclipper (from paddleocr)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting lmdb (from paddleocr)
  Downloading lmdb-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting rapidfuzz (from paddleocr)
  Downloading rapidfuzz-3.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting python-docx (from paddleocr)
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting fire>=0.3.0 (from paddleocr)
  Downloading fire-0.6.0.tar

In [None]:
#@title Imports
import re
import os
import requests
import pandas as pd
import multiprocessing
import time
from time import time as timer
from tqdm import tqdm
import numpy as np
from pathlib import Path
from functools import partial
import requests
import urllib
from PIL import Image

import pytesseract
import cv2

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
#@title constants

# Unit vocabularies that several entities share. They are kept as tuples and
# copied into a fresh set per entity so no two map entries alias one object.
_LENGTH_UNITS = ('centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard')
_WEIGHT_UNITS = ('gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton')
_VOLUME_UNITS = (
    'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre',
    'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre',
    'millilitre', 'pint', 'quart',
)

# Maps each entity name to the set of unit names accepted for that entity.
entity_unit_map = {
    'width': set(_LENGTH_UNITS),
    'depth': set(_LENGTH_UNITS),
    'height': set(_LENGTH_UNITS),
    'item_weight': set(_WEIGHT_UNITS),
    'maximum_weight_recommendation': set(_WEIGHT_UNITS),
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': set(_VOLUME_UNITS),
}

# Union of every unit that is valid for at least one entity.
allowed_units = set().union(*entity_unit_map.values())

In [None]:
#@title Mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#@title utils.py


def common_mistake(unit, valid_units=None):
    """Map a commonly misspelled unit name onto its accepted spelling.

    Two substitutions are tried in order: 'ter' -> 'tre' and 'feet' -> 'foot'.
    The first variant found in the accepted set is returned.

    Args:
        unit: Unit name to normalise.
        valid_units: Optional collection of accepted unit names. Defaults to
            the notebook-level ``allowed_units``.

    Returns:
        The accepted spelling if one is found, otherwise ``unit`` unchanged.
    """
    if valid_units is None:
        # This notebook defines `allowed_units` at top level; no `constants`
        # module is imported, so `constants.allowed_units` raised NameError.
        valid_units = allowed_units
    if unit in valid_units:
        return unit
    for wrong, right in (('ter', 'tre'), ('feet', 'foot')):
        candidate = unit.replace(wrong, right)
        if candidate in valid_units:
            return candidate
    return unit

def parse_string(s):
    """Parse a '<number> <unit>' string into a (float, unit) pair.

    Args:
        s: The value to parse. ``None``, a value whose ``str()`` is 'nan', or
            a blank string are treated as "no value".

    Returns:
        ``(None, None)`` for a missing value, otherwise ``(number, unit)``
        where ``unit`` has been passed through ``common_mistake``.

    Raises:
        ValueError: If the text does not match the number-then-letters format,
            or if the unit is not in the notebook-level ``allowed_units``.
    """
    s_stripped = "" if s is None or str(s) == 'nan' else s.strip()
    if s_stripped == "":
        return None, None
    pattern = re.compile(r'^-?\d+(\.\d+)?\s+[a-zA-Z\s]+$')
    if not pattern.match(s_stripped):
        raise ValueError("Invalid format in {}".format(s))
    parts = s_stripped.split(maxsplit=1)
    number = float(parts[0])
    unit = common_mistake(parts[1])
    # `allowed_units` is defined at notebook top level; there is no imported
    # `constants` module, so the previous `constants.allowed_units` failed.
    if unit not in allowed_units:
        raise ValueError("Invalid unit [{}] found in {}. Allowed units: {}".format(
            unit, s, allowed_units))
    return number, unit


def create_placeholder_image(image_save_path):
    """Write a 100x100 black RGB image to ``image_save_path``.

    This is best-effort: a failure is reported on stdout and never raised, so
    a bad path cannot abort a long download loop.

    Args:
        image_save_path: Destination path for the placeholder image.
    """
    try:
        placeholder_image = Image.new('RGB', (100, 100), color='black')
        placeholder_image.save(image_save_path)
    except Exception as e:
        # Previously swallowed silently, which hid problems such as a missing
        # destination folder.
        print(f"Warning: could not create placeholder image at {image_save_path}: {e}")

def download_image(image_link, save_folder, filename, retries=3, delay=3):
    """Download one image to ``<save_folder>/<filename>.jpg``.

    Nothing is done when ``image_link`` is not a string or when the target
    file already exists. If every attempt fails, a black placeholder image is
    written via ``create_placeholder_image``.

    Args:
        image_link: URL of the image.
        save_folder: Directory that receives the file (created if missing).
        filename: File name without extension; '.jpg' is appended.
        retries: Maximum number of download attempts.
        delay: Seconds to wait between failed attempts.
    """
    if not isinstance(image_link, str):
        return

    filename = filename + '.jpg'
    image_save_path = os.path.join(save_folder, filename)

    if os.path.exists(image_save_path):
        return

    # Callers in this notebook pass folders that may not exist yet.
    if save_folder:
        os.makedirs(save_folder, exist_ok=True)

    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    import urllib.request

    for attempt in range(retries):
        try:
            urllib.request.urlretrieve(image_link, image_save_path)
            return
        except Exception:  # not a bare except: let KeyboardInterrupt through
            # Remove a partially written file so a later run does not mistake
            # it for a finished download.
            if os.path.exists(image_save_path):
                os.remove(image_save_path)
            # Only wait when another attempt will follow.
            if attempt < retries - 1:
                time.sleep(delay)

    create_placeholder_image(image_save_path) #Create a black placeholder image for invalid links/images

def _download_image_by_link(image_link, save_folder, retries=3, delay=3):
    """Download one link, naming the file after the last path segment of the URL."""
    if not isinstance(image_link, str):
        return
    # download_image appends '.jpg' itself, so pass the stem only.
    download_image(image_link, save_folder, Path(image_link).stem,
                   retries=retries, delay=delay)


def download_images(image_links, download_folder, allow_multiprocessing=True):
    """Download every link in ``image_links`` into ``download_folder``.

    Each file is named after the last path segment of its URL. The previous
    version never supplied the ``filename`` argument that ``download_image``
    requires, so every call failed with TypeError.

    Args:
        image_links: Sized iterable of image URLs; non-string entries are skipped.
        download_folder: Destination directory (created if missing).
        allow_multiprocessing: Use a 64-worker process pool when True,
            otherwise download sequentially.
    """
    os.makedirs(download_folder, exist_ok=True)

    if allow_multiprocessing:
        # The worker must be a module-level function so the pool can pickle it.
        download_image_partial = partial(
            _download_image_by_link, save_folder=download_folder, retries=3, delay=3)

        with multiprocessing.Pool(64) as pool:
            list(tqdm(pool.imap(download_image_partial, image_links), total=len(image_links)))
            pool.close()
            pool.join()
    else:
        for image_link in tqdm(image_links, total=len(image_links)):
            _download_image_by_link(image_link, save_folder=download_folder, retries=3, delay=3)


In [None]:
#@title Load train.csv as Pandas dataframe
# Full training table from the challenge resources folder on Drive.
train_df = pd.read_csv('/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/train.csv')
# A second, separately stored training CSV read from the Drive root.
train_1002_2002_df = pd.read_csv('/content/drive/MyDrive/train_1002_2002.csv')

# Last expression of the cell: shows the first rows of the second frame.
train_1002_2002_df.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/91EGQ9c8ZK...,375816,item_weight,1023.0 gram
1,https://m.media-amazon.com/images/I/81L97-vmov...,375816,item_weight,1023.0 gram
2,https://m.media-amazon.com/images/I/61OY6I21l2...,483370,item_weight,1.0 kilogram
3,https://m.media-amazon.com/images/I/61oE2AbTBJ...,523149,item_weight,4.3 gram
4,https://m.media-amazon.com/images/I/61IycVGaGY...,752266,wattage,1.5 watt


In [None]:
# Download images for the first 1000 rows of train_df into trainImages0.
save_folder_path = '/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/trainImages0'
# Running counter used as the filename prefix; incremented once per row.
i = 0
for image_link,group_id,entity_name in zip(train_df['image_link'], train_df['group_id'], train_df['entity_name']) :
  print(image_link)

  # Files are named "<counter>_<group_id>_<entity_name>"; download_image appends ".jpg".
  filename = str(i) + '_' + str(group_id) + '_' + entity_name
  i += 1
  print(filename)
  download_image(image_link,save_folder_path,filename)
  # The counter was already incremented above, so this stops after 1000 rows.
  if i == 1000:
    break



https://m.media-amazon.com/images/I/61I9XdN6OFL.jpg
0_748919_item_weight
https://m.media-amazon.com/images/I/71gSRbyXmoL.jpg
1_916768_item_volume
https://m.media-amazon.com/images/I/61BZ4zrjZXL.jpg
2_459516_item_weight
https://m.media-amazon.com/images/I/612mrlqiI4L.jpg
3_459516_item_weight
https://m.media-amazon.com/images/I/617Tl40LOXL.jpg
4_731432_item_weight
https://m.media-amazon.com/images/I/61QsBSE7jgL.jpg
5_731432_item_weight
https://m.media-amazon.com/images/I/81xsq6vf2qL.jpg
6_731432_item_weight
https://m.media-amazon.com/images/I/71DiLRHeZdL.jpg
7_731432_item_weight
https://m.media-amazon.com/images/I/91Cma3RzseL.jpg
8_731432_item_weight
https://m.media-amazon.com/images/I/71jBLhmTNlL.jpg
9_731432_item_weight
https://m.media-amazon.com/images/I/81N73b5khVL.jpg
10_149159_item_weight
https://m.media-amazon.com/images/I/61oMj2iXOuL.jpg
11_308856_item_weight
https://m.media-amazon.com/images/I/91LPf6OjV9L.jpg
12_281678_item_weight
https://m.media-amazon.com/images/I/81fOxWWWKYL.

In [None]:

# Download the images listed in train_1002_2002_df and write a manifest CSV
# that records, for every row, where its image was saved.
save_folder_path = '/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/trainImages0'
train_manifest_csv_path = '/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/train_D_1002_2002.csv'

data = []

# Filename prefix counter.
# NOTE(review): the earlier train_df loop numbers its files from 0 and stops
# after 1000 rows, so 999 looks like a reused prefix - confirm the offset.
i = 999
for index, row in train_1002_2002_df.iterrows():
    image_link = row['image_link']
    group_id = row['group_id']
    entity_name = row['entity_name']
    entity_value = row['entity_value']

    filename = f"{i}_{group_id}_{entity_name}"
    i += 1

    download_image(image_link, save_folder_path, filename)

    # download_image appends '.jpg' to the name it is given.
    file_path = os.path.join(save_folder_path, filename + '.jpg')

    data.append({
        'image_link': image_link,
        'downloaded_image_path': file_path,
        'group_id': group_id,
        'entity_name': entity_name,
        'entity_value': entity_value
    })

    if index % 10 == 0:
        print(f"downloaded {index} images")

output_df = pd.DataFrame(data)
output_df.to_csv(train_manifest_csv_path, index=False)

# Report the file that was actually written (the old message named
# 'processed_images.csv', which this cell never creates).
print(f"Processing complete. Data saved to '{train_manifest_csv_path}'.")


downloaded 0 images
downloaded 10 images
downloaded 20 images
downloaded 30 images
downloaded 40 images
downloaded 50 images
downloaded 60 images
downloaded 70 images
downloaded 80 images
downloaded 90 images
downloaded 100 images
downloaded 110 images
downloaded 120 images
downloaded 130 images
downloaded 140 images
downloaded 150 images
downloaded 160 images
downloaded 170 images
downloaded 180 images
downloaded 190 images
downloaded 200 images
downloaded 210 images
downloaded 220 images
downloaded 230 images
downloaded 240 images
downloaded 250 images
downloaded 260 images
downloaded 270 images
downloaded 280 images
downloaded 290 images
downloaded 300 images
downloaded 310 images
downloaded 320 images
downloaded 330 images
downloaded 340 images
downloaded 350 images
downloaded 360 images
downloaded 370 images
downloaded 380 images
downloaded 390 images
downloaded 400 images
downloaded 410 images
downloaded 420 images
downloaded 430 images
downloaded 440 images
downloaded 450 images

In [None]:

# Quick check of Tesseract on a single downloaded training image.
image_path = '/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/trainImages0/0_748919_item_weight.jpg'

# NOTE(review): cv2.imread returns None for an unreadable path; that case is not handled here.
image = cv2.imread(image_path)

# Convert the BGR image to a single grayscale channel before OCR.
gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

text = pytesseract.image_to_string(gray_image)

print(text)

PROPSS’
NATURE

INGREDIENT MENAGER

COOLANT

 



In [None]:
#@title experiment
# Only the last assignment takes effect; the two before it are overwritten.
image_path = '/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/trainImages0/1_916768_item_volume.jpg'
image_path = '/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/trainImages0/2_459516_item_weight.jpg'
image_path = '/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/trainImages0/2_459516_item_weight.jpg'

image = cv2.imread(image_path)
# Resize to a fixed 1500x1100 (width, height), ignoring the original aspect ratio.
resized_image = cv2.resize(image, (1500, 1100))

gray_image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)

# Otsu's method picks the binarisation threshold, so the 0 passed here is a placeholder.
_, binary_image = cv2.threshold(gray_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
denoised_image = cv2.fastNlMeansDenoising(binary_image, None, 30, 7, 21)

# OCR runs on the binary image; the denoised image is only written to disk below.
text = pytesseract.image_to_string(binary_image)

print(text)

# Save each intermediate stage to the working directory for visual inspection.
cv2.imwrite('original_image.jpg', image)
cv2.imwrite('resized_image.jpg', resized_image)
# cv2.imwrite('gray_image.jpg', gray_image)
cv2.imwrite('binary_image.jpg', binary_image)
cv2.imwrite('denoised_image.jpg', denoised_image)

NameError: name 'ocr_model' is not defined

In [None]:
#@title Function to Extract Text
def extract_text_from_image(image_path, output_prefix='processed_image'):
    """Run Tesseract OCR on an image file after Otsu binarisation.

    Args:
        image_path: Path of the image to read with OpenCV.
        output_prefix: Unused; kept so existing callers keep working.

    Returns:
        The text recognised by pytesseract, or an empty string when the image
        cannot be loaded. This matches ``extract_text_from_image_paddle``;
        the function used to return None in that case.
    """
    image = cv2.imread(image_path)

    if image is None:
        print(f"Error: Unable to load image at {image_path}")
        return ''

    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Otsu's method chooses the threshold, so the 0 passed here is a placeholder.
    _, binary_image = cv2.threshold(gray_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    return pytesseract.image_to_string(binary_image)


In [None]:
from paddleocr import PaddleOCR
# import cv2
# import matplotlib.pyplot as plt

# Shared PaddleOCR instance (English, angle classifier enabled); later cells
# read it as the notebook-level name `ocr_model`, so this cell must run first.
ocr_model = PaddleOCR(use_angle_cls=True,lang='en')


In [None]:
#@title Funtion to Extract text using paddle ocr


image_path = '/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/trainImages0/2_459516_item_weight.jpg'

def extract_text_from_image_paddle(image_path):
    """Extract text from an image file with the notebook-level ``ocr_model``.

    Args:
        image_path: Path of the image to read with OpenCV.

    Returns:
        The recognised text fragments joined by single spaces, or an empty
        string when the image cannot be loaded, contains no text, or OCR fails.
    """
    try:
        image = cv2.imread(image_path)

        if image is None:
            print(f"Error: Unable to load image at {image_path}")
            return ""

        rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        ocr_result = ocr_model.ocr(rgb_image)

        # The result holds one entry per input image. That entry can be None
        # (or empty) when no text is found; calling len() on it raised a
        # TypeError that was then logged as a processing error.
        if not ocr_result or not ocr_result[0]:
            return ''

        # Each detected line is (box, (text, confidence)); keep only the text.
        return ' '.join(line[1][0] for line in ocr_result[0])
    except Exception as e:
        print(f"Error processing image: {image_path}, Error: {e}")
        return ''




print(extract_text_from_image_paddle(image_path))


[2024/09/16 01:45:02] ppocr DEBUG: dt_boxes num : 53, elapsed : 0.34856605529785156
[2024/09/16 01:45:02] ppocr DEBUG: cls num  : 53, elapsed : 0.19162249565124512
[2024/09/16 01:45:08] ppocr DEBUG: rec_res num  : 53, elapsed : 5.252565145492554
COMPOSITION Serving Size:1 Tablet (0.709 g)Each serving contains (Approx.Values) Ingredient Qty./Serving %RDA" *PHOSPHOcomplexSilybin (Sillybum marianum) 200 mg ** Dandelion Taraxacum officinale) leaf extract-101 100 mg ** Kutki Picrorhiza kurroa)rhizome extract-0.5%Bitters 50 mg ** Kasani Cichorium intybus seed extract-1%Bitters 25 mg ** Punarnava Boerhavia diffusa root extract-0.07%alkaloids 25 mg ** Bhui amla Phyllanthus amarusWP extract-0.5%Bitters 25 mg Amla (Phyllanthus emblica) fruit extract-10%Tannins 25 mg Licorice (Glycyrrhiza glabra) root extract- 5%Glycyrrhizin 25 mg Vitamin E 10 mg 100 Piper nigrum fruit extract-95%Piperine 5 mg ** NUTRITIONAL INFORMATION PER SERVING (APPROX.VALUES) Nutrients Qty./Serving %RDA# Energy 3.04kcal 0.13

In [None]:
# print(process_and_save_image('/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/trainImages0/2_459516_item_weight.jpg'))

# input_csv_path = '/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/train_1000.csv'
input_csv_path = '/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/train_D_1002_2002.csv'
# df = pd.read_csv(input_csv_path)

# updated_data = []

# for index, row in df.iterrows():
#     image_path = row['downloaded_image_path']
#     extracted_text = extract_text_from_image(image_path)

#     updated_data.append({
#         'image_link': row['image_link'],
#         'downloaded_image_path': row['downloaded_image_path'],
#         'group_id': row['group_id'],
#         'entity_name': row['entity_name'],
#         'entity_value': row['entity_value'],
#         'extracted_text': extracted_text
#     })
#     if index % 10 == 0:
#         print(f"Processed {index} images")


# updated_df = pd.DataFrame(updated_data)
# output_csv_path = '/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/updated_images_with_text_1002_2002.csv'
# updated_df.to_csv(output_csv_path, index=False)

# print(f"Processing complete. Data saved to '{output_csv_path}'.")

def _extract_text_for_rows(input_csv_path, output_csv_path, columns, last_index=None):
    """Shared driver: OCR every row's image and save the rows plus the text.

    Args:
        input_csv_path: CSV with a 'downloaded_image_path' column and every
            column named in ``columns``.
        output_csv_path: Where the augmented CSV is written.
        columns: Columns copied from the input into the output, in order.
        last_index: If given, stop after the row whose index equals this value.
    """
    df = pd.read_csv(input_csv_path)

    updated_data = []

    for index, row in df.iterrows():
        extracted_text = extract_text_from_image(row['downloaded_image_path'])

        record = {column: row[column] for column in columns}
        record['extracted_text'] = extracted_text
        updated_data.append(record)

        if index % 10 == 0:
            print(f"Processed {index} images")
        if last_index is not None and index == last_index:
            break

    updated_df = pd.DataFrame(updated_data)
    updated_df.to_csv(output_csv_path, index=False)

    print(f"Processing complete. Data saved to '{output_csv_path}'.")


def process_images_extract_text_and_save(input_csv_path, output_csv_path):
    """OCR every image listed in a labelled (train) manifest CSV.

    The output keeps image_link, downloaded_image_path, group_id, entity_name
    and entity_value, and adds an 'extracted_text' column.
    """
    _extract_text_for_rows(
        input_csv_path,
        output_csv_path,
        columns=('image_link', 'downloaded_image_path', 'group_id',
                 'entity_name', 'entity_value'),
    )


def process_images_extract_text_and_save_for_test(input_csv_path, output_csv_path):
    """OCR images listed in an unlabelled (test) manifest CSV.

    Same as the train variant but without 'entity_value'. Processing stops
    after the row with index 1000.
    """
    _extract_text_for_rows(
        input_csv_path,
        output_csv_path,
        columns=('image_link', 'downloaded_image_path', 'group_id', 'entity_name'),
        last_index=1000,
    )


input_csv_path = '/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/train_D_1002_2002.csv'
output_csv_path = '/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/updated_images_with_text_1002_2002.csv'



In [None]:
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise OCR text into a space-joined string of lemmatised tokens.

    Steps: lower-case, drop every character that is not a letter, digit, '.'
    or whitespace, split on whitespace, remove English stop words, lemmatise
    each remaining token with the notebook-level ``lemmatizer``.

    Args:
        text: Raw text. ``None`` and NaN are treated as missing.

    Returns:
        The cleaned text, or '' when the input is missing.
    """
    # Missing OCR output used to be stringified into the literal token "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9.\s]', '', text)

    # Load the stop-word list once per call, not once per token, and use a
    # set for constant-time membership tests.
    english_stopwords = set(stopwords.words('english'))
    tokens = [word for word in text.split() if word not in english_stopwords]

    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:

# Load the OCR output for the 1002-2002 training manifest and add a
# 'cleaned_text' column by running preprocess_text over 'extracted_text'.
# updated_images_with_text_df = pd.read_csv('/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/updated_images_with_text.csv')
updated_images_with_text_df = pd.read_csv('/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/updated_images_with_text_1002_2002.csv')

print(updated_images_with_text_df.head())

# Adds the column to the frame in place.
updated_images_with_text_df['cleaned_text'] = updated_images_with_text_df['extracted_text'].apply(preprocess_text)
print(updated_images_with_text_df.head())


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Ser 70 g (35g)

Ar < Per Serving

Calories

  
 

rots oat 26%
Prete! ccoorehysdrate 48g 17%
34%

 

cr titta Sgaby et 2.000 cancctes =

 

 

Net won int Jog

MR
On =

  
  
     
 
 
  
   
  

Se ne rn
ee ee

er
GS See tra oe aie

    
   
      
  
 
 
 
 
 
  
    

 

Manufactured and Marketed By-
Rajsudha Enterprises

NA/A. Jay Ambe Industral Estate, Pi
Bhiwandi. Thane - 421517

For consumer complaints/feedba:
indy contact our Customer Care Ny
8169500665/9975773626 or write
tanawadessmartfood@gm:
Facbook, Twitter, Instagram
www.tanawadefoods.com

Follow Us on:

oe ©

Product Of India

@ so

SN

 

text Enjoy a Spicy
Kolhapuri Misal at Home!

® _))
Tenawade's

"Fayed

i, 19 Cook

* LHAPURI MISA
bh X WITH MATKI
\THENTIC
‘LHAPURI
STE

Svar nwt

0 @

 

  

text 48V15AH
BATTERIE AU LITHIUM NOUVELLE ENERGIE

Batterie au lithium haute performance 48V15AH. Conception
étanche avec enveloppe métallique, légére et facile 

In [None]:
# Save the frame, including its 'cleaned_text' column, without the row index.
updated_images_with_text_df.to_csv('/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/updated_images_with_text_cleaned_1002_2002.csv', index=False)

In [None]:
#@title SKLEARN IMPORTS


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression


from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split,RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import randint

In [None]:
#@title apply paddle ocr on train csv

# Re-run OCR with PaddleOCR over every image listed in the cleaned training
# CSV and save the result next to the original metadata columns.
input_csv_path = '/content/drive/MyDrive/train_with_text_cleaned.csv'
df = pd.read_csv(input_csv_path)

updated_data = []

for index, row in df.iterrows():
    image_path = row['downloaded_image_path']
    # Returns '' for unreadable images or OCR errors, so the loop never aborts.
    extracted_text = extract_text_from_image_paddle(image_path)

    updated_data.append({
        'image_link': row['image_link'],
        'downloaded_image_path': row['downloaded_image_path'],
        'group_id': row['group_id'],
        'entity_name': row['entity_name'],
        'entity_value': row['entity_value'],
        'extracted_text_from_paddle': extracted_text
    })
    if index % 10 == 0:
        print(f"Processed {index} images")


# Build the frame once from the collected records, then write it to Drive.
updated_df = pd.DataFrame(updated_data)
output_csv_path = '/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/train_with_text_from_paddle.csv'
updated_df.to_csv(output_csv_path, index=False)

print(f"Processing complete. Data saved to '{output_csv_path}'.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[2024/09/16 01:48:26] ppocr DEBUG: cls num  : 37, elapsed : 0.14196085929870605
[2024/09/16 01:48:29] ppocr DEBUG: rec_res num  : 37, elapsed : 2.10721755027771
[2024/09/16 01:48:29] ppocr DEBUG: dt_boxes num : 4, elapsed : 0.29337120056152344
[2024/09/16 01:48:29] ppocr DEBUG: cls num  : 4, elapsed : 0.04809212684631348
[2024/09/16 01:48:29] ppocr DEBUG: rec_res num  : 4, elapsed : 0.3611264228820801
[2024/09/16 01:48:30] ppocr DEBUG: dt_boxes num : 4, elapsed : 0.287325382232666
[2024/09/16 01:48:30] ppocr DEBUG: cls num  : 4, elapsed : 0.0182034969329834
[2024/09/16 01:48:31] ppocr DEBUG: rec_res num  : 4, elapsed : 0.8772702217102051
Processed 80 images
[2024/09/16 01:48:31] ppocr DEBUG: dt_boxes num : 9, elapsed : 0.30934929847717285
[2024/09/16 01:48:32] ppocr DEBUG: cls num  : 9, elapsed : 0.06147003173828125
[2024/09/16 01:48:33] ppocr DEBUG: rec_res num  : 9, elapsed : 1.0336592197418213
[2024/09/16 01:48:33] ppo

In [None]:
# Load the cleaned training table and split it into model inputs and target.
updated_images_with_text_cleaned_csv_path = '/content/drive/MyDrive/train_with_text_cleaned.csv'
df = pd.read_csv(updated_images_with_text_cleaned_csv_path)

# Features: cleaned OCR text plus two columns that are one-hot encoded later.
X = df[['cleaned_text', 'entity_name', 'group_id']]
# Target: the raw 'entity_value' column, used as a class label by the
# classifiers below.
y = df['entity_value']

In [None]:
# Only the last expression of a cell is displayed, so this shows X.shape.
y.shape
X.shape

(2001, 3)

In [None]:
# Feature pipeline: TF-IDF over the cleaned OCR text (at most 500 terms) plus
# one-hot encodings of the entity name and the group id. Categories not seen
# during fit are ignored at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=500), 'cleaned_text'),
        ('entity', OneHotEncoder(handle_unknown='ignore'), ['entity_name']),
        ('group', OneHotEncoder(handle_unknown='ignore'), ['group_id'])
    ])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# TfidfVectorizer cannot handle NaN. assign() builds new frames instead of
# writing into the split results, which can trigger SettingWithCopyWarning.
X_train = X_train.assign(cleaned_text=X_train['cleaned_text'].fillna(''))
X_test = X_test.assign(cleaned_text=X_test['cleaned_text'].fillna(''))

# Classification model: every distinct 'entity_value' string is one class.
# The forest is seeded so repeated runs produce the same score.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1 Score: {f1:.4f}")

F1 Score: 0.1313


In [None]:
# Sanity check: predictions and held-out labels should have the same length.
print(y_pred.shape)
print(y_test.shape)
print("y test is: \n", y_test)


(401,)
(401,)
y test is: 
 1394          11.0 gram
353           24.0 watt
1334           2.0 gram
906       20.7 kilogram
1290           7.4 volt
             ...       
1285      0.15 kilogram
914     500.0 milligram
261          240.0 gram
535       15.0 kilogram
1924          350 pound
Name: entity_value, Length: 401, dtype: object


In [None]:
#@title Trying to Improve score

# Here I am trying to improve the score with a randomized hyper-parameter search.
# Each entry pairs an estimator with the parameter distributions to sample;
# the 'classifier__' prefix targets the pipeline step named 'classifier'.
models = {
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'classifier__n_estimators': randint(10, 200),
            'classifier__max_depth': randint(5, 30)
        }
    },
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000),
        'params': {
            'classifier__C': np.logspace(-4, 4, 20)
        }
    },
    # 'GradientBoosting': {
    #     'model': GradientBoostingClassifier(),
    #     'params': {
    #         'classifier__n_estimators': randint(50, 200),
    #         'classifier__learning_rate': [0.01, 0.1, 0.2, 0.5],
    #         'classifier__max_depth': randint(3, 20)
    #     }
    # },
    'SVC': {
        'model': SVC(),
        'params': {
            'classifier__C': np.logspace(-4, 4, 20),
            'classifier__kernel': ['linear', 'rbf', 'poly', 'sigmoid']
        }
    },
    'KNeighbors': {
        'model': KNeighborsClassifier(),
        'params': {
            'classifier__n_neighbors': randint(1, 30),
            'classifier__weights': ['uniform', 'distance']
        }
    }
}
for name, model_info in models.items():
    # Every candidate reuses the shared `preprocessor` inside its own pipeline.
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model_info['model'])
    ])

    # 20 sampled configurations, 5-fold CV, scored by weighted F1, all cores.
    search = RandomizedSearchCV(
        model_pipeline,
        param_distributions=model_info['params'],
        n_iter=20,
        scoring='f1_weighted',
        cv=5,
        random_state=42,
        n_jobs=-1
    )

    search.fit(X_train, y_train)

    # Predict on the held-out split through the fitted search object.
    y_pred = search.predict(X_test)

    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"\n{name} - Best Params: {search.best_params_}")
    print(f"F1 Score: {f1:.4f}")
    # print("Classification Report:")
    # print(classification_report(y_test, y_pred))




RandomForest - Best Params: {'classifier__max_depth': 23, 'classifier__n_estimators': 117}
F1 Score: 0.0597





LogisticRegression - Best Params: {'classifier__C': 78.47599703514607}
F1 Score: 0.0566




KeyboardInterrupt: 

In [None]:
#@title Trying to Improve score

# Same randomized search as the cell with four candidates, restricted to SVC
# and k-nearest-neighbours. This rebinds the notebook-level `models` dict.
models = {
    'SVC': {
        'model': SVC(),
        'params': {
            'classifier__C': np.logspace(-4, 4, 20),
            'classifier__kernel': ['linear', 'rbf', 'poly', 'sigmoid']
        }
    },
    'KNeighbors': {
        'model': KNeighborsClassifier(),
        'params': {
            'classifier__n_neighbors': randint(1, 30),
            'classifier__weights': ['uniform', 'distance']
        }
    }
}
for name, model_info in models.items():
    # Every candidate reuses the shared `preprocessor` inside its own pipeline.
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model_info['model'])
    ])

    # 20 sampled configurations, 5-fold CV, scored by weighted F1, all cores.
    search = RandomizedSearchCV(
        model_pipeline,
        param_distributions=model_info['params'],
        n_iter=20,
        scoring='f1_weighted',
        cv=5,
        random_state=42,
        n_jobs=-1
    )

    search.fit(X_train, y_train)

    # Predict on the held-out split through the fitted search object.
    y_pred = search.predict(X_test)

    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"\n{name} - Best Params: {search.best_params_}")
    print(f"F1 Score: {f1:.4f}")
    # print("Classification Report:")
    # print(classification_report(y_test, y_pred))

In [None]:
#@title TEST DF

# Load the challenge test set and take a first look at its size and columns.
test_df = pd.read_csv('/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/test.csv')
print(test_df.shape)
print(test_df.head())

(131187, 4)
   index                                         image_link  group_id  \
0      0  https://m.media-amazon.com/images/I/110EibNycl...    156839   
1      1  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
2      2  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
3      3  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
4      4  https://m.media-amazon.com/images/I/11gHj8dhhr...    792578   

  entity_name  
0      height  
1       width  
2      height  
3       depth  
4       depth  


In [None]:

# Path to the folder where images will be saved
save_folder_path = '/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/testImages0'
# Manifest that records, for every test row, where its image was saved.
test_manifest_csv_path = '/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/test_D_1000.csv'

data = []

# Filename prefix counter, incremented once per row.
i = 0
for index, row in test_df.iterrows():
    image_link = row['image_link']
    group_id = row['group_id']
    entity_name = row['entity_name']

    filename = f"{i}_{group_id}_{entity_name}"
    i += 1

    download_image(image_link, save_folder_path, filename)

    # download_image appends '.jpg' to the name it is given.
    file_path = os.path.join(save_folder_path, filename + '.jpg')

    data.append({
        'image_link': image_link,
        'downloaded_image_path': file_path,
        'group_id': group_id,
        'entity_name': entity_name,
    })


    if index % 10 == 0:
        print(f"downloaded {index} images")


output_df = pd.DataFrame(data)
output_df.to_csv(test_manifest_csv_path, index=False)

# Report the file that was actually written (the old message named
# 'processed_images.csv', which this cell never creates).
print(f"Processing complete. Data saved to '{test_manifest_csv_path}'.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
downloaded 81200 images
downloaded 81210 images
downloaded 81220 images
downloaded 81230 images
downloaded 81240 images
downloaded 81250 images
downloaded 81260 images
downloaded 81270 images
downloaded 81280 images
downloaded 81290 images
downloaded 81300 images
downloaded 81310 images
downloaded 81320 images
downloaded 81330 images
downloaded 81340 images
downloaded 81350 images
downloaded 81360 images
downloaded 81370 images
downloaded 81380 images
downloaded 81390 images
downloaded 81400 images
downloaded 81410 images
downloaded 81420 images
downloaded 81430 images
downloaded 81440 images
downloaded 81450 images
downloaded 81460 images
downloaded 81470 images
downloaded 81480 images
downloaded 81490 images
downloaded 81500 images
downloaded 81510 images
downloaded 81520 images
downloaded 81530 images
downloaded 81540 images
downloaded 81550 images
downloaded 81560 images
downloaded 81570 images
downloaded 81580 images

In [None]:
from multiprocessing import Pool, cpu_count

def process_image_row(row):
    """OCR the image referenced by one manifest row.

    Args:
        row: Mapping with 'image_link', 'downloaded_image_path', 'group_id'
            and 'entity_name' keys.

    Returns:
        A new dict holding those four values plus the 'extracted_text'
        produced by ``extract_text_from_image``.
    """
    text = extract_text_from_image(row['downloaded_image_path'])

    copied_keys = ('image_link', 'downloaded_image_path', 'group_id', 'entity_name')
    record = {key: row[key] for key in copied_keys}
    record['extracted_text'] = text
    return record

def track_progress(index, total):
    """Print a one-line progress message: how many of ``total`` images are done."""
    message = "Processed {} out of {} images".format(index, total)
    print(message)

def process_images_extract_text_and_save_for_test_Multiprocessing(input_csv_path, output_csv_path, max_rows=1001):
    """OCR the images of a test manifest in parallel and save the result.

    Args:
        input_csv_path: Manifest CSV holding the columns that
            ``process_image_row`` reads.
        output_csv_path: Where the CSV with the added 'extracted_text' column
            is written.
        max_rows: Only the first ``max_rows`` rows are processed. The default
            keeps the previously hard-coded limit of 1001; pass ``None`` to
            process the whole file.
    """
    df = pd.read_csv(input_csv_path)

    if max_rows is not None:
        df = df.head(max_rows)
    # Plain dicts are what the worker function receives for each row.
    data_list = df.to_dict('records')
    total_rows = len(data_list)

    num_processes = cpu_count()

    with Pool(processes=num_processes) as pool:
        updated_data = []
        # imap yields results in input order, so rows stay aligned with the manifest.
        for index, result in enumerate(pool.imap(process_image_row, data_list), 1):
            updated_data.append(result)
            if index % 10 == 0:
                track_progress(index, total_rows)

    updated_df = pd.DataFrame(updated_data)
    updated_df.to_csv(output_csv_path, index=False)

    print(f"Processing complete. Data saved to '{output_csv_path}'.")


In [None]:
# Run the multiprocessing OCR over the test manifest and save the text next to it.
input_test_csv_path = '/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/test_D_1000.csv'
output_test_csv_path = '/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/test_D_1000_with_text.csv'
process_images_extract_text_and_save_for_test_Multiprocessing(input_test_csv_path,output_test_csv_path)

Processed 10 out of 1001 images
Processed 20 out of 1001 images
Processed 30 out of 1001 images
Processed 40 out of 1001 images
Processed 50 out of 1001 images
Processed 60 out of 1001 images
Processed 70 out of 1001 images
Processed 80 out of 1001 images
Processed 90 out of 1001 images
Processed 100 out of 1001 images
Processed 110 out of 1001 images
Processed 120 out of 1001 images
Processed 130 out of 1001 images
Processed 140 out of 1001 images
Processed 150 out of 1001 images
Processed 160 out of 1001 images
Processed 170 out of 1001 images
Processed 180 out of 1001 images
Processed 190 out of 1001 images
Processed 200 out of 1001 images
Processed 210 out of 1001 images
Processed 220 out of 1001 images
Processed 230 out of 1001 images
Processed 240 out of 1001 images
Processed 250 out of 1001 images
Processed 260 out of 1001 images
Processed 270 out of 1001 images
Processed 280 out of 1001 images
Processed 290 out of 1001 images
Processed 300 out of 1001 images
Processed 310 out o

In [None]:

# Load the OCR output written for the test manifest and add 'cleaned_text'.
# This rebinds `updated_images_with_text_df`, which earlier held training data.
# updated_images_with_text_df = pd.read_csv('/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/updated_images_with_text.csv')
updated_images_with_text_df = pd.read_csv(output_test_csv_path)
print(updated_images_with_text_df.head())

updated_images_with_text_df['cleaned_text'] = updated_images_with_text_df['extracted_text'].apply(preprocess_text)
print(updated_images_with_text_df.head())


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

 

—_—O
aiin/ 300m

text  

text  

text  

text  

text  

text 7.19cm/
2.83inch

J

5.08cm/2inch

text 7.19cm/
2.83inch

J

5.08cm/2inch

text 2.7 in / 7.0cm

 

11.0 in / 28.0 cm

text 2.7 in / 7.0cm

 

11.0 in / 28.0 cm

text i

oa oles

 

text 40cm

6icm

text 40cm

6icm

text Ce
Celts

 

 

PU aera ME

text 20 cm

uo S'£2T
ee

 

text 
text 
text A40lb

WEIGHT LIMIT

 

text 33grams.

<—Botte ameter 12mm theead 1/2-20 UNF

der

@
Jv

text 
text 37.5cm/14.76in

 

text 37.5cm/14.76in

 

text 37.5cm/14.76in

 

text 10 in

\ in

13.2 in

 

text 10 in

\ in

13.2 in

 

text oe ae

 

With rapid air trying technology. you
can anjoy a faster and healthier
frying experience wth litle to na cil

Ped
rer

 

Peet)
ESET d

 

text oe ae

 

With rapid air trying technology. you
can anjoy a faster and healthier
frying experience wth litle to na cil

Ped
rer

 

Peet)
ESET d

 

text  

text 

In [None]:
# Select the model inputs from the cleaned test frame. This rebinds both `df`
# and `X_test`, which earlier cells used for the training data and its
# held-out split.
# updated_images_with_text_cleaned_csv_path = '/content/drive/MyDrive/train_with_text_cleaned.csv'
df = updated_images_with_text_df

X_test = df[['cleaned_text', 'entity_name', 'group_id']]

In [None]:

# TfidfVectorizer cannot handle NaN. assign() builds a new frame instead of
# writing into X_test, which was sliced out of `df`; the in-place write is
# what produced the SettingWithCopyWarning.
X_test = X_test.assign(cleaned_text=X_test['cleaned_text'].fillna(''))

# `model` is the pipeline fitted in the baseline training cell.
y_pred = model.predict(X_test)

predictions_df = pd.DataFrame({
    'index': X_test.index,
    'prediction': y_pred
})

predictions_csv_path = '/content/drive/MyDrive/AMZN ML Challenge/student_resource 3/dataset/test_out.csv'
predictions_df.to_csv(predictions_csv_path, index=False, header=['index', 'prediction'])

# Report the file that was actually written (the old message named
# 'predictions.csv', which this cell never creates).
print(f"Predictions saved to '{predictions_csv_path}'")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['cleaned_text'] = X_test['cleaned_text'].fillna('')


Predictions saved to 'predictions.csv'
