In [3]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


dataset = pd.read_csv("train.csv")

# Apply one-hot encoding on 'entity_name' column (before dropping it)
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), ['entity_name'])],
    remainder='passthrough'
)
x_encoded = ct.fit_transform(dataset)

# Convert encoded data to DataFrame
x_encoded_df = pd.DataFrame(x_encoded)

# Drop original 'entity_name' column from the dataset
dataset_drop_entity = dataset.drop(columns=['entity_name'])

# Get the one-hot encoded column names
encoded_column_names = ct.transformers_[0][1].get_feature_names_out(['entity_name'])

# Create DataFrame for one-hot encoded columns
onehot_cols = pd.DataFrame(x_encoded[:, :len(encoded_column_names)], columns=encoded_column_names)

# Concatenate original columns (excluding 'entity_name') and one-hot encoded columns
dataset_onehot = pd.concat([dataset_drop_entity, onehot_cols], axis=1)

# Move 'entity_value' column to the last position
cols = [col for col in dataset_onehot.columns if col != 'entity_value']  # All columns except 'entity_value'
cols.append('entity_value')  # Add 'entity_value' at the end

# Reorder the DataFrame
dataset_onehot = dataset_onehot[cols]

# Show the first few rows
print(dataset_onehot.head())


                                          image_link  group_id  \
0  https://m.media-amazon.com/images/I/61I9XdN6OF...    748919   
1  https://m.media-amazon.com/images/I/71gSRbyXmo...    916768   
2  https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516   
3  https://m.media-amazon.com/images/I/612mrlqiI4...    459516   
4  https://m.media-amazon.com/images/I/617Tl40LOX...    731432   

  entity_name_depth entity_name_height entity_name_item_volume  \
0               0.0                0.0                     0.0   
1               0.0                0.0                     1.0   
2               0.0                0.0                     0.0   
3               0.0                0.0                     0.0   
4               0.0                0.0                     0.0   

  entity_name_item_weight entity_name_maximum_weight_recommendation  \
0                     1.0                                       0.0   
1                     0.0                                       

In [5]:
print(dataset_onehot.columns)

Index(['image_link', 'group_id', 'entity_name_depth', 'entity_name_height',
       'entity_name_item_volume', 'entity_name_item_weight',
       'entity_name_maximum_weight_recommendation', 'entity_name_voltage',
       'entity_name_wattage', 'entity_name_width', 'entity_value'],
      dtype='object')


In [7]:
dataset_onehot.to_csv("rearranged_dataset2.csv", index=False)

In [8]:
import pandas as pd

# Load your dataset
df = pd.read_csv('rearranged_dataset2.csv')

# Split entity_value into numeric and unit
df[['value', 'unit']] = df['entity_value'].str.extract(r'(\d+\.?\d*)\s*(\D+)')
df = df.drop(columns=['entity_value'])


df.to_csv('updated_file.csv', index=False)

# Show the updated DataFrame
print(df)


                                               image_link  group_id  \
0       https://m.media-amazon.com/images/I/61I9XdN6OF...    748919   
1       https://m.media-amazon.com/images/I/71gSRbyXmo...    916768   
2       https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516   
3       https://m.media-amazon.com/images/I/612mrlqiI4...    459516   
4       https://m.media-amazon.com/images/I/617Tl40LOX...    731432   
...                                                   ...       ...   
263854  https://m.media-amazon.com/images/I/612J1R1xHl...    558806   
263855  https://m.media-amazon.com/images/I/61Blzh2+28...    470067   
263856  https://m.media-amazon.com/images/I/51MsegDL9V...    204245   
263857  https://m.media-amazon.com/images/I/510KhVw4VS...    752266   
263858  https://m.media-amazon.com/images/I/51lzTNLQ-6...    416664   

        entity_name_depth  entity_name_height  entity_name_item_volume  \
0                     0.0                 0.0                      0.0   

In [9]:
import pandas as pd

# Load your dataset
df = pd.read_csv('updated_file.csv')

# Factorize the 'unit' column to assign numeric labels
df['unit_label'] = pd.factorize(df['unit'])[0]

# Save the modified DataFrame (optional)
df.to_csv('updated_file_with_unit_labels.csv', index=False)

# Save the unit label mapping
unit_mapping = pd.factorize(df['unit'])[1]

import pickle
with open('unit_mapping.pkl', 'wb') as file:
    pickle.dump(unit_mapping, file)

# Show the updated DataFrame
print(df)


                                               image_link  group_id  \
0       https://m.media-amazon.com/images/I/61I9XdN6OF...    748919   
1       https://m.media-amazon.com/images/I/71gSRbyXmo...    916768   
2       https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516   
3       https://m.media-amazon.com/images/I/612mrlqiI4...    459516   
4       https://m.media-amazon.com/images/I/617Tl40LOX...    731432   
...                                                   ...       ...   
263854  https://m.media-amazon.com/images/I/612J1R1xHl...    558806   
263855  https://m.media-amazon.com/images/I/61Blzh2+28...    470067   
263856  https://m.media-amazon.com/images/I/51MsegDL9V...    204245   
263857  https://m.media-amazon.com/images/I/510KhVw4VS...    752266   
263858  https://m.media-amazon.com/images/I/51lzTNLQ-6...    416664   

        entity_name_depth  entity_name_height  entity_name_item_volume  \
0                     0.0                 0.0                      0.0   

In [10]:
dataset=pd.read_csv('updated_file_with_unit_labels.csv')

In [12]:
X=dataset.iloc[:, 1:10]
y=dataset.iloc[:, -1]

In [13]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)

In [14]:
from sklearn.tree import DecisionTreeClassifier
classifier=DecisionTreeClassifier(criterion='entropy',random_state=0)
classifier.fit(x_train,y_train)

In [15]:
y_pred=classifier.predict(x_test)

In [16]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[11686     0   764 ...     0     0     0]
 [    0     0     0 ...     0     0     0]
 [  607     0   990 ...     0     0     0]
 ...
 [    5     0     0 ...     0     0     0]
 [    0     0     0 ...     0    29     0]
 [    0     0     0 ...     0     0     0]]


0.6143977867050708

In [17]:
import pickle

# Save the encoder (ColumnTransformer) after training
with open('encoder.pkl', 'wb') as file:
    pickle.dump(ct, file)

# Save the trained SVM model
with open('svm_model.pkl', 'wb') as file:
    pickle.dump(classifier, file)

## add the group_id and entity_name in this at last

In [29]:
import pandas as pd
import pickle

# Load the saved encoder, SVM model, and unit mapping
with open('encoder.pkl', 'rb') as file:
    ct = pickle.load(file)

with open('svm_model.pkl', 'rb') as file:
    classifier = pickle.load(file)

with open('unit_mapping.pkl', 'rb') as file:
    unit_mapping = pickle.load(file)

# Function to make predictions based on manual input
def predict_unit(group_id_value, entity_name_value):
    # Create a DataFrame with the manual input, including all required columns
    manual_input = {
        'group_id': [group_id_value],       # Use group_id here
        'entity_name': [entity_name_value],
        'entity_value': [None],
        'image_link': [None],               # Add placeholder values for missing columns
        'product_id': [None]                # Add placeholder values for missing columns
    }
    single_row_df = pd.DataFrame(manual_input)

    # Apply one-hot encoding on the 'entity_name' column of the manual input
    encoded_row = ct.transform(single_row_df)

    # Get the encoded column names (same as training)
    encoded_column_names = ct.transformers_[0][1].get_feature_names_out(['entity_name'])

    # Create DataFrame for encoded entity_name with proper column names
    encoded_entity_name_df = pd.DataFrame(encoded_row[:, :len(encoded_column_names)], columns=encoded_column_names)

    # Combine 'group_id' and the encoded columns
    group_id_df = pd.DataFrame(single_row_df[['group_id']].values, columns=['group_id'])

    # Prepare the final input row for prediction by combining 'group_id' and encoded columns
    input_data = pd.concat([group_id_df, encoded_entity_name_df], axis=1)

    # Predict the unit label using the classifier
    predicted_unit_label = classifier.predict(input_data)

    # Map the predicted numeric label to the original unit name
    predicted_unit = unit_mapping[predicted_unit_label[0]]

    return predicted_unit

# Example usage
if __name__ == "__main__":
    group_id_input = 281678  # Replace with your input
    entity_name_input = 'item_weight'  # Replace with your input

    predicted_unit = predict_unit(group_id_input, entity_name_input)
    print(f"Predicted unit is {predicted_unit}")


Predicted unit is gram


In [73]:
type(predicted_unit)

str

In [75]:
import cv2 
import pytesseract

In [77]:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'

## Add the link to the image here

In [79]:
import requests

def download_image(image_url):
    save_path = "test_image.jpg"  # Constant save path
    try:
        # Send a GET request to fetch the image
        response = requests.get(image_url)
        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Write the image content to a file
            with open(save_path, 'wb') as file:
                file.write(response.content)
            print(f"Image successfully downloaded: {save_path}")
        else:
            print(f"Failed to retrieve image. Status code: {response.status_code}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
image_url = 'https://m.media-amazon.com/images/I/81dzao1Ob4L.jpg'  # Replace with your image URL
download_image(image_url)


Image successfully downloaded: test_image.jpg


In [80]:
import cv2
img = cv2.imread("test_image.jpg")
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Apply Gaussian Blur to remove noise
gray = cv2.GaussianBlur(gray, (5, 5), 0)

# Apply Otsu's thresholding after Gaussian filtering
_, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

# Use dilation to strengthen the text and erode to remove noise
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
dilate = cv2.dilate(thresh, kernel, iterations=1)
erode = cv2.erode(dilate, kernel, iterations=1)

# Resize the image
img_resized = cv2.resize(thresh, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

edges = cv2.Canny(gray, 100, 200)

import pytesseract
from PIL import Image

# Convert the OpenCV image to a format that Tesseract can understand
preprocessed_image = Image.fromarray(thresh)

# Extract text using Tesseract
text = pytesseract.image_to_string(preprocessed_image)

print(text)


PACKAGING CHANGED

a
=
orien
Comary bitin |
=
== =
es

pel = ee
SaaS. SERS

BEFORE

ae he wore witht ood yu 0 of he herb keeps blood pressure «conrad bobo keeps
‘We wark with o wnble network of fermen ond hormond balance

DIRECTION OF USE
Sem medcing herb spearmint con be mode into herbal

Meee tie etn

‘Coetnn 1 Paden |
Schtemthy topsite pete oop

Hor . Neda. GOO same wihin 2 eesthe chor opening th packet
Fer cotemser feedback a NET WEIGHT : 100 g (3.53 oz]
Sagres ay Bit
eoreverichengonics crm A] MRP Rs. :
i lendboak com serichengrnics/ .
@ bechagrencom/ erick ovyenica/| BATCH NO. :

dhnigp 0 puri even, .
Merten tcetin genres PRD:

E r
lb 8 jE
tila USE BY :

AFTER

Note: Packaging may vary for a brief period of time.



In [81]:
type(text)

str

In [97]:
import random
# Extract all numbers (integer and float) from the string
numbers = re.findall(r'\d+\.?\d*', text)

# Convert them to float
numbers = [float(num) for num in numbers]

# Randomly select one number
selected_number = random.choice(numbers)

# Concatenate with your predicted unit
result = f"{selected_number} {predicted_unit}"



# Result

In [1]:
import requests
from PIL import Image
from io import BytesIO
import numpy as np
import cv2
import pytesseract
import re

def process_and_extract_integers(image_url):
    """
    Downloads an image from the provided URL, preprocesses it (resizes and converts to grayscale),
    and then extracts all integer values from the image using OCR. If no integers are found, it returns None.
    """
    try:
        # Step 1: Download the image
        response = requests.get(image_url)
        if response.status_code == 200:
            img = Image.open(BytesIO(response.content))
        else:
            raise Exception(f"Failed to download image. Status code: {response.status_code}")
        
        # Step 2: Preprocess the image
        # Convert the Pillow image to a NumPy array
        image_np = np.array(img)

        # Convert from RGB to BGR for OpenCV
        image_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)

        # Resize the image to 500x500 pixels
        image_resized = cv2.resize(image_bgr, (500, 500))

        # Convert to grayscale
        gray_image = cv2.cvtColor(image_resized, cv2.COLOR_BGR2GRAY)

        # Step 3: Extract integers using OCR
        # Perform OCR on the grayscale image
        text = pytesseract.image_to_string(gray_image)

        # Define a pattern to find integers
        integer_pattern = r'\d+'

        # Find all integers in the OCR extracted text
        matches = re.findall(integer_pattern, text)

        # Convert matches to integers
        integers = [int(match) for match in matches]

        if integers:
            # Calculate and return the average of the integers
            return sum(integers) / len(integers)
        else:
            # Return None if no integers are found
            return None

    except Exception as e:
        print(f"Error processing image: {e}")
        return None

In [31]:
def process_csv(input_csv_path, output_csv_path):
    input_data = pd.read_csv(input_csv_path)
    output_data = []

    for idx,row in input_data.iterrows():
        group_id = row['group_id']
        entity_name = row['entity_name']

        # extracted_value = process_and_extract_integers(image_url)
        # if extracted_value is None:
        #     output_data.append({"index": row['index'], "prediction": None})
        #     continue
        
        prediction = predict_unit(group_id, entity_name)
        output_data.append({"index": row['index'], "prediction": prediction})

    output_df = pd.DataFrame(output_data)
    output_df.to_csv(output_csv_path, index=False)
    print(f"Processing complete. Results saved to {output_csv_path}")

input_csv_path = 'test.csv'
output_csv_path = 'test_out.csv'
            
process_csv(input_csv_path, output_csv_path)

Processing complete. Results saved to test_out.csv
