In [3]:
import sys
import os
import pandas as pd

# Get the parent directory where the src folder is located
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Add the parent directory to sys.path if it's not already there
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

# Now import the functions and constants from src.Files
from src.utils import download_images
from src.constants import allowed_units

In [4]:
# Load the training data
train_df = pd.read_csv('/Users/pragunisanotra/Desktop/Amazon ML Challenge/notebook/dataset/train.csv')


In [5]:
# Display the first few rows of the dataset to understand its structure
print(train_df.head())


                                          image_link  group_id  entity_name  \
0  https://m.media-amazon.com/images/I/61I9XdN6OF...    748919  item_weight   
1  https://m.media-amazon.com/images/I/71gSRbyXmo...    916768  item_volume   
2  https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516  item_weight   
3  https://m.media-amazon.com/images/I/612mrlqiI4...    459516  item_weight   
4  https://m.media-amazon.com/images/I/617Tl40LOX...    731432  item_weight   

     entity_value  
0      500.0 gram  
1         1.0 cup  
2      0.709 gram  
3      0.709 gram  
4  1400 milligram  


In [6]:
# Step 1: Data Preprocessing
# Convert entity_value to lowercase to standardize the format
train_df['entity_value'] = train_df['entity_value'].str.lower()


In [10]:
# Basic unit cleaning - extract numeric values and units
import re

def extract_value_and_unit(entity_value):
    if pd.isna(entity_value):
        return None, None

    # Use regex to extract numbers and words separately
    parts = re.findall(r'[\d.]+|\w+', entity_value)

    if len(parts) < 2:
        return None, None

    try:
        value = float(parts[0])  # Convert the first part to a float
        unit = ' '.join(parts[1:])  # Join the remaining parts as the unit
        return value, unit
    except ValueError:
        return None, None

# Apply the function to the entity_value column
train_df['value'], train_df['unit'] = zip(*train_df['entity_value'].apply(extract_value_and_unit))

In [44]:
import os
import requests
import pandas as pd
from urllib.parse import urlparse

# Define the function to download a single image
def download_image(url, download_folder):
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Check for HTTP errors
        
        filename = os.path.join(download_folder, url.split("/")[-1])
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Downloaded: {url}")
    except Exception as e:
        print(f"Error downloading {url}: {e}")

def is_valid_url(url):
    """Check if the URL is valid and not empty."""
    try:
        parsed_url = urlparse(url)
        return all([parsed_url.scheme, parsed_url.netloc])
    except Exception:
        return False

if __name__ == "_main_":
    # Load your DataFrame here (adjust the path as needed)
    # Example: train_df = pd.read_csv('path_to_your_csv_file.csv')
    
    # Define the folder to save images
    download_folder = '/Users/pragunisanotra/Desktop/Amazon ML Challenge/images'

    # Create the folder if it doesn't exist
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    # Access the 'image_link' column
    urls = train_df['image_link']

    # Filter out invalid URLs
    valid_urls = [url for url in urls if is_valid_url(url)]

    # Download images one by one
    for url in valid_urls:
        download_image(url, download_folder)

In [45]:
# Step 3: Model Training
# Example: A simple heuristic model based on statistical methods
# (Replace with a machine learning model as required)

# Create a mapping from entity_name to the most common unit in training data
entity_unit_mapping = train_df.groupby('entity_name')['unit'].agg(lambda x: x.value_counts().idxmax())

# Save mapping to use during prediction
entity_unit_mapping.to_csv('output/entity_unit_mapping.csv')



In [46]:
import pandas as pd

# Define the data to insert into the validation.csv file with local file paths
data = {
    'image_link': [
        '/Users/pragunisanotra/Desktop/Amazon ML Challenge/notebook/train_images/image_1.jpg',
        '/Users/pragunisanotra/Desktop/Amazon ML Challenge/notebook/train_images/image_2.jpg',
        '/Users/pragunisanotra/Desktop/Amazon ML Challenge/notebook/train_images/image_0.jpg'
    ],
    'entity_value': [
        '50 gram',
        '100 gram',
        '75 gram'
    ]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Path to the validation file
validation_file_path = '/Users/pragunisanotra/Desktop/Amazon ML Challenge/notebook/dataset/validation.csv'

# Save the DataFrame to a CSV file
df.to_csv(validation_file_path, index=False)

print(f"Validation file created at {validation_file_path} with sample data.")

Validation file created at /Users/pragunisanotra/Desktop/Amazon ML Challenge/notebook/dataset/validation.csv with sample data.


In [64]:
import pandas as pd
import numpy as np  # Add this import for numpy
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight

# Load your validation data
validation_file_path = '/Users/pragunisanotra/Desktop/Amazon ML Challenge/notebook/dataset/validation.csv'
df = pd.read_csv(validation_file_path)

# Preprocess the data
le = LabelEncoder()
df['entity_value'] = le.fit_transform(df['entity_value'])

# Check for missing or invalid data in 'image_link' or 'entity_value'
df = df.dropna(subset=['image_link', 'entity_value'])  # Remove rows with missing values

# Feature Engineering: Using length of image_link as a dummy feature for now
# Note: Replace this with more meaningful features in the future
X = df['image_link'].str.len().values.reshape(-1, 1)  # Dummy feature
y = df['entity_value']

# Handle class imbalance by computing class weights based on the encoded labels
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y), y=y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train a model with balanced class weights
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate metrics
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)

print(f"F1 Score: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

F1 Score: 0.00
Precision: 1.00
Recall: 0.00


In [65]:
import joblib
import json
import os

# Define the trained model
model = None  # Replace with your actual trained model

# Save the trained model
joblib.dump(model, 'path_to_save_your_model.pkl')

# Define and save entity-unit mapping
entity_unit_mapping = {
    'item_weight': 'gram',
    'length': 'centimetre',
    # Add other mappings as needed
}

with open('entity_unit_mapping.json', 'w') as f:
    json.dump(entity_unit_mapping, f)

print("Training complete. Artifacts saved.")

Training complete. Artifacts saved.
