In [3]:
import os
import zipfile

# Path to the zip file
zip_file_path = '/content/archive.zip'

# Extract the zip file
extract_dir = 'extracted_data'
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# Inspect the extracted files
print(os.listdir(extract_dir))


['student_resource 3']


In [6]:
import cv2
import pandas as pd
import numpy as np
import os

# Import the download_images function (adjust the path if needed)
from extracted_data.student_resource_3.src.utils import download_images

# Load the train and test data
train_df = pd.read_csv('/content/extracted_data/student_resource_3/dataset/train.csv')
test_df = pd.read_csv('/content/extracted_data/student_resource_3/dataset/test.csv')

# Select the first 2500 rows for processing
train_df = train_df.head(200)
test_df = test_df.head(200)

# Create the directory for images if it doesn't exist
image_dir = 'images'
os.makedirs(image_dir, exist_ok=True)

# Download images and store paths (for the first 2500 images)
train_df['image_path'] = train_df['image_link'].apply(lambda x: download_images(x, image_dir))
test_df['image_path'] = test_df['image_link'].apply(lambda x: download_images(x, image_dir))

# Preprocess images
def preprocess_image(image_path, img_size=(224, 224)):
    image = cv2.imread(image_path)
    if image is None:  # Handle missing images
        return np.zeros((224, 224, 3))  # Return a blank image if the download failed
    image = cv2.resize(image, img_size)
    image = image / 255.0  # Normalize to [0, 1]
    return image

# Apply preprocessing to the first 2500 images
train_df['processed_image'] = train_df['image_path'].apply(preprocess_image)
test_df['processed_image'] = test_df['image_path'].apply(preprocess_image)

# Check the results for the first 5 rows
print(train_df[['image_path', 'processed_image']].head())
print(test_df[['image_path', 'processed_image']].head())


100%|██████████| 51/51 [00:11<00:00,  4.40it/s]
100%|██████████| 51/51 [00:12<00:00,  4.06it/s]
100%|██████████| 51/51 [00:10<00:00,  4.86it/s]
100%|██████████| 51/51 [00:11<00:00,  4.27it/s]
100%|██████████| 51/51 [00:11<00:00,  4.55it/s]
100%|██████████| 51/51 [00:10<00:00,  4.90it/s]
100%|██████████| 51/51 [00:11<00:00,  4.43it/s]
100%|██████████| 51/51 [00:11<00:00,  4.60it/s]
100%|██████████| 51/51 [00:10<00:00,  4.91it/s]
100%|██████████| 51/51 [00:11<00:00,  4.44it/s]
100%|██████████| 51/51 [00:11<00:00,  4.53it/s]
100%|██████████| 51/51 [00:10<00:00,  4.92it/s]
100%|██████████| 51/51 [00:11<00:00,  4.43it/s]
100%|██████████| 51/51 [00:11<00:00,  4.61it/s]
100%|██████████| 51/51 [00:10<00:00,  4.97it/s]
100%|██████████| 51/51 [00:11<00:00,  4.47it/s]
100%|██████████| 51/51 [00:11<00:00,  4.60it/s]
100%|██████████| 51/51 [00:10<00:00,  4.99it/s]
100%|██████████| 51/51 [00:11<00:00,  4.45it/s]
100%|██████████| 51/51 [00:10<00:00,  4.64it/s]
100%|██████████| 51/51 [00:10<00:00,  4.

  image_path                                    processed_image
0       None  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...
1       None  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...
2       None  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...
3       None  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...
4       None  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...
  image_path                                    processed_image
0       None  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...
1       None  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...
2       None  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...
3       None  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...
4       None  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...


In [8]:
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split

# Convert entity_value to a float value (e.g., '34 gram' to 34.0)
# Handle potential issues in entity_value format
train_df['entity_value'] = train_df['entity_value'].apply(lambda x: float(x.split()[0].replace('[', '').replace(',', '')))

# Train-test split
X = np.array(train_df['processed_image'].tolist())
y = train_df['entity_value'].values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Load pre-trained EfficientNetB0
base_model = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze base model layers
base_model.trainable = False

# Add custom layers
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.5)(x)
predictions = Dense(1, activation='linear')(x)

# Build the model
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20, batch_size=32)

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Epoch 1/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 6s/step - loss: 220454.3906 - mae: 172.5016 - val_loss: 101256.4375 - val_mae: 174.0368
Epoch 2/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3s/step - loss: 131581.1250 - mae: 143.8373 - val_loss: 100480.3672 - val_mae: 172.2880
Epoch 3/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 3s/step - loss: 448782.9375 - mae: 240.4293 - val_loss: 99583.7578 - val_mae: 170.4461
Epoch 4/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3s/step - loss: 201535.5469 - mae: 183.5699 - val_loss: 98587.4219 - val_mae: 168.5580
Epoch 5/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3s/step - loss: 187981.9688 - mae: 186.1573 - val_loss: 97477.6016 - val_mae: 166.6005
Epoch 6/20


<keras.src.callbacks.history.History at 0x792f8ba96890>

In [9]:
# Unfreeze some layers of the base model for fine-tuning
base_model.trainable = True

# Compile with a low learning rate
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5), loss='mean_squared_error', metrics=['mae'])

# Fine-tune the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20, batch_size=32)


Epoch 1/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 10s/step - loss: 455078.2500 - mae: 240.8071 - val_loss: 71660.9844 - val_mae: 168.8307
Epoch 2/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 10s/step - loss: 191150.4531 - mae: 201.1358 - val_loss: 72345.0000 - val_mae: 167.1862
Epoch 3/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 10s/step - loss: 308650.1250 - mae: 214.8815 - val_loss: 73361.1406 - val_mae: 165.3987
Epoch 4/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 10s/step - loss: 218654.7031 - mae: 202.2210 - val_loss: 74203.5312 - val_mae: 164.1976
Epoch 5/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 9s/step - loss: 178995.2812 - mae: 187.9624 - val_loss: 75096.0781 - val_mae: 163.0875
Epoch 6/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 9s/step - loss: 249322.1094 - mae: 206.3413 - val_loss: 75941.8516 - val_mae: 162.1429
Epoch 7/20
[1m5/5[0m [32m━

<keras.src.callbacks.history.History at 0x79301b53dd50>

In [10]:
# Predict on the test set
X_test = np.array(test_df['processed_image'].tolist())
predictions = model.predict(X_test)

# Map predictions to appropriate units from constants.py
allowed_units = {
    "width": "centimetre", "height": "metre", "item_weight": "gram", "item_volume": "litre", "entity_name": "gram", "entity_value": "gram", "image_link": "image_link", "image_path": "image_path", "processed_image": "processed_image", "prediction": "prediction", "index": "index", "image_name": "image_name"
}

# Format predictions
test_df['prediction'] = [f"{pred[0]:.2f} {allowed_units['entity_name']}" for pred in predictions]

# Save predictions to a CSV file
output = test_df[['index', 'prediction']]
output.to_csv('test_out.csv', index=False)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2s/step


In [11]:
!python /content/extracted_data/student_resource_3/src/sanity.py test_out.csv


Traceback (most recent call last):
  File "/content/extracted_data/student_resource_3/src/sanity.py", line 6, in <module>
    from utils import parse_string
  File "/content/extracted_data/student_resource_3/src/utils.py", line 2, in <module>
    from extracted_data.student_resource_3.src import constants
ModuleNotFoundError: No module named 'extracted_data'


In [12]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Assume you already have preprocessed data in train_df and test_df
# Split data into features (processed images) and target (entity_value)
X = np.stack(train_df['processed_image'].values)
y = train_df['entity_value'].values  # Assuming these are numeric values like weights, dimensions, etc.

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize target variable (entity_value) to [0, 1] for regression
y_train = (y_train - np.min(y_train)) / (np.max(y_train) - np.min(y_train))
y_val = (y_val - np.min(y_val)) / (np.max(y_val) - np.min(y_val))

# Load a pre-trained model (e.g., EfficientNet or ResNet)
base_model = tf.keras.applications.EfficientNetB0(include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False  # Freeze the base model

# Build a simple model
model = tf.keras.Sequential([
    base_model,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1)  # Output layer for regression
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5)

# Make predictions on the validation set
y_pred = model.predict(X_val)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_val, y_pred)

# Scale accuracy from 0 to 1 (1 means perfect prediction)
max_mae = np.max(y_val) - np.min(y_val)  # The maximum possible error
scaled_accuracy = 1 - (mae / max_mae)

print(f'Scaled accuracy: {scaled_accuracy:.4f}')


Epoch 1/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 3s/step - loss: 0.0255 - mae: 0.1153 - val_loss: 0.0813 - val_mae: 0.2122
Epoch 2/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2s/step - loss: 0.0256 - mae: 0.0949 - val_loss: 0.0385 - val_mae: 0.1164
Epoch 3/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 3s/step - loss: 0.0130 - mae: 0.0792 - val_loss: 0.0423 - val_mae: 0.1141
Epoch 4/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 3s/step - loss: 0.0063 - mae: 0.0433 - val_loss: 0.0529 - val_mae: 0.1288
Epoch 5/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3s/step - loss: 0.0081 - mae: 0.0388 - val_loss: 0.0422 - val_mae: 0.1141
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3s/step
Scaled accuracy: 0.8859
