<a href="https://colab.research.google.com/github/Priyojit02/kaggle-stumbleupon-stylumia/blob/master/notebooka4185c31b2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of '<directory>:<percent-encoded download URL>' entries;
# the download loop below splits each entry on ':' and URL-decodes the second part.
# NOTE(review): the URL carries X-Goog-Date=20240318T165053Z and
# X-Goog-Expires=259200, so it presumably stops working after that window —
# confirm and regenerate the link if downloads fail.
DATA_SOURCE_MAPPING = 'extracting-attributes-from-fashion-images-jan-2024:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F67553%2F7504710%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240318%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240318T165053Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1e3c953f14bc3acdffcce0a817edf27a76c47b95cb7616b69416a13be83c0df333c25d2b78322a89fc619b8897cbceb4a294936e969ab6343a3ea6535aa3afee1b65606195298a2bb41b1ad301a80022066e60d6a94055956c9d6e8936399b7567700794806cdbfbbe3a2ca636969defade01f35cd10dffab6e70a137d810368bd108ccab32d786a6a6ced5ccf1e9cd0bedb832b9f2fe3b85374164b0aa2a6749578046282ac2c53d31e3550b27b0d4bfcd56097cd454eaa1b5354e8aff093913c39b06b4ba0a054b518aeee8cf275ec01d4b1ad7a08480e23cf80c64b28f926220775f170958cef1b0b738c2a47a766cc7822ff520ffc6b93aab56ec496d122'

# Directory that downloaded datasets are extracted under.
KAGGLE_INPUT_PATH='/kaggle/input'
# Directory created alongside the input directory for notebook output.
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced anywhere in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every '<directory>:<encoded url>' entry of DATA_SOURCE_MAPPING and
# unpack it under KAGGLE_INPUT_PATH/<directory>. A source that fails to
# download is reported and skipped so the remaining ones are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only: everything after it belongs to the URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; when it is absent or malformed we
            # can still download, we just cannot draw a progress bar.
            content_length = fileres.headers['content-length']
            if content_length is not None and content_length.strip().isdigit():
                total_length = int(content_length)
            else:
                total_length = 0
            size_text = content_length if content_length is not None else 'unknown'
            print(f'Downloading {directory}, {size_text} bytes compressed')

            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while data:
                dl += len(data)
                tfile.write(data)
                if total_length > 0:
                    # Clamp so a body longer than advertised cannot overflow the bar.
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)

            # tarfile re-opens the temp file by name, so buffered bytes must
            # reach the disk before extraction starts.
            tfile.flush()

            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would
                # replace the module and break every later tar source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
from tensorflow.keras import models, layers, callbacks
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np
import os
import cv2
import pandas as pd

# Function to load data from a given directory
def load_data(directory_path, csv_path, image_size=(180, 180)):
    """Load the labelled images listed in a CSV file into NumPy arrays.

    Every image is read as grayscale, resized to ``image_size`` and expanded
    to three identical channels so it matches a 3-channel model input.

    Args:
        directory_path: Directory that contains the image files.
        csv_path: CSV file with a ``file_name`` and a ``label`` column.
        image_size: ``(width, height)`` handed to ``cv2.resize``.

    Returns:
        A tuple ``(X, y)``: the stacked images and their labels, both in the
        row order of the CSV file.

    Raises:
        ValueError: If the directory does not exist, a listed image is
            missing or cannot be decoded, or the CSV lists no images.
    """
    df = pd.read_csv(csv_path)

    if not os.path.exists(directory_path):
        raise ValueError(f"The specified directory '{directory_path}' does not exist.")

    X = []  # List to store images
    y = []  # List to store labels

    for file_name, label in zip(df['file_name'], df['label']):
        img_path = os.path.join(directory_path, file_name)

        # cv2.imread signals failure by returning None instead of raising,
        # so report unusable files here rather than failing inside cv2.resize.
        if not os.path.isfile(img_path):
            raise ValueError(f"Image file '{img_path}' listed in '{csv_path}' does not exist.")
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            raise ValueError(f"Image file '{img_path}' could not be decoded.")

        img_resized = cv2.resize(img, image_size)

        # Convert to RGB format by duplicating the single channel
        X.append(cv2.cvtColor(img_resized, cv2.COLOR_GRAY2RGB))
        y.append(label)

    if not X or not y:
        raise ValueError(f"No images found in the specified directory '{directory_path}'.")

    return np.array(X), np.array(y)

# Read every training image and its label into memory
data_directory = '/kaggle/input/extracting-attributes-from-fashion-images-jan-2024/train'
csv_path = '/kaggle/input/extracting-attributes-from-fashion-images-jan-2024/train.csv'
X, y = load_data(directory_path=data_directory, csv_path=csv_path)

# Hold out 20% of the samples; the fixed seed makes the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Augmentation settings; ResNet50's own input preprocessing is applied to
# every generated image through preprocessing_function
augmentation_options = dict(
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    preprocessing_function=tf.keras.applications.resnet50.preprocess_input,
)
datagen = ImageDataGenerator(**augmentation_options)

# Create ResNet50-based model (ImageNet weights, without the classification head)
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(180, 180, 3))

# Fine-tuning: Keras layers are trainable by default, so setting
# trainable=True here changed nothing. Freeze everything except the last
# 10 layers so that only those (and the new head) are updated.
for layer in base_model.layers[:-10]:
    layer.trainable = False

model = models.Sequential()
model.add(base_model)
model.add(layers.GlobalAveragePooling2D())

# Classification head: three Dense -> BatchNorm -> Dropout stages of
# decreasing width.
for units in (256, 128, 64):
    model.add(layers.Dense(units, activation='relu'))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(0.2))

# Output layer: count classes over ALL labels so a class that landed only in
# the hold-out split still gets an output unit.
# NOTE(review): sparse_categorical_crossentropy assumes the labels are the
# integers 0..num_classes-1 — confirm against train.csv.
num_classes = len(np.unique(y))
model.add(layers.Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Learning rate scheduling: multiply the learning rate by 0.1 after 3 epochs
# without val_loss improvement, never going below 1e-6
lr_scheduler = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=3,
    min_lr=1e-6,
)

# Model checkpointing: keep only the model with the lowest val_loss
# NOTE(review): the path is a directory; some Keras versions may require a
# file path with a model-format extension here — confirm for the installed version.
checkpoint_filepath = '/kaggle/working/'
checkpoint_options = dict(
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1,
)
model_checkpoint = callbacks.ModelCheckpoint(checkpoint_filepath, **checkpoint_options)

# The generator runs every training batch through ResNet50's preprocess_input.
# Give the hold-out images the same treatment; otherwise val_loss (which
# drives LR scheduling and checkpoint selection) and the final accuracy are
# computed on raw 0-255 pixels the network never saw during training.
X_val = tf.keras.applications.resnet50.preprocess_input(X_test.astype('float32'))

# Train the model with data augmentation.
# datagen.fit() is not called: it only computes feature-wise statistics, and
# none of those options are enabled on the generator.
history = model.fit(
    datagen.flow(X_train, y_train, batch_size=32),
    epochs=30,
    validation_data=(X_val, y_test),
    callbacks=[lr_scheduler, model_checkpoint]
)

# Load the best model (lowest val_loss) written by the checkpoint callback
model = tf.keras.models.load_model(checkpoint_filepath)

# Make predictions on the hold-out set
predictions = model.predict(X_val)
predicted_labels = tf.argmax(predictions, axis=1).numpy()

# Evaluate accuracy
accuracy = accuracy_score(y_test, predicted_labels)
print(f'Accuracy: {accuracy}')


In [None]:
abs_train_path = '/kaggle/input/extracting-attributes-from-fashion-images-jan-2024/train.csv'
abs_train_directory = '/kaggle/input/extracting-attributes-from-fashion-images-jan-2024/train'
abs_test_path = '/kaggle/input/extracting-attributes-from-fashion-images-jan-2024/sample_submission.csv'
abs_test_directory = '/kaggle/input/extracting-attributes-from-fashion-images-jan-2024/test'

# The sample submission lists the test files to predict, in submission order
df2 = pd.read_csv(abs_test_path)

X_test = []

# Same image preparation as load_data: read as grayscale, resize to 180x180,
# then duplicate the single channel into 3-channel RGB
for i, file_name in enumerate(df2['file_name']):
    img_path = os.path.join(abs_test_directory, file_name)
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread returns None for a missing or undecodable file; every listed
    # file needs a prediction, so stop with a clear message instead.
    if img is None:
        raise ValueError(f"Test image '{img_path}' could not be read.")

    img_resized = cv2.resize(img, (180, 180))
    X_test.append(cv2.cvtColor(img_resized, cv2.COLOR_GRAY2RGB))

    # Progress indicator, once every 100 images
    if i % 100 == 0:
        print(i)

X_test_np = np.array(X_test)


In [None]:
# The model was trained on batches passed through ResNet50's preprocess_input
# (configured on the ImageDataGenerator), so the test images need the same
# transform before prediction; raw 0-255 pixels would be out of distribution.
X_test_ready = tf.keras.applications.resnet50.preprocess_input(X_test_np.astype('float32'))
predictions = model.predict(X_test_ready)
# Index of the highest-probability class for each image
predicted_labels = tf.argmax(predictions, axis=1).numpy()

In [None]:
import pandas as pd

# Pair every test file name with its predicted class index
df_output = df2[['file_name']].copy()
df_output['label'] = predicted_labels

# Write the two columns to output4.csv without the index column
df_output.to_csv('output4.csv', index=False)