### Preparation des datasets

In [1]:
import pandas as pd

df_text_test_embeddings = pd.read_csv('final_text_test_embeddings_flattened.csv', index_col=0)

In [7]:
df_test_identifiers = df_text_test_embeddings[['imageid', 'productid']]

In [8]:
# Chargement des fichiers "X_train_uptade.csv" et "Y_trainCVw08PX.csv"
df_1 = pd.read_csv('X_train.csv', index_col=0)
df_2 = pd.read_csv('y_train.csv', index_col=0)

# Fusion avec merge des deux datasets
df_classes = pd.merge(df_1, df_2, left_index = True, right_index = True)

df_classes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84916 entries, 0 to 84915
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   designation  84916 non-null  object
 1   description  55116 non-null  object
 2   productid    84916 non-null  int64 
 3   imageid      84916 non-null  int64 
 4   prdtypecode  84916 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 5.9+ MB


In [9]:
df_classes = df_classes[['imageid', 'productid', 'prdtypecode']]

In [11]:
# création de Nom Image et lien
df_classes['Nom image'] = ['image_' + str(imageid) + '_product_' + str(productid) + '.jpg' for imageid, productid in zip(df_classes['imageid'], df_classes['productid'])]

In [12]:
path = './datasets/images_train_upscalled'
df_classes['lien'] = str(path) + '/' + df_classes['prdtypecode'].astype(str)+ '/' + df_classes['Nom image']

In [13]:
# Fusionner df_classes avec df_test_identifiers en utilisant la colonne 'imageid' pour récupérer 'prdtypecode'
df_classes_test = df_classes.merge(df_test_identifiers['imageid'], on='imageid', how='inner')

In [14]:
df_classes_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16984 entries, 0 to 16983
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   imageid      16984 non-null  int64 
 1   productid    16984 non-null  int64 
 2   prdtypecode  16984 non-null  int64 
 3   Nom image    16984 non-null  object
 4   lien         16984 non-null  object
dtypes: int64(3), object(2)
memory usage: 663.6+ KB


In [15]:
# Ajout du mapping des classes pour les images
classe_images = pd.read_csv('class_images_mapping.csv')
classe_images.rename(columns={'Class Name': 'prdtypecode', 'label': 'class_image'}, inplace=True)

In [16]:
classe_images

Unnamed: 0,prdtypecode,Label
0,10,0
1,1140,1
2,1160,2
3,1180,3
4,1280,4
5,1281,5
6,1300,6
7,1301,7
8,1302,8
9,1320,9


### Preparation des images 

In [17]:
import shutil
import os

# On ajuste le lien au nouveau lien pour les données de test
df_classes_test['lien_test'] = df_classes_test['lien'].str.replace('images_train_upscalled', 'images_test_upscalled')

# On créer les dossiers et fichiers pour le test !!!! (si déjà fait pas besoin de re-run ce code)
def copy_images(row):
    os.makedirs(os.path.dirname(row['lien_test']), exist_ok=True)
    shutil.copy(row['lien'], row['lien_test'])

# on applique la fonction pour chaque ligne du dataframe
df_classes_test.apply(copy_images, axis=1)
df_classes_test.to_csv("./final_image_test_dataset.csv") 


### Chargement du model

In [18]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from transformers import ViTForImageClassification, ViTConfig

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# load du model
def load_model(filepath, device='cpu'):
    device = torch.device(device)
    config = ViTConfig.from_pretrained('google/vit-base-patch16-224-in21k', num_labels=27, output_hidden_states=True)
    model = ViTForImageClassification(config)
    try:
        checkpoint = torch.load(filepath, map_location=device)
        model.load_state_dict(checkpoint['state_dict'])
    except KeyError:
        model.load_state_dict(torch.load(filepath, map_location=device))
    model.to(device)
    return model

model = load_model("final_model_image.pth")

In [20]:
dataset_path = "C:/Users/tgp/Documents/kaggle/Rakuten/Github_final/4_Prediction_catégories/datasets/images_test_upscalled"
transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
test_dataset = datasets.ImageFolder(dataset_path, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [21]:
def get_embeddings_and_predictions(model, dataloader):
    model.eval()
    embeddings, predictions, paths = [], [], []
    with torch.no_grad():
        for batch_idx, (images, labels) in enumerate(dataloader):
            images = images.to(device)
            outputs = model(images)
            embeddings.extend(outputs.hidden_states[-1][:, 0, :].cpu().numpy())  # Extract the CLS token embeddings from last hidden state
            _, preds = torch.max(outputs.logits, 1)
            predictions.extend(preds.cpu().numpy())
            start_index = batch_idx * dataloader.batch_size
            # récupération des chemins de chaque image (pour les utiliser comme identifiers)
            batch_paths = [dataloader.dataset.samples[i][0] for i in range(start_index, start_index + len(labels))]
            paths.extend(batch_paths)
    return embeddings, predictions, paths

embeddings, predictions, paths = get_embeddings_and_predictions(model, test_loader)

embeddings_df = pd.DataFrame(embeddings)
embeddings_df['predictions'] = predictions
embeddings_df['path'] = paths

embeddings_df.info()
embeddings_df.to_csv("final_image_test_embeddings.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16984 entries, 0 to 16983
Columns: 770 entries, 0 to path
dtypes: float32(768), int64(1), object(1)
memory usage: 50.0+ MB


In [22]:
embeddings_df.columns

Index([            0,             1,             2,             3,
                   4,             5,             6,             7,
                   8,             9,
       ...
                 760,           761,           762,           763,
                 764,           765,           766,           767,
       'predictions',        'path'],
      dtype='object', length=770)

In [23]:
df_text_test_embeddings.columns

Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       ...
       '44992', '44993', '44994', '44995', '44996', '44997', '44998', '44999',
       'imageid', 'productid'],
      dtype='object', length=45001)

In [24]:
embeddings_df['Nom image'] = embeddings_df['path'].str.extract(r'.*\\(image_.*)')

# Split the 'Nom image' column to extract 'imageid' and 'productid'
split_columns = embeddings_df['Nom image'].str.split('_', expand=True)

# Create 'imageid' and 'productid' columns in embeddings_df
embeddings_df['imageid'] = split_columns[1]
embeddings_df['productid'] = split_columns[3].str.replace('.jpg', '')

embeddings_df.drop(columns=['path'], inplace=True)

In [25]:
embeddings_df.columns

Index([            0,             1,             2,             3,
                   4,             5,             6,             7,
                   8,             9,
       ...
                 762,           763,           764,           765,
                 766,           767, 'predictions',   'Nom image',
           'imageid',   'productid'],
      dtype='object', length=772)

In [27]:
test_image_predictions = embeddings_df[['imageid', 'productid', 'Nom image', 'predictions']]
test_image_predictions.to_csv("./final_image_test_predictions.csv") 


In [28]:
df_text_test_embeddings.columns

Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       ...
       '44992', '44993', '44994', '44995', '44996', '44997', '44998', '44999',
       'imageid', 'productid'],
      dtype='object', length=45001)

In [29]:
df_image_test_embeddings = embeddings_df.drop(columns=['predictions', 'Nom image'])


In [30]:
df_image_test_embeddings.columns

Index([          0,           1,           2,           3,           4,
                 5,           6,           7,           8,           9,
       ...
               760,         761,         762,         763,         764,
               765,         766,         767,   'imageid', 'productid'],
      dtype='object', length=770)

In [31]:
df_image_test_embeddings.to_csv("./final_image_test_predictions_clean.csv") 

Il faut préparer le même dataset d'embeddings pour les données d'entrainement. Mais pour celà il faut déjà préparer les images

In [32]:
df_classes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84916 entries, 0 to 84915
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   imageid      84916 non-null  int64 
 1   productid    84916 non-null  int64 
 2   prdtypecode  84916 non-null  int64 
 3   Nom image    84916 non-null  object
 4   lien         84916 non-null  object
dtypes: int64(3), object(2)
memory usage: 5.9+ MB


In [33]:
df_classes_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16984 entries, 0 to 16983
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   imageid      16984 non-null  int64 
 1   productid    16984 non-null  int64 
 2   prdtypecode  16984 non-null  int64 
 3   Nom image    16984 non-null  object
 4   lien         16984 non-null  object
 5   lien_test    16984 non-null  object
dtypes: int64(3), object(3)
memory usage: 796.2+ KB


In [34]:
# Perform a left merge on df_classes with df_classes_test using 'imageid' to find common entries
merged_df = df_classes.merge(df_classes_test[['imageid']], on='imageid', how='left', indicator=True)

# Filter rows where 'imageid' does not exist in df_classes_test
result_df = merged_df[merged_df['_merge'] == 'left_only']

# Drop the indicator column as it's no longer needed
result_df = result_df.drop(columns=['_merge'])

# Create the new DataFrame with rows from df_classes excluding those found in df_classes_test
df_exclusive_classes = result_df.copy()


In [35]:
df_exclusive_classes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 67932 entries, 0 to 84914
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   imageid      67932 non-null  int64 
 1   productid    67932 non-null  int64 
 2   prdtypecode  67932 non-null  int64 
 3   Nom image    67932 non-null  object
 4   lien         67932 non-null  object
dtypes: int64(3), object(2)
memory usage: 3.1+ MB


### Preparation des images

In [37]:
import shutil
import os

# On ajuste le lien au nouveau lien pour les données de test
df_exclusive_classes['lien_train_final'] = df_exclusive_classes['lien'].str.replace('images_train_upscalled', 'images_train_upscalled_final')

# On créer les dossiers et fichiers pour le test !!!! (si déjà fait pas besoin de re-run ce code)
def copy_images(row):
    os.makedirs(os.path.dirname(row['lien_train_final']), exist_ok=True)
    shutil.copy(row['lien'], row['lien_train_final'])

# on applique la fonction pour chaque ligne du dataframe
df_exclusive_classes.apply(copy_images, axis=1)
df_exclusive_classes.to_csv("./final_image_train_dataset.csv") 

In [38]:
df_exclusive_classes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 67932 entries, 0 to 84914
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   imageid           67932 non-null  int64 
 1   productid         67932 non-null  int64 
 2   prdtypecode       67932 non-null  int64 
 3   Nom image         67932 non-null  object
 4   lien              67932 non-null  object
 5   lien_train_final  67932 non-null  object
dtypes: int64(3), object(3)
memory usage: 3.6+ MB


In [39]:
dataset_path = "C:/Users/tgp/Documents/kaggle/Rakuten/Github_final/4_Prediction_catégories/datasets/images_train_upscalled_final"
transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
test_dataset = datasets.ImageFolder(dataset_path, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [40]:
embeddings, predictions, paths = get_embeddings_and_predictions(model, test_loader)

embeddings_df_train = pd.DataFrame(embeddings)
embeddings_df_train['predictions'] = predictions
embeddings_df_train['path'] = paths

embeddings_df_train.info()
embeddings_df_train.to_csv("final_image_train_embeddings.csv")

: 

# END

###

###

###

###

###

###

###

###

###