In [29]:
%env KERAS_BACKEND=torch
import keras 

env: KERAS_BACKEND=torch


In [30]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm, trange
from PIL import Image
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.sequence import pad_sequences

In [31]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from nltk.stem import SnowballStemmer

In [32]:
nltk.download('stopwords')
snow_stemmer = SnowballStemmer(language='english')

[nltk_data] Downloading package stopwords to /home/radu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Loading images...

In [33]:
folders = ["train_images"]
image_data = []
size_not = []
grayscale = []
for folder in folders:
    files = [f for f in os.listdir(folder) if f.endswith('.jpg')]
    for file_name in tqdm(files, desc = f"Loading images from folder {folder}"):
        file_path = os.path.join(folder, file_name)

        img = Image.open(file_path) #open the image
        
        if img.mode != "RGB":
            img = img.convert("RGB")
            
        img_arr = np.array(img) / 255.0
        image_data.append(img_arr)
            


Loading images from folder train_images: 100%|██████████| 9463/9463 [00:02<00:00, 4148.00it/s]


In [34]:
image_data = np.array(image_data)
print("Loaded images shape:", image_data.shape)

Loaded images shape: (9463, 100, 100, 3)


In [35]:
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D, Dropout

def build_model():
    ### 3.1. create the model
    model = Sequential()
    
    ### 3.2. add the layers
    
    # L1: add a CONV layer with 32 filters, kernel size 3, padding same, activation relu 
    model.add(Conv2D(32, kernel_size=3, padding='same', activation='relu', input_shape=(100, 100, 3)))
    
    # L2: add a CONV layer with 32 filters, kernel size 3, activation relu
    model.add(Conv2D(32, kernel_size=3, activation='relu'))
    
    # L3: add a Max Pooling layer, pool size 2x2
    model.add(MaxPooling2D(pool_size=2))
    
    # L4: add a Dropout layer, drop 1/4 of the neurons
    model.add(Dropout(0.25))
    
    # L5: add a CONV layer with 64 filters, kernel size 3, padding same, activation relu
    model.add(Conv2D(64, kernel_size=3, padding='same', activation='relu'))
    
    # L6: add a CONV layer with 64 filters, kernel size 3, activation relu
    model.add(Conv2D(64, kernel_size=3, activation='relu'))
    
    # L7: add a Max Pooling layer with pool size 2x2
    model.add(MaxPooling2D(pool_size=2))
    
    # L8: add a Dropout layer; drop 1/4 of the neurons
    model.add(Dropout(0.25))
    
    # L9: add a Flatten layer
    model.add(Flatten())
    
    # L10: add a Dense layer with 512 neurons and activation relu
    model.add(Dense(1024, activation='relu'))
    
    

    return model

model = build_model()
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [36]:
feature_extractor = Sequential(model.layers)  # Exclude last layer
features = feature_extractor.predict(image_data)
print("Extracted features shape:", features.shape)
np.save("extracted_features.npy", features)


[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step
Extracted features shape: (9463, 1024)


In [37]:
train = pd.read_csv('train.csv')
val = pd.read_csv('val.csv')
test = pd.read_csv('test.csv')

train_images_path = './train_images'
val_images_path = './val_images'
test_images_path = './test_images'

In [38]:
print(len(train), len(val), len(test))
print(train.head())

10000 3000 2000
                                     id  \
0  417812c5-0ce4-499d-b97d-4d28827239bc   
1  5ac91fa3-55f2-4cb3-8c8f-ad84f78e6b36   
2  d2705b90-8347-4cab-a7a6-654540d9a489   
3  a3b33fe7-3085-4433-9c18-8814803891b4   
4  1514b0e4-0665-45bc-ab32-52fce326cc29   

                                             caption  image_id  label  
0  Wet elephants shake water onto people bathing ...    394330      0  
1       Two men holding tennis racquets on the court    130849      0  
2  A bird on a tree limb with mountains in the ba...    514790      0  
3  A kitchen and dining room are featured along w...    182096      0  
4     A fruit stand has various fruits on the table.     68788      1  


In [39]:
print(train['caption'][0])

Wet elephants shake water onto people bathing them.


In [40]:
stop_words = set(stopwords.words('english'))

def preprocess_sentence(sentence):
    # Remove punctuation
    sentence = re.sub(r'[^\w\s]', '', sentence)
    # Tokenize
    tokens = word_tokenize(sentence)
    # Remove stop words
    tokens = [snow_stemmer.stem(word.lower()) for word in tokens if word.lower() not in stop_words]
    # Join back into a string
    return ' '.join(tokens)     

In [41]:
train['processed_caption'] = train['caption'].apply(preprocess_sentence)
val['processed_caption'] = val['caption'].apply(preprocess_sentence)

In [42]:
print(train['caption'][0])
print(train['processed_caption'][0])

Wet elephants shake water onto people bathing them.
wet eleph shake water onto peopl bath


In [43]:
#un vocabular care sa mapeze un id
mapped_ids = {}
k = 0
for i in range(len(train)):
    for word in train['processed_caption'][i].split():
        if word not in mapped_ids:
            mapped_ids[word] = k
            k += 1
            
        
j = 0
for i in range(len(val)):
    for word in val['processed_caption'][i].split():
        if word not in mapped_ids:
            mapped_ids[word] = k
            j += 1

In [44]:
print(mapped_ids)

{'wet': 0, 'eleph': 1, 'shake': 2, 'water': 3, 'onto': 4, 'peopl': 5, 'bath': 6, 'two': 7, 'men': 8, 'hold': 9, 'tenni': 10, 'racquet': 11, 'court': 12, 'bird': 13, 'tree': 14, 'limb': 15, 'mountain': 16, 'background': 17, 'kitchen': 18, 'dine': 19, 'room': 20, 'featur': 21, 'along': 22, 'larg': 23, 'tabl': 24, 'applianc': 25, 'chandeli': 26, 'fruit': 27, 'stand': 28, 'various': 29, 'basebal': 30, 'athlet': 31, 'get': 32, 'readi': 33, 'swing': 34, 'bat': 35, 'player': 36, 'game': 37, 'sever': 38, 'front': 39, 'tv': 40, 'monitor': 41, 'play': 42, 'wii': 43, 'babi': 44, 'lamb': 45, 'feed': 46, 'open': 47, 'field': 48, 'mother': 49, 'peer': 50, 'forward': 51, 'man': 52, 'surfboard': 53, 'wade': 54, 'ocean': 55, 'woman': 56, 'sit': 57, 'beach': 58, 'watch': 59, 'young': 60, 'girl': 61, 'helmet': 62, 'ride': 63, 'skateboard': 64, 'white': 65, 'build': 66, 'show': 67, 'glass': 68, 'window': 69, 'deep': 70, 'fryer': 71, 'microwav': 72, 'view': 73, 'long': 74, 'electr': 75, 'freight': 76, 'tra

In [45]:
#cum fac embedding in keras
#vezi si tu
#

In [46]:
def assign_id(sentence):
    return [mapped_ids[word] for word in sentence.split()]

In [47]:
train['caption_ids'] = train['processed_caption'].apply(assign_id)
val['caption_ids'] = val['processed_caption'].apply(assign_id)

In [48]:
print(train['caption_ids'][99])

[7, 316, 133, 36, 96, 317, 133]


In [49]:
max_length = max([len(caption.split()) for caption in train['processed_caption']])
print(max_length)
vacabulary_size = len(mapped_ids)
print(vacabulary_size)

31
2929


In [50]:
train_padded = pad_sequences(train['caption_ids'], maxlen=max_length, padding='post')
val_padded = pad_sequences(val['caption_ids'], maxlen=max_length, padding='post')

x_train = np.array(train_padded)
x_val = np.array(val_padded)

print(x_train.shape)

(10000, 31)


### Building the model ?

In [51]:
embedding_dim = 256
lstm_units = 256

model = Sequential()

#embedding layer
model.add(Embedding(vacabulary_size, embedding_dim, input_length=max_length))

#lstm layer
model.add(LSTM(lstm_units,activation = 'tanh' ,return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(lstm_units,activation = 'tanh' ,return_sequences=False))
model.add(Dropout(0.2))

model.add(Dense(units=1024, activation='relu'))

model.build(input_shape=(None, max_length))
model.summary()



In [52]:
# Forward pass through the model to get extracted features
features_lstm = model.predict(x_train)

print("Extracted Features Shape:", features_lstm.shape)  # Should be (10000, 512)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 33ms/step
Extracted Features Shape: (10000, 1024)


In [53]:
# Save extracted features to a .npy file
np.save('train_lstm_features.npy', features_lstm)

# Repeat for validation set
val_features = model.predict(x_val)
np.save('val_lstm_features.npy', val_features)

print("Validation Features Shape:", val_features.shape)

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 33ms/step
Validation Features Shape: (3000, 1024)


In [54]:
train_ids = train['id'].values
train_labels = train['label'].values

val_ids = val['id'].values
val_labels = val['label'].values



In [55]:
image_features = np.load('extracted_features.npy')
lstm_features = np.load('train_lstm_features.npy')

print("Image Features Shape:", image_features.shape)
print("LSTM Features Shape:", lstm_features.shape)

Image Features Shape: (9463, 1024)
LSTM Features Shape: (10000, 1024)


In [56]:
combined_features = np.concatenate((image_features, lstm_features), axis=1)
print("Combined Features Shape:", combined_features.shape)

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 9463 and the array at index 1 has size 10000

In [None]:
# Combined features and labels
x_train = combined_features
y_train = train_labels  # Labels are 0 or 1

print("Training Features Shape:", x_train.shape)
print("Training Labels Shape:", y_train.shape)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Model
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(x_train.shape[1],)))  # Input: concatenated features
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # Binary classification (0/1)

# Compile
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train
model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)
