In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from google.colab import files
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# 1. Upload kaggle.json
files.upload()

# 2. Setup Kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# 3. Download and Unzip Flickr8k
!kaggle datasets download -d adityajn105/flickr8k
!unzip -q flickr8k.zip -d flickr_data

# 4. Process Captions
df = pd.read_csv('flickr_data/captions.txt')

def clean_text(text):
    text = text.lower()
    text = "".join([char for char in text if char.isalpha() or char.isspace()])
    text = " ".join(text.split())
    return f"startseq {text} endseq"

df['caption'] = df['caption'].apply(clean_text)

# 5. Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['caption'].tolist())
vocab_size = len(tokenizer.word_index) + 1
max_length = max(len(c.split()) for c in df['caption'].tolist())

with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

print(f"Dataset Ready! Vocab Size: {vocab_size}, Max Length: {max_length}")

Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/adityajn105/flickr8k
License(s): CC0-1.0
Downloading flickr8k.zip to /content
 99% 1.03G/1.04G [00:08<00:00, 334MB/s]
100% 1.04G/1.04G [00:08<00:00, 132MB/s]
Dataset Ready! Vocab Size: 8781, Max Length: 37


In [2]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, Add, GlobalAveragePooling2D
from tensorflow.keras.applications import MobileNetV2

# ENCODER (Image Features)
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False
image_input = Input(shape=(224, 224, 3))
x = base_model(image_input)
x = GlobalAveragePooling2D()(x)
image_features = Dense(256, activation='relu')(x)

# DECODER (Sequence Processing)
caption_input = Input(shape=(max_length,))
y = Embedding(vocab_size, 256, mask_zero=False)(caption_input) # mask_zero=False for GPU compatibility
y = LSTM(256)(y)

# MERGE
decoder = Add()([image_features, y])
decoder = Dense(256, activation='relu')(decoder)
output = Dense(vocab_size, activation='softmax')(decoder)

model = Model(inputs=[image_input, caption_input], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


In [10]:
from tensorflow.keras.utils import Sequence, to_categorical
from tensorflow.keras.preprocessing.image import load_img, img_to_array

class FlickrGenerator(Sequence):
    def __init__(self, df, tokenizer, max_length, vocab_size, batch_size=32):
        super().__init__() # This fixes the first UserWarning you saw
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.img_dir = 'flickr_data/Images'

    def __len__(self):
        return len(self.df) // self.batch_size

    def __getitem__(self, index):
        batch = self.df.iloc[index * self.batch_size : (index + 1) * self.batch_size]
        X_img, X_txt, y_label = [], [], []

        for _, row in batch.iterrows():
            img_path = os.path.join(self.img_dir, row['image'])
            img = load_img(img_path, target_size=(224, 224))
            img = img_to_array(img) / 255.0

            seq = self.tokenizer.texts_to_sequences([row['caption']])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=self.max_length, padding='post')[0]
                out_seq = to_categorical([out_seq], num_classes=self.vocab_size)[0]
                X_img.append(img)
                X_txt.append(in_seq)
                y_label.append(out_seq)

        # IMPORTANT: Return inputs as a tuple (input1, input2) for Keras 3
        return (np.array(X_img), np.array(X_txt)), np.array(y_label)

# 1. Use the WHOLE dataset now
train_df = df

# 2. Re-initialize the generator with the full data
generator = FlickrGenerator(train_df, tokenizer, max_length, vocab_size, batch_size=32)

# 3. Train for more epochs (20 is a good target)
model.fit(generator, epochs=20, verbose=1)

# 4. Save the improved model
model.save('flickr8k_model.keras')

Epoch 1/20
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1032s[0m 815ms/step - loss: 5.2567
Epoch 2/20
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m777s[0m 615ms/step - loss: 4.9106
Epoch 3/20
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m777s[0m 615ms/step - loss: 4.7760
Epoch 4/20
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m777s[0m 615ms/step - loss: 4.6636
Epoch 5/20
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m777s[0m 615ms/step - loss: 4.5644
Epoch 6/20
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m778s[0m 615ms/step - loss: 4.4756
Epoch 7/20
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m778s[0m 615ms/step - loss: 4.3821
Epoch 8/20
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m779s[0m 616ms/step - loss: 4.3142
Epoch 9/20
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m778s[0m 616ms/step - loss: 4.2493
Epoch 10/20
[1m1264/1264[0m [32m━

In [11]:
!pip install flask-ngrok flask-cors pyngrok
from flask import Flask, request, jsonify
from flask_cors import CORS
from pyngrok import ngrok
import io
from PIL import Image

# 1. Get your Token from https://dashboard.ngrok.com/get-started/your-authtoken
NGROK_TOKEN = "the ngrok-authtoken"
!ngrok config add-authtoken {NGROK_TOKEN}

app = Flask(__name__)
CORS(app)

def generate_caption(img_array):
    in_text = 'startseq'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length, padding='post')
        yhat = model.predict([img_array, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat)
        if word is None or word == 'endseq':
            break
        in_text += ' ' + word
    return in_text.replace('startseq', '').strip()

@app.route('/predict', methods=['POST'])
def predict():
    file = request.files['image']
    img = Image.open(io.BytesIO(file.read())).convert('RGB').resize((224, 224))
    img_array = np.expand_dims(np.array(img) / 255.0, axis=0)

    caption = generate_caption(img_array)

    # ACTION RECOGNITION LOGIC:
    # Since we use static images, we extract the "Action Verb" from the caption.
    action_verbs = ["running", "jumping", "riding", "playing", "sitting", "swimming"]
    detected_action = "General Pose"
    for verb in action_verbs:
        if verb in caption.lower():
            detected_action = verb.capitalize()
            break

    return jsonify({
        "status": "success",
        "action": detected_action,
        "annotation": caption.capitalize()
    })

# Start the tunnel
public_url = ngrok.connect(5000)
print(f"\nCOPY THIS URL TO YOUR VS CODE: {public_url.public_url}")
app.run()

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml

COPY THIS URL TO YOUR VS CODE: https://truculently-unnarrow-rigoberto.ngrok-free.dev
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [28/Dec/2025 18:07:48] "POST /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [28/Dec/2025 18:07:57] "POST /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [28/Dec/2025 18:08:29] "POST /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [28/Dec/2025 18:08:58] "POST /predict HTTP/1.1" 200 -
