In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from google.colab import files, drive
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# 1. Mount Drive and setup Path
drive.mount('/content/drive')
SAVE_PATH = '/content/drive/MyDrive/Flickr_Project/'
if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

# 2. Dataset Download
if not os.path.exists('kaggle.json'):
    files.upload() # Upload your kaggle.json here
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d adityajn105/flickr8k
!unzip -q flickr8k.zip -d flickr_data

# 3. Process Captions
df = pd.read_csv('flickr_data/captions.txt')
def clean_text(text):
    text = text.lower()
    text = "".join([char for char in text if char.isalpha() or char.isspace()])
    text = " ".join(text.split())
    return f"startseq {text} endseq"

df['caption'] = df['caption'].apply(clean_text)

# 4. Create Tokenizer (The Dictionary)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['caption'].tolist())
vocab_size = len(tokenizer.word_index) + 1
max_length = max(len(c.split()) for c in df['caption'].tolist())

# 5. Save Tokenizer immediately
with open(SAVE_PATH + 'tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

print(f"‚úÖ Dataset Ready! Vocab Size: {vocab_size}, Max Length: {max_length}")

Mounted at /content/drive


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/adityajn105/flickr8k
License(s): CC0-1.0
Downloading flickr8k.zip to /content
 99% 1.03G/1.04G [00:12<00:00, 256MB/s]
100% 1.04G/1.04G [00:12<00:00, 86.6MB/s]
‚úÖ Dataset Ready! Vocab Size: 8781, Max Length: 37


In [2]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Add, GlobalAveragePooling2D
from tensorflow.keras.applications import MobileNetV2

# ENCODER (Image Features)
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False
image_input = Input(shape=(224, 224, 3))
x = base_model(image_input)
x = GlobalAveragePooling2D()(x)
image_features = Dense(256, activation='relu')(x)

# DECODER (Sequence Processing)
caption_input = Input(shape=(max_length,))
y = Embedding(vocab_size, 256, mask_zero=False)(caption_input)
y = LSTM(256)(y)

# MERGE
decoder = Add()([image_features, y])
decoder = Dense(256, activation='relu')(decoder)
output = Dense(vocab_size, activation='softmax')(decoder)

model = Model(inputs=[image_input, caption_input], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam')

print("‚úÖ Model Architecture Built & Compiled.")

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 0us/step
‚úÖ Model Architecture Built & Compiled.


In [3]:
from tensorflow.keras.utils import Sequence, to_categorical
from tensorflow.keras.preprocessing.image import load_img, img_to_array

class FlickrGenerator(Sequence):
    def __init__(self, df, tokenizer, max_length, vocab_size, batch_size=32):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.img_dir = 'flickr_data/Images'

    def __len__(self):
        return len(self.df) // self.batch_size

    def __getitem__(self, index):
        batch = self.df.iloc[index * self.batch_size : (index + 1) * self.batch_size]
        X_img, X_txt, y_label = [], [], []
        for _, row in batch.iterrows():
            img_path = os.path.join(self.img_dir, row['image'])
            img = load_img(img_path, target_size=(224, 224))
            img = img_to_array(img) / 255.0
            seq = self.tokenizer.texts_to_sequences([row['caption']])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=self.max_length, padding='post')[0]
                out_seq = to_categorical([out_seq], num_classes=self.vocab_size)[0]
                X_img.append(img)
                X_txt.append(in_seq)
                y_label.append(out_seq)
        return (np.array(X_img), np.array(X_txt)), np.array(y_label)

# 1. Initialize generator
generator = FlickrGenerator(df, tokenizer, max_length, vocab_size, batch_size=32)

# 2. Train (20 Epochs for good results)
print("‚è≥ Training starting...")
model.fit(generator, epochs=20, verbose=1)

# 3. Save to Drive Folder
model.save(SAVE_PATH + 'flickr8k_model.keras')
print(f"‚úÖ Model and Tokenizer are now safely in: {SAVE_PATH}")

‚è≥ Training starting...


  self._warn_if_super_not_called()


Epoch 1/20
[1m1264/1264[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1084s[0m 849ms/step - loss: 5.4522
Epoch 2/20
[1m1264/1264[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m775s[0m 613ms/step - loss: 4.9533
Epoch 3/20
[1m1264/1264[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m774s[0m 612ms/step - loss: 4.7997
Epoch 4/20
[1m1264/1264[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m773s[0m 611ms/step - loss: 4.7033
Epoch 5/20
[1m1264/1264[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m774s[0m 612ms/step - loss: 4.5975
Epoch 6/20
[1m1264/1264[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m773s[0m 611ms/step - loss: 4.4926
Epoch 7/20
[1m1264/1264[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m773s

In [None]:
!pip install flask-ngrok flask-cors pyngrok
import io
import os
import pickle
import numpy as np
import tensorflow as tf
from PIL import Image  # <--- THIS WAS THE MISSING PIECE
from flask import Flask, request, jsonify
from flask_cors import CORS
from pyngrok import ngrok
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Use the SAVE_PATH defined in Cell 1
model_path = SAVE_PATH + 'flickr8k_model.keras'
tokenizer_path = SAVE_PATH + 'tokenizer.pkl'

print("‚è≥ Loading Model & Tokenizer from Drive...")
model = tf.keras.models.load_model(model_path)
with open(tokenizer_path, 'rb') as f:
    tokenizer = pickle.load(f)

app = Flask(__name__)
CORS(app)

def generate_caption(img_array):
    in_text = 'startseq'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length, padding='post')

        yhat = model.predict([img_array, sequence], verbose=0)
        yhat = np.argmax(yhat)

        word = tokenizer.index_word.get(yhat)

        if word is None or word == 'endseq':
            break

        # Optimization: If the model starts repeating a word, stop it
        if word in in_text.split()[-2:]:
            break

        in_text += ' ' + word

    return in_text.replace('startseq', '').strip()

@app.route('/predict', methods=['POST'])
def predict():
    try:
        file = request.files['image']
        img = Image.open(io.BytesIO(file.read())).convert('RGB').resize((224, 224))

        # Preprocessing
        img_array = img_to_array(img) / 255.0
        img_array = np.expand_dims(img_array, axis=0)

        # 1. Generate the raw caption from the model
        caption = generate_caption(img_array)

        # --- SMART PRIORITY LOGIC ---
        caption_lower = caption.lower()
        detected_action = "General Pose"

        # 1. CHECK ACTIONS FIRST (High Priority)
        if any(w in caption_lower for w in ["run", "running", "sprinting", "jogging"]):
            detected_action = "Running"
        elif any(w in caption_lower for w in ["jump", "jumping", "leap", "leaping", "fly", "midair"]):
            detected_action = "Jumping"
        elif any(w in caption_lower for w in ["swim", "swimming", "diving"]):
            detected_action = "Swimming"
        elif any(w in caption_lower for w in ["ride", "riding", "bike", "bicycle", "motorcycle"]):
            detected_action = "Cycling/Riding"

        # 2. CHECK ENVIRONMENT SECOND (Low Priority - only if no action was found)
        elif any(w in caption_lower for w in ["water", "pool", "ocean", "river", "lake", "beach"]):
            detected_action = "Swimming"
        elif any(w in caption_lower for w in ["grass", "field", "park", "track"]):
            detected_action = "Running"
        elif any(w in caption_lower for w in ["mountain", "rock", "climb"]):
            detected_action = "Climbing"

        else:
            detected_action = "General Pose"

        # 3. Return the result to your website
        return jsonify({
            "status": "success",
            "action": detected_action,
            "annotation": caption.capitalize()
        })

    except Exception as e:
        return jsonify({"status": "error", "message": str(e)})

NGROK_TOKEN = "37hiJsed6u7wmAhFg5FzXLSGilx_jhuZvhqYsiZac92SpGH8"
!ngrok config add-authtoken {NGROK_TOKEN}
public_url = ngrok.connect(5000).public_url
print(f"üöÄ PUBLIC URL: {public_url}/predict")

if __name__ == '__main__':
    app.run()

‚è≥ Loading Model & Tokenizer from Drive...
Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
üöÄ PUBLIC URL: https://truculently-unnarrow-rigoberto.ngrok-free.dev/predict
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [03/Jan/2026 11:31:30] "POST /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [03/Jan/2026 11:31:39] "POST /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [03/Jan/2026 11:31:52] "POST /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [03/Jan/2026 11:34:08] "POST /predict HTTP/1.1" 200 -
