In [1]:
!pip install tensorflow keras opencv-python pillow numpy matplotlib scikit-learn


Collecting tensorflow
  Using cached tensorflow-2.20.0-cp311-cp311-win_amd64.whl (331.8 MB)
Collecting keras
  Using cached keras-3.13.0-py3-none-any.whl (1.5 MB)
Installing collected packages: keras, tensorflow
Successfully installed keras-3.13.0 tensorflow-2.20.0



[notice] A new release of pip available: 22.3 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


  if not hasattr(np, "object"):


In [2]:
DATASET_PATH = "Stanford40"   # change if needed

IMAGES_PATH = os.path.join(DATASET_PATH, "JPEGImages")
SPLITS_PATH = os.path.join(DATASET_PATH, "ImageSplits")


In [7]:
def load_split(file_name):
    with open(os.path.join(SPLITS_PATH, file_name)) as f:
        return [line.strip() for line in f.readlines()]

train_files = load_split("train.txt")
test_files = load_split("test.txt")

print("Training images:", len(train_files))
print("Testing images:", len(test_files))


Training images: 4000
Testing images: 5532


In [8]:
def get_label(filename):
    return filename.split("_")[0]

labels = [get_label(name) for name in train_files]

label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

num_classes = len(label_encoder.classes_)
print("Total Classes:", num_classes)


Total Classes: 34


In [9]:
cnn_model = ResNet50(
    weights="imagenet",
    include_top=False,
    pooling="avg"
)


In [10]:
def extract_features(image_list):
    features = []
    
    for img_name in image_list:
        img_path = os.path.join(IMAGES_PATH, img_name)
        img = cv2.imread(img_path)
        img = cv2.resize(img, (224, 224))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = preprocess_input(img)
        img = np.expand_dims(img, axis=0)

        feature = cnn_model.predict(img, verbose=0)
        features.append(feature[0])

    return np.array(features)


In [11]:
X = extract_features(train_files)
y = to_categorical(encoded_labels, num_classes)

print("Feature Shape:", X.shape)
print("Label Shape:", y.shape)


Feature Shape: (4000, 2048)
Label Shape: (4000, 34)


In [12]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# LSTM expects 3D input
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_val = X_val.reshape((X_val.shape[0], 1, X_val.shape[1]))


In [13]:
lstm_model = Sequential([
    LSTM(256, input_shape=(1, X_train.shape[2]), return_sequences=False),
    Dropout(0.5),
    Dense(128, activation="relu"),
    Dense(num_classes, activation="softmax")
])

lstm_model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

lstm_model.summary()


  super().__init__(**kwargs)


In [15]:
history = lstm_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32
)


Epoch 1/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8500 - loss: 0.4935 - val_accuracy: 0.7200 - val_loss: 0.9865
Epoch 2/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8647 - loss: 0.4541 - val_accuracy: 0.7075 - val_loss: 1.0200
Epoch 3/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8687 - loss: 0.4046 - val_accuracy: 0.7225 - val_loss: 0.9574
Epoch 4/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8816 - loss: 0.3770 - val_accuracy: 0.7163 - val_loss: 0.9844
Epoch 5/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8803 - loss: 0.3719 - val_accuracy: 0.7000 - val_loss: 1.0581
Epoch 6/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8997 - loss: 0.3158 - val_accuracy: 0.7125 - val_loss: 0.9929
Epoch 7/100
[1m

In [16]:
cnn_model.save("cnn_feature_extractor.h5")
lstm_model.save("lstm_action_model.h5")

print("Models saved successfully")




Models saved successfully


In [17]:
def predict_action(image_name):
    img_path = os.path.join(IMAGES_PATH, image_name)
    img = cv2.imread(img_path)
    img = cv2.resize(img, (224, 224))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = preprocess_input(img)
    img = np.expand_dims(img, axis=0)

    feature = cnn_model.predict(img, verbose=0)
    feature = feature.reshape((1, 1, feature.shape[1]))

    prediction = lstm_model.predict(feature)
    class_id = np.argmax(prediction)
    return label_encoder.inverse_transform([class_id])[0]


In [18]:
print(predict_action(train_files[0]))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step
applauding


In [19]:
action_to_caption = {
    action: f"A person is {action.replace('_', ' ')}"
    for action in label_encoder.classes_
}

action_to_caption


{np.str_('applauding'): 'A person is applauding',
 np.str_('blowing'): 'A person is blowing',
 np.str_('brushing'): 'A person is brushing',
 np.str_('cleaning'): 'A person is cleaning',
 np.str_('climbing'): 'A person is climbing',
 np.str_('cooking'): 'A person is cooking',
 np.str_('cutting'): 'A person is cutting',
 np.str_('drinking'): 'A person is drinking',
 np.str_('feeding'): 'A person is feeding',
 np.str_('fishing'): 'A person is fishing',
 np.str_('fixing'): 'A person is fixing',
 np.str_('gardening'): 'A person is gardening',
 np.str_('holding'): 'A person is holding',
 np.str_('jumping'): 'A person is jumping',
 np.str_('looking'): 'A person is looking',
 np.str_('phoning'): 'A person is phoning',
 np.str_('playing'): 'A person is playing',
 np.str_('pouring'): 'A person is pouring',
 np.str_('pushing'): 'A person is pushing',
 np.str_('reading'): 'A person is reading',
 np.str_('riding'): 'A person is riding',
 np.str_('rowing'): 'A person is rowing',
 np.str_('running'):