#Multi-Modal Transformer with Flask or Streamlit Application

#Load Project Dataset

In [6]:
import torch
import torchvision.transforms as transforms
from transformers import AutoTokenizer
from PIL import Image
import pandas as pd
import os

# Load dataset
df = pd.read_csv("/content/multi_modal_dataset.csv")

# Define actual column names
TEXT_COLUMN = "Description"
IMAGE_COLUMN = "Image_Path"
NUMERICAL_COLUMNS = ["Price", "Rating", "Stock"]
LABEL_COLUMN = "Category"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_text(text):
    return tokenizer(text, return_tensors="pt", padding=True, truncation=True)

# Image preprocessing
def preprocess_image(image_path):
    if not os.path.exists(image_path):
        print(f"Warning: Image not found at {image_path}, using placeholder.")
        return torch.zeros((1, 3, 224, 224))  # Return a blank tensor if image is missing

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])
    image = Image.open(image_path).convert("RGB")
    return transform(image).unsqueeze(0)

# Example usage
text_tokens = tokenize_text(df[TEXT_COLUMN].iloc[0])
image_tensor = preprocess_image(df[IMAGE_COLUMN].iloc[0])

print("Image Shape:", image_tensor.shape, "Text Tokens:", text_tokens)


Image Shape: torch.Size([1, 3, 224, 224]) Text Tokens: {'input_ids': tensor([[  101,  2190, 23836,  2075,  2449,  5656,  2338,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}


Select and Fine-Tune Relevant Models

#Save and Verify the Model

In [8]:
import torch
import os
# ... (Other imports and model definition)

# Define model save path
MODEL_PATH = "multi_modal_model.pth"

# Create an instance of the model (if not already created)
model = MultiModalModel(
    text_model_name="bert-base-uncased",
    image_model_name="google/vit-base-patch16-224-in21k",  # Or "resnet18"
    numerical_input_size=len(NUMERICAL_COLUMNS),
    output_classes=len(df[LABEL_COLUMN].unique())
)

# 1. Saving the Model
# --------------------
# torch.save() is used to save the model's state_dict (learned parameters).
# The state_dict is a dictionary containing all the model's trainable parameters.
torch.save(model.state_dict(), MODEL_PATH)
print(f"Model saved successfully at {MODEL_PATH}")

# 2. Verifying Model Existence
# -----------------------------
# os.path.exists() checks if the saved model file exists at the specified path.
if os.path.exists(MODEL_PATH):
    print("Model file found. Ready for deployment!")
else:
    print("Error: Model file is missing! Train and save it again.")

# 3. Loading and Verifying Model Integrity
# ----------------------------------------
# Create a new instance of the same model architecture.
loaded_model = MultiModalModel(
    text_model_name="bert-base-uncased",
    image_model_name="google/vit-base-patch16-224-in21k",  # Or "resnet18"
    numerical_input_size=len(NUMERICAL_COLUMNS),
    output_classes=len(df[LABEL_COLUMN].unique())
)

# Load the saved state_dict into the new model instance.
# map_location=torch.device("cpu") ensures it loads on CPU if needed.
loaded_model.load_state_dict(torch.load(MODEL_PATH, map_location=torch.device("cpu")))
loaded_model.eval()  # Set the model to evaluation mode

print("Model loaded and verified successfully!")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Model saved successfully at multi_modal_model.pth
Model file found. Ready for deployment!


  loaded_model.load_state_dict(torch.load(MODEL_PATH, map_location=torch.device("cpu")))


Model loaded and verified successfully!


#Choose Flask API or Streamlit UI

In [9]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.42.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.42.0-py2.py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m78.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m84.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[

Running Streamlit

In [10]:
import streamlit as st
import torch
from PIL import Image
import torchvision.transforms as transforms
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import torch.nn as nn
import torchvision.models as models


# Define the MultiModalModel class here
class MultiModalModel(nn.Module):
    def __init__(self, text_model_name, image_model_name, numerical_input_size, output_classes):
        super().__init__()

        # Text Model (BERT)
        self.text_encoder = AutoModel.from_pretrained(text_model_name)

        # Image Model (Vision Transformer - ViT or ResNet)
        if image_model_name == "resnet18":
            self.image_encoder = models.resnet18(pretrained=True)
            # Modify the final fully connected layer for your desired output classes
            num_ftrs = self.image_encoder.fc.in_features
            self.image_encoder.fc = nn.Linear(num_ftrs, 768)  # Adjust output to 768
        else:
            self.image_encoder = AutoModel.from_pretrained(image_model_name)

        # Numerical Feature Processing (MLP)
        self.fc_numeric = nn.Sequential(
            nn.Linear(numerical_input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )

        # Final Classifier
        self.fc_combined = nn.Linear(768 + 768 + 64, output_classes)  # Adjusted dimensions

    def forward(self, text_tokens, image_tensor, numerical_data):
        text_features = self.text_encoder(**text_tokens).last_hidden_state[:, 0, :]

        # Process image features based on model type
        if isinstance(self.image_encoder, models.ResNet):
            image_features = self.image_encoder(image_tensor)
        else:
            image_features = self.image_encoder(**{"pixel_values": image_tensor}).last_hidden_state[:, 0, :]

        numeric_features = self.fc_numeric(numerical_data)

        combined = torch.cat((text_features, image_features, numeric_features), dim=1)
        return self.fc_combined(combined)



# Define necessary columns for numerical and label columns
NUMERICAL_COLUMNS = ["heart_rate", "temperature"]  # Example columns, adjust as needed
LABEL_COLUMN = "diagnosis"  # Adjust the label column name based on your dataset

# Load trained model
model = MultiModalModel(
    text_model_name="bert-base-uncased",
    image_model_name="resnet18",
    numerical_input_size=len(NUMERICAL_COLUMNS),
    output_classes=2 # Assuming binary classification for this example
)

# ... (rest of your Streamlit code)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 53.0MB/s]


In [11]:
!pip install streamlit
!pip install pyngrok


Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [12]:
!ngrok authtoken 2sgt1hYsOX5gXo4vAAlaI3Il9Gf_4F2y4zN75dK1aGoaA11yZ


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [14]:
!pip install sacremoses transformers torch torchvision


Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collec

In [16]:
st.title("🛍️ E-Commerce Product Classification")

# User Inputs
review_text = st.text_area("📄 Enter Product Description")
price = st.number_input("💲 Product Price ($)", min_value=0.0, value=50.0)
rating = st.slider("⭐ Product Rating", min_value=1, max_value=5, value=3)
stock = st.number_input("📦 Stock Availability", min_value=0, value=10)
image_file = st.file_uploader("🖼️ Upload Product Image", type=["jpg", "png"])

if st.button("🔍 Predict"):
    if review_text and image_file:
        # Save and Process Image
        image_path = "uploaded_product.jpg"
        with open(image_path, "wb") as f:
            f.write(image_file.read())

        text_tokens = tokenizer(review_text, return_tensors="pt", padding=True, truncation=True)
        image_tensor = preprocess_image(image_path)
        numerical_data = torch.tensor([[price, rating, stock]], dtype=torch.float32)

        with torch.no_grad():
            prediction = model(text_tokens, image_tensor, numerical_data)

        predicted_category = df[LABEL_COLUMN].unique()[torch.argmax(prediction, dim=1).item()]
        st.success(f"🎯 Predicted Category: **{predicted_category}**")
    else:
        st.warning("⚠️ Please enter a product description and upload an image.")

2025-02-07 02:51:55.999 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-02-07 02:51:56.016 Session state does not function when running a script without `streamlit run`


In [19]:
!npm install -g localtunnel


[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K
added 22 packages in 4s
[1G[0K⠼[1G[0K
[1G[0K⠼[1G[0K3 packages are looking for funding
[1G[0K⠼[1G[0K  run `npm fund` for details
[1G[0K⠼[1G[0K

In [23]:
!pkill streamlit
!streamlit run streamlit_app.py &
!npx localtunnel --port 8501


Usage: streamlit run [OPTIONS] TARGET [ARGS]...
Try 'streamlit run --help' for help.

Error: Invalid value: File does not exist: streamlit_app.py
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0Kyour url is: https://bumpy-owls-change.loca.lt
/tools/node/lib/node_modules/localtunnel/bin/lt.js:81
    throw err;
    ^

Error: connection refused: localtunnel.me:34051 (check your firewall settings)
    at Socket.<anonymous> (/tools/node/lib/node_modules/[4mlocaltunnel[24m/lib/TunnelCluster.js:52:11)
[90m    at Socket.emit (node:events:517:28)[39m
[90m    at emitErrorNT (node:internal/streams/destroy:151:8)[39m
[90m    at emitErrorCloseNT (node:internal/streams/destroy:116:3)[39m
[90m    at process.processTicksAndRejections (node:internal/process/task_queues:82:21)[39m

Node.js v18.20.5
[1G[0K⠙[1G[0K

In [21]:
import os
import threading

def run_streamlit():
    os.system("streamlit run streamlit_app.py &")

def expose_public_url():
    os.system("npx localtunnel --port 8501")

# Run Streamlit and LocalTunnel in Background
threading.Thread(target=run_streamlit).start()
threading.Thread(target=expose_public_url).start()


In [17]:
!streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.157.165:8501[0m
[0m
[34m  Stopping...[0m
[34m  Stopping...[0m


#Using Flask API

In [25]:
!pip install flask torch torchvision transformers pillow




In [26]:
!ngrok authtoken 2sgt1hYsOX5gXo4vAAlaI3Il9Gf_4F2y4zN75dK1aGoaA11yZ


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [28]:
import torch
from flask import Flask, request, jsonify
from PIL import Image
import torchvision.transforms as transforms
from transformers import AutoTokenizer, AutoModel
import os
from pyngrok import ngrok

app = Flask(__name__)

# Expose Flask API with ngrok
public_url = ngrok.connect(5000)
print("🚀 Flask App Public URL:", public_url)

# Define Model Parameters
TEXT_COLUMN = "Description"
NUMERICAL_COLUMNS = ["Price", "Rating", "Stock"]
LABELS = ["Electronics", "Clothing", "Home Appliances"]  # Update based on actual categories

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Image Preprocessing
def preprocess_image(image_path):
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])

    if not os.path.exists(image_path):
        return torch.zeros((1, 3, 224, 224))  # Placeholder tensor for missing images

    image = Image.open(image_path).convert("RGB")
    return transform(image).unsqueeze(0)

# Model Definition
class MultiModalModel(torch.nn.Module):
    def __init__(self, text_model_name, image_model_name, numerical_input_size, output_classes):
        super(MultiModalModel, self).__init__()
        self.text_model = AutoModel.from_pretrained(text_model_name)
        self.image_model = torch.hub.load('pytorch/vision', 'resnet18', weights="ResNet18_Weights.DEFAULT")
        self.image_model.fc = torch.nn.Linear(self.image_model.fc.in_features, 256)
        self.numerical_fc = torch.nn.Linear(numerical_input_size, 256)
        self.final_fc = torch.nn.Linear(256 * 3, output_classes)

    def forward(self, text_tokens, image_tensor, numerical_data):
        text_features = self.text_model(**text_tokens).last_hidden_state[:, 0, :]
        image_features = self.image_model(image_tensor)
        numerical_features = self.numerical_fc(numerical_data)
        combined = torch.cat((text_features, image_features, numerical_features), dim=1)
        return self.final_fc(combined)

# Load trained model
model = MultiModalModel(
    text_model_name="bert-base-uncased",
    image_model_name="resnet18",
    numerical_input_size=len(NUMERICAL_COLUMNS),
    output_classes=len(LABELS)
)

# Load model state dictionary
model.load_state_dict(torch.load("multi_modal_model.pth", map_location="cpu"), strict=False)
model.eval()

# API Route for Prediction
@app.route("/predict", methods=["POST"])
def predict():
    try:
        data = request.form
        text = data.get("description", "")
        price = float(data.get("price", 0))
        rating = float(data.get("rating", 0))
        stock = float(data.get("stock", 0))

        image_file = request.files.get("image")
        if image_file:
            image_path = "uploaded_product.jpg"
            image_file.save(image_path)
            image_tensor = preprocess_image(image_path)
        else:
            return jsonify({"error": "No image uploaded"}), 400

        # Tokenize text
        text_tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        numerical_data = torch.tensor([[price, rating, stock]], dtype=torch.float32)

        # Make prediction
        with torch.no_grad():
            prediction = model(text_tokens, image_tensor, numerical_data)

        predicted_category = LABELS[torch.argmax(prediction, dim=1).item()]

        return jsonify({"predicted_category": predicted_category})

    except Exception as e:
        return jsonify({"error": str(e)}), 500

if __name__ == "__main__":
    app.run(port=5000)


🚀 Flask App Public URL: NgrokTunnel: "https://da12-34-125-157-165.ngrok-free.app" -> "http://localhost:5000"


Using cache found in /root/.cache/torch/hub/pytorch_vision_main
  model.load_state_dict(torch.load("multi_modal_model.pth", map_location="cpu"), strict=False)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [07/Feb/2025 03:19:34] "[33mGET / HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [07/Feb/2025 03:19:34] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


In [None]:
python app.py
