In [2]:
import torch
import torchvision.transforms as transforms
from transformers import AutoTokenizer
from PIL import Image
import pandas as pd
import os

# Load project dataset
df = pd.read_csv("/content/multimodal_dataset.csv")

# Define relevant columns based on the dataset
TEXT_COLUMN = "text_column"
IMAGE_COLUMN = "image_column"
NUMERICAL_COLUMNS = ["numerical_feature1", "numerical_feature2"]
LABEL_COLUMN = "label_column"

# Tokenize text data
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_text(text):
    return tokenizer(text, return_tensors="pt", padding=True, truncation=True)

# Preprocess images
def preprocess_image(image_filename):
    image_path = os.path.join("images", image_filename)  # Use the images folder
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])
    image = Image.open(image_path).convert("RGB")
    return transform(image).unsqueeze(0)

# Example usage
text_tokens = tokenize_text(df[TEXT_COLUMN].iloc[0])
image_tensor = preprocess_image(df[IMAGE_COLUMN].iloc[0])

# Print outputs with each on a new line
print("Image Shape:")
print(image_tensor.shape)
print("\nText Tokens:")
print(text_tokens)


Image Shape:
torch.Size([1, 3, 224, 224])

Text Tokens:
{'input_ids': tensor([[  101,  5776,  4311, 10256,  9016,  1998, 19340,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [3]:
from transformers import AutoModel
import torch
import torch.nn as nn

class MultiModalModel(nn.Module):
    def __init__(self, text_model_name, image_model_name, numerical_input_size, output_classes):
        super().__init__()

        # Text Encoder (BERT-based model)
        self.text_encoder = AutoModel.from_pretrained(text_model_name)

        # Image Encoder (ResNet18 model)
        self.image_encoder = torch.hub.load("pytorch/vision", image_model_name, pretrained=True)
        self.image_encoder.fc = nn.Identity()  # Removing final layer for feature extraction

        # Numerical feature processing
        self.fc_numeric = nn.Linear(numerical_input_size, 128)

        # Combined output classifier
        self.fc_combined = nn.Linear(512 + 768 + 128, output_classes)

    def forward(self, text_tokens, image_tensor, numerical_data):
        # Extract features from the text (first token in the sequence)
        text_features = self.text_encoder(**text_tokens).last_hidden_state[:, 0, :]

        # Extract features from the image (ResNet18)
        image_features = self.image_encoder(image_tensor)

        # Process numerical features
        numeric_features = self.fc_numeric(numerical_data)

        # Combine all features
        combined = torch.cat((text_features, image_features, numeric_features), dim=1)

        # Final classification output
        return self.fc_combined(combined)

# Create the model instance with project-relevant choices
model = MultiModalModel(
    text_model_name="bert-base-uncased",  # Text model: BERT
    image_model_name="resnet18",  # Image model: ResNet18
    numerical_input_size=len(NUMERICAL_COLUMNS),  # Number of numerical features
    output_classes=len(df[LABEL_COLUMN].unique())  # Number of unique output classes
)

print("Model Ready:", model)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading: "https://github.com/pytorch/vision/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 97.3MB/s]


Model Ready: MultiModalModel(
  (text_encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,),

In [4]:
import os
import torch

# Assuming `model` is your trained multimodal model

# Save the trained model
torch.save(model.state_dict(), "multi_modal_model.pth")
print("Model saved successfully")

# Verify model file
if os.path.exists("multi_modal_model.pth"):
    print("Model file found")
else:
    print("Model file is missing. Please train and save it again.")


Model saved successfully
Model file found


In [5]:
!pip install streamlit


Collecting streamlit
  Downloading streamlit-1.42.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.42.0-py2.py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m96.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m77.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[

In [6]:
!pip install streamlit localtunnel transformers torch torchvision pandas


[31mERROR: Could not find a version that satisfies the requirement localtunnel (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for localtunnel[0m[31m
[0m

In [7]:
%%writefile streamlit_app.py
import streamlit as st
import torch
import pandas as pd
from PIL import Image
import torchvision.transforms as transforms
from transformers import AutoTokenizer, AutoModel
import os

# Load Dataset
DATASET_PATH = "multimodal_dataset.csv"

if os.path.exists(DATASET_PATH):
    df = pd.read_csv(DATASET_PATH)
else:
    st.error(f"Dataset '{DATASET_PATH}' not found. Please upload it.")
    st.stop()

# Define relevant columns
TEXT_COLUMN = "text_column"
IMAGE_COLUMN = "image_column"
NUMERICAL_COLUMNS = ["numerical_feature1", "numerical_feature2"]
LABEL_COLUMN = "label_column"

# Define Multi-Modal Model
class MultiModalModel(torch.nn.Module):
    def __init__(self, text_model_name, image_model_name, numerical_input_size, output_classes):
        super().__init__()
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        self.image_encoder = torch.hub.load("pytorch/vision", image_model_name, pretrained=True)
        self.image_encoder.fc = torch.nn.Identity()
        self.fc_numeric = torch.nn.Linear(numerical_input_size, 128)
        self.fc_combined = torch.nn.Linear(512 + 768 + 128, output_classes)

    def forward(self, text_tokens, image_tensor, numerical_data):
        text_features = self.text_encoder(**text_tokens).last_hidden_state[:, 0, :]
        image_features = self.image_encoder(image_tensor)
        numeric_features = self.fc_numeric(numerical_data)
        combined = torch.cat((text_features, image_features, numeric_features), dim=1)
        return self.fc_combined(combined)

# Load Model
MODEL_PATH = "multi_modal_model.pth"

if os.path.exists(MODEL_PATH):
    model = MultiModalModel(
        text_model_name="bert-base-uncased",
        image_model_name="resnet18",
        numerical_input_size=len(NUMERICAL_COLUMNS),
        output_classes=len(df[LABEL_COLUMN].unique())
    )
    model.load_state_dict(torch.load(MODEL_PATH, map_location=torch.device("cpu")))
    model.eval()
else:
    st.error(f"Model file '{MODEL_PATH}' not found. Please train and save the model first.")
    st.stop()

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Streamlit UI
st.title("Multi-Modal Prediction")

# Input fields
symptoms = st.text_area("Enter Symptoms")
heart_rate = st.number_input("Heart Rate", min_value=50, max_value=200, value=80)
temperature = st.number_input("Body Temperature", min_value=30.0, max_value=45.0, value=37.0)
image_file = st.file_uploader("Upload Image", type=["jpg", "png"])

if st.button("Predict"):
    if symptoms and image_file:
        image_path = "uploaded_image.jpg"
        with open(image_path, "wb") as f:
            f.write(image_file.read())

        # Process input
        text_tokens = tokenizer(symptoms, return_tensors="pt", padding=True, truncation=True)
        image = Image.open(image_path).convert("RGB")
        transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor()
        ])
        image_tensor = transform(image).unsqueeze(0)
        numerical_data = torch.tensor([[heart_rate, temperature]], dtype=torch.float32)

        with torch.no_grad():
            prediction = model(text_tokens, image_tensor, numerical_data)
            diagnosis = "Positive" if torch.argmax(prediction, dim=1).item() == 1 else "Negative"

        st.success(f"Prediction: {diagnosis}")
    else:
        st.warning("Please enter symptoms and upload an image")


Writing streamlit_app.py


In [9]:
!pip install streamlit pyngrok transformers torch torchvision pandas


Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting n

In [10]:
%%writefile setup.sh
mkdir -p ~/.streamlit/
echo "\
[server]\n\
headless = true\n\
enableCORS=false\n\
port = 8501\n\
" > ~/.streamlit/config.toml


Writing setup.sh


In [12]:
!ngrok config add-authtoken 2sh8tZRTzmljkt4YiY7hv3Qv1SI_fY1azBK9xS6q9eqN2vzp


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [14]:
!pip install pyngrok
from pyngrok import ngrok

# Kill any running processes on port 8501
!fuser -k 8501/tcp

# Start ngrok
public_url = ngrok.connect(8501, "http")
print("Ngrok Tunnel URL:", public_url)



Ngrok Tunnel URL: NgrokTunnel: "https://2e5f-34-142-228-195.ngrok-free.app" -> "http://localhost:8501"


In [None]:
!streamlit run streamlit_app.py &





Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.142.228.195:8501[0m
[0m
