In [16]:
import os
import torch
import numpy as np
from pathlib import Path
from torchvision import datasets
from torch.utils.data import DataLoader
import open_clip
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import pickle

In [17]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [18]:
# Build the OpenCLIP ViT-B/16 model with the LAION-2B checkpoint. The factory
# returns three values; only the first (model) and third (image transform)
# are kept here.
model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-B-16',
    pretrained='laion2b_s34b_b88k',
)

# Move the weights to the selected device.
model = model.to(device)

# Switch to evaluation mode. As the last expression of the cell, this also
# displays the module's repr.
model.eval()

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [19]:
# Root folder holding the `train` and `val` image directories. The default is
# the original workstation location; set the CLIP_IMAGE_ROOT environment
# variable to run the notebook on another machine without editing this cell.
IMAGE_ROOT = Path(os.environ.get("CLIP_IMAGE_ROOT", r"E:\POC_Jain_Irrigation\data\images"))

train_dir = IMAGE_ROOT / "train"
fw_test_dir = IMAGE_ROOT / "val"  # forward test

In [20]:
def extract_embeddings(folder_path, batch_size=16):
    """Encode every image under ``folder_path`` into unit-length image features.

    The folder is read with ``datasets.ImageFolder`` using the notebook-level
    ``preprocess`` transform, and each batch is passed through
    ``model.encode_image`` on ``device`` with gradients disabled.

    Parameters
    ----------
    folder_path : str or pathlib.Path
        Directory to read; its sub-folders are treated as classes by
        ``ImageFolder``.
    batch_size : int, default 16
        Number of images encoded per forward pass.

    Returns
    -------
    embeddings : numpy.ndarray
        Row-stacked, L2-normalised features, one row per image, in dataset
        order (the loader does not shuffle).
    labels : numpy.ndarray
        Class index of each image, aligned with ``embeddings``.
    class_names : list
        ``dataset.classes`` of the underlying ``ImageFolder``.
    """
    # Accept plain strings as well as Path objects: `.name` is needed for the
    # progress-bar label below and does not exist on str.
    folder_path = Path(folder_path)

    dataset = datasets.ImageFolder(folder_path, transform=preprocess)
    # shuffle=False keeps embeddings and labels in dataset order.
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    all_embeddings, all_labels = [], []

    with torch.no_grad():
        for imgs, lbls in tqdm(loader, desc=f"Extracting from {folder_path.name}"):
            imgs = imgs.to(device)
            feats = model.encode_image(imgs)
            # Scale each feature vector to unit L2 norm.
            feats = feats / feats.norm(dim=-1, keepdim=True)
            all_embeddings.append(feats.cpu().numpy())
            all_labels.extend(lbls.numpy())

    embeddings = np.vstack(all_embeddings)
    labels = np.array(all_labels)
    class_names = dataset.classes
    return embeddings, labels, class_names


In [21]:
# Run the encoder over every image in the training directory.
X_all, y_all, class_names = extract_embeddings(folder_path=train_dir)
print(f"Total embeddings shape: {X_all.shape}")

Extracting from train: 100%|███████████████████████████████████████████████████████████| 21/21 [00:15<00:00,  1.40it/s]

Total embeddings shape: (332, 512)





In [22]:
# --- Step 2: hold out 30% of the TRAIN embeddings as a test split ---
# The split is stratified on the labels and uses a fixed seed so it is
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.3, random_state=42, stratify=y_all
)
print(f"Train: {X_train.shape} Test: {X_test.shape}")

Train: (232, 512) Test: (100, 512)


In [23]:
# Embed the forward-test folder with the same encoder.
X_fw, y_fw, fw_class_names = extract_embeddings(fw_test_dir)

# Label indices come from each folder's own class sub-directories. The forward
# labels are only comparable with the training ones (and with `class_names`)
# when both folders expose the same classes in the same order.
if list(fw_class_names) != list(class_names):
    raise ValueError(
        "Forward-test class folders do not match the training class folders: "
        f"{fw_class_names} != {class_names}"
    )
print("Forward Test:", X_fw.shape)

Extracting from val: 100%|███████████████████████████████████████████████████████████████| 7/7 [00:04<00:00,  1.70it/s]

Forward Test: (97, 512)





In [24]:
# Make sure the output directory exists before writing into it.
os.makedirs("embeddings", exist_ok=True)

# Each split is pickled as an (embeddings, labels, class_names) tuple.
pickle_targets = [
    ("embeddings/clip_train.pkl", (X_train, y_train, class_names)),
    ("embeddings/clip_test.pkl", (X_test, y_test, class_names)),
    ("embeddings/clip_forward.pkl", (X_fw, y_fw, class_names)),
]
for pkl_path, payload in pickle_targets:
    with open(pkl_path, "wb") as f:
        pickle.dump(payload, f)

print("✅ Saved embeddings to /embeddings folder")

✅ Saved embeddings to /embeddings folder


In [25]:
import pandas as pd
import numpy as np

# Forward-test embeddings as a table: one row per image, one column per
# feature dimension, with the class index appended as a final "label" column.
df_fw = pd.DataFrame(X_fw).assign(label=y_fw)

# Write the table to CSV without the row index.
df_fw.to_csv("embeddings/clip_forward.csv", index=False)

print("✅ Saved embeddings/clip_forward.csv")

✅ Saved embeddings/clip_forward.csv


In [26]:
import pandas as pd
import numpy as np

# Training-split embeddings as a table: one row per image, one column per
# feature dimension, with the class index appended as a final "label" column.
df_train = pd.DataFrame(X_train).assign(label=y_train)

# Write the table to CSV without the row index.
df_train.to_csv("embeddings/clip_train.csv", index=False)

print("✅ Saved embeddings/clip_train.csv")

✅ Saved embeddings/clip_train.csv


In [27]:
import pandas as pd
import numpy as np

# Held-out test-split embeddings as a table: one row per image, one column per
# feature dimension, with the class index appended as a final "label" column.
df_test = pd.DataFrame(X_test).assign(label=y_test)

# Write the table to CSV without the row index.
df_test.to_csv("embeddings/clip_test.csv", index=False)

print("✅ Saved embeddings/clip_test.csv")

✅ Saved embeddings/clip_test.csv
