In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset paths used later in the notebook resolve. force_remount=True
# re-mounts even if a mount already exists from a previous run of this cell.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import ViTFeatureExtractor, ViTForImageClassification
import torch.nn as nn
import torchvision.transforms as transforms

# Load the dataset: image folder plus an Excel sheet with one row per image.
data_path = '/content/drive/MyDrive/Colab Notebooks/Monument_dataset_new/Images'
labels_file = '/content/drive/MyDrive/Colab Notebooks/Monument_dataset_new/Groundtruth values/labels.xlsx'
labels_df = pd.read_excel(labels_file)

# Image IDs are interpolated into file names below, so force them to strings.
labels_df['Image ID'] = labels_df['Image ID'].astype(str)
# Metadata columns reported alongside a predicted monument name.
extra_columns = ['Latitude', 'Longitude', 'State', 'City', 'Description', 'Architecture Style']
images = [os.path.join(data_path, f"image{img}.png") for img in labels_df['Image ID'].tolist()]
labels = labels_df['Monument Name'].tolist()
# Enumerate the class names in sorted order. Iterating a bare set() of strings
# depends on per-process hash randomisation, so the name -> index mapping would
# change between kernel restarts and predictions would map to different
# monuments. key=str keeps the sort well-defined even if a cell is not a string.
label_mapping = {label: idx for idx, label in enumerate(sorted(set(labels), key=str))}
encoded_labels = [label_mapping[label] for label in labels]

# Split into training and testing sets (80/20, fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(images, encoded_labels, test_size=0.2, random_state=42)


# Training-time preprocessing: resize to 224x224, convert to a tensor, then
# apply light random augmentation (horizontal flip, rotation up to 10 degrees).
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10)

])

class MonumentDataset(Dataset):
    """Map-style dataset pairing image file paths with integer class labels.

    Args:
        images: Sequence of image file paths.
        labels: Sequence of labels, aligned index-for-index with ``images``.
        transform: Optional callable applied to the loaded RGB PIL image.

    Raises:
        ValueError: If ``images`` and ``labels`` differ in length.
    """

    def __init__(self, images, labels, transform=None):
        # Fail fast: a length mismatch would otherwise pair images with the
        # wrong labels or only surface as an IndexError mid-epoch.
        if len(images) != len(labels):
            raise ValueError(
                f"images and labels must have the same length "
                f"(got {len(images)} and {len(labels)})"
            )
        self.images = images
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of (image, label) pairs."""
        return len(self.images)

    def __getitem__(self, idx):
        """Load the image at ``idx`` as RGB and return ``(image, label)``."""
        image_path = self.images[idx]
        # The context manager releases the file handle deterministically;
        # convert() returns a new, fully loaded image that outlives it.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
        label = self.labels[idx]

        if self.transform is not None:
            image = self.transform(image)

        return image, label
train_dataset = MonumentDataset(X_train, y_train, transform=transform)
# The test split gets deterministic preprocessing only (no random flip or
# rotation), so the reported test accuracy is reproducible and reflects
# un-augmented images.
eval_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])
test_dataset = MonumentDataset(X_test, y_test, transform=eval_transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

#ViT
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
# ignore_mismatched_sizes lets the pretrained 1000-class head be replaced by a
# freshly initialised head with one output per monument class.
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224', num_labels=len(label_mapping), ignore_mismatched_sizes=True)

# Define the compute device explicitly (it was previously undefined in this
# notebook) and move the model before the optimizer is created, so the
# optimizer tracks the parameters that will actually be trained.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# Training loop
def train(model, train_loader, optimizer, criterion, num_epochs=5):
    """Train ``model`` on ``train_loader`` and print the mean loss per epoch.

    Args:
        model: Module whose forward output exposes a ``.logits`` attribute.
        train_loader: Iterable yielding ``(images, labels)`` tensor batches.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Callable ``criterion(logits, labels)`` returning a scalar loss.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        None. Progress is reported via ``print``.
    """
    # Run on whatever device the model already lives on instead of relying on
    # a notebook-global ``device`` that may not exist in a fresh kernel.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images).logits
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # max(..., 1) avoids a ZeroDivisionError when the loader is empty.
        mean_loss = total_loss / max(num_batches, 1)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {mean_loss:.4f}')

def get_monument_details(monument_name, labels_df, columns=None):
    """Return the metadata of the first row matching ``monument_name``.

    Args:
        monument_name: Value to match against the ``'Monument Name'`` column.
        labels_df: DataFrame containing ``'Monument Name'`` and the requested
            metadata columns.
        columns: Optional list of column names to return. Defaults to the
            notebook-level ``extra_columns`` list.

    Returns:
        dict mapping each requested column name to its value in the first
        matching row.

    Raises:
        IndexError: If no row matches ``monument_name``.
    """
    if columns is None:
        columns = extra_columns
    matches = labels_df.loc[labels_df['Monument Name'] == monument_name, list(columns)]
    if matches.empty:
        # Same exception type as the bare .iloc[0] would raise, but with a
        # message that names the missing monument.
        raise IndexError(f"No row found in labels_df for monument {monument_name!r}")
    return matches.iloc[0].to_dict()
def evaluate(model, test_loader, label_mapping, labels_df):
    """Score ``model`` on ``test_loader`` and print details for each prediction.

    Args:
        model: Module whose forward output exposes a ``.logits`` attribute.
        test_loader: Iterable yielding ``(images, labels)`` tensor batches.
        label_mapping: dict mapping monument name -> class index.
        labels_df: DataFrame passed to ``get_monument_details`` for metadata.

    Returns:
        The value returned by ``accuracy_score(true_labels, predictions)``.
    """
    # Run on the model's own device instead of a notebook-global ``device``.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    all_preds = []
    all_labels = []
    inv_label_mapping = {v: k for k, v in label_mapping.items()}  # To map index back to monument name

    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            outputs = model(images).logits
            _, preds = torch.max(outputs, 1)
            # .tolist() yields plain Python ints, so both lists are homogeneous
            # and usable as dict keys / metric inputs (previously the label
            # list was filled with 0-d tensors).
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.tolist())

    accuracy = accuracy_score(all_labels, all_preds)
    for pred in all_preds:
        monument_name = inv_label_mapping[pred]
        details = get_monument_details(monument_name, labels_df)
        print(f"Monument: {monument_name}, Details: {details}")

    return accuracy

# Training the model: 5 epochs over train_loader using the Adam optimizer and
# cross-entropy criterion created above.
train(model, train_loader, optimizer, criterion, num_epochs=5)

# Evaluating the model: this version of evaluate() prints the metadata of every
# test prediction and returns accuracy_score's result, formatted here to 4 decimals.
accuracy = evaluate(model, test_loader, label_mapping, labels_df)
print(f'Test Accuracy: {accuracy:.4f}')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([48]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([48, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/5], Loss: 2.6738
Epoch [2/5], Loss: 0.5358
Epoch [3/5], Loss: 0.1035
Epoch [4/5], Loss: 0.0397
Epoch [5/5], Loss: 0.0280
Monument: Brihadisvara Temple, Details: {'Latitude': 10.78257995, 'Longitude': 79.13138189, 'State': 'Tamil Nadu', 'City': 'Thanjavur', 'Description': "The Brihadisvara Temple, a UNESCO World Heritage Site, is a magnificent example of Dravidian architecture and a testament to the Chola dynasty's grandeur. Built in the 11th century, it features a massive temple complex with intricate sculptures, towering gopurams (gateway towers), and a monumental vimana (temple tower).", 'Architecture Style': 'The temple showcases Dravidian architecture, characterized by its towering vimana, intricate stone carvings, and grand scale, reflecting the artistic and engineering prowess of the Chola period.'}
Monument: Sanchi Stupa, Details: {'Latitude': 23.4806919, 'Longitude': 77.73629684, 'State': 'Madhya Pradesh', 'City': 'The Sanchi Stupa is a UNESCO World Heritage Site, reno

In [None]:
from PIL import Image
import torch
import torchvision.transforms as transforms

#preprocess the input image
def preprocess_image(image_path):
    """Load an image file and turn it into a single-item batch tensor.

    The image is read as RGB, resized to 224x224 and converted to a tensor;
    a leading batch dimension is then added via ``unsqueeze(0)``.
    """
    rgb_image = Image.open(image_path).convert('RGB')
    to_model_input = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])
    batched = to_model_input(rgb_image).unsqueeze(0)
    return batched
def predict_monument(image_path, model, label_mapping, labels_df):
    """Classify one image file and look up the predicted monument's metadata.

    Args:
        image_path: Path of the image to classify.
        model: Module whose forward output exposes a ``.logits`` attribute.
        label_mapping: dict mapping monument name -> class index.
        labels_df: DataFrame passed to ``get_monument_details`` for metadata.

    Returns:
        Tuple ``(predicted_monument_name, details)`` where ``details`` is the
        dict returned by ``get_monument_details``.
    """
    # Run on the model's own device instead of a notebook-global ``device``,
    # which is not defined in a fresh kernel.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    image = preprocess_image(image_path).to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(image).logits
        _, predicted_label_idx = torch.max(outputs, 1)
    predicted_label_idx = predicted_label_idx.item()
    # Invert name -> index so the predicted index maps back to a name.
    inv_label_mapping = {v: k for k, v in label_mapping.items()}
    predicted_monument_name = inv_label_mapping[predicted_label_idx]
    # Retrieve monument details
    details = get_monument_details(predicted_monument_name, labels_df)

    return predicted_monument_name, details
# Example: classify a single image file from the Colab session storage and
# print the predicted monument name together with its metadata row.
image_path = '/content/tajamahal.jpg'
predicted_monument, monument_details = predict_monument(image_path, model, label_mapping, labels_df)

print(f"Predicted Monument: {predicted_monument}")
print(f"Details: {monument_details}")


Predicted Monument: Taj Mahal Agra
Details: {'Latitude': 27.1750123, 'Longitude': 78.0420968366131, 'State': 'Uttara Pradesh', 'City': 'Agra', 'Description': 'The Taj Mahal is a stunning white marble mausoleum located in Agra, India. Built by Mughal Emperor Shah Jahan between 1632 and 1653, it was constructed in memory of his wife, Mumtaz Mahal.', 'Architecture Style': 'Mughal architecture with influences from Persian, Islamic, and Indian architectural elements'}


In [None]:

# Predict on a second image from session storage (the recorded output below
# shows it being classified as Mahabalipuram). Note this rebinds the
# notebook-level image_path / predicted_monument / monument_details names.
image_path = '/content/1729521408129.jpg'
predicted_monument, monument_details = predict_monument(image_path, model, label_mapping, labels_df)

print(f"Predicted Monument: {predicted_monument}")
print(f"Details: {monument_details}")


Predicted Monument: Mahabalipuram 
Details: {'Latitude': 12.6208, 'Longitude': 80.1945, 'State': 'Tamil Nadu', 'City': 'Mahabalipuram (Mamallapuram)', 'Description': 'Mahabalipuram is a UNESCO World Heritage Site known for its ancient rock-cut temples, intricate stone carvings, and the famous Shore Temple, all dating back to the Pallava dynasty. It features monuments like cave temples, monolithic rathas, and bas-relief sculptures.', 'Architecture Style': 'The site showcases Dravidian architecture with rock-cut and monolithic temple designs, influenced by Pallava artistry.'}


In [None]:

# Predict on a Golconda Fort photo from session storage and print the result.
# Note this rebinds the notebook-level image_path / predicted_monument /
# monument_details names used by the earlier example cells.
image_path = '/content/golcondafort.jpg'
predicted_monument, monument_details = predict_monument(image_path, model, label_mapping, labels_df)

print(f"Predicted Monument: {predicted_monument}")
print(f"Details: {monument_details}")


Predicted Monument: Golconda Fort Hyderabad
Details: {'Latitude': 17.3833665, 'Longitude': 78.4010991, 'State': 'Telangana', 'City': 'Hyderabad', 'Description': 'Golconda Fort is a historic fortress renowned for its impressive architecture, sprawling complex, and strategic location atop a hill. Built during the Qutb Shahi dynasty, it features magnificent gates, palaces, and a unique acoustic system, showcasing the ingenuity of ancient engineering.', 'Architecture Style': 'The fort exhibits Indo-Islamic architecture, characterized by its massive stone walls, intricate carvings, and a blend of Persian and Indian architectural elements.'}


In [None]:
# Training loop function. This redefinition replaces the earlier train() and
# additionally tracks and prints the running training accuracy per epoch.
def train(model, train_loader, optimizer, criterion, num_epochs=5):
    """Train ``model`` and print mean loss and training accuracy per epoch.

    Args:
        model: Module whose forward output exposes a ``.logits`` attribute.
        train_loader: Iterable yielding ``(images, labels)`` tensor batches.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Callable ``criterion(logits, labels)`` returning a scalar loss.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        None. Progress is reported via ``print``.
    """
    # Run on the model's own device instead of a notebook-global ``device``,
    # which is not defined in a fresh kernel.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        correct_train = 0
        total_train = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images).logits
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

            # Accuracy is accumulated from the same forward pass used for the
            # update, i.e. measured while the weights are still changing.
            _, preds = torch.max(outputs, 1)
            correct_train += (preds == labels).sum().item()
            total_train += labels.size(0)
        # max(..., 1) avoids a ZeroDivisionError when the loader is empty.
        mean_loss = total_loss / max(num_batches, 1)
        train_accuracy = correct_train / max(total_train, 1) * 100
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {mean_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%')
# Evaluation function
def evaluate(model, data_loader, label_mapping):
    """Return the classification accuracy of ``model`` on ``data_loader`` in percent.

    This redefinition shadows the earlier four-argument ``evaluate``. It
    prints nothing and returns a percentage (0-100), not the result of
    ``accuracy_score``.

    Args:
        model: Module whose forward output exposes a ``.logits`` attribute.
        data_loader: Iterable yielding ``(images, labels)`` tensor batches.
        label_mapping: Unused; kept so existing call sites keep working.

    Returns:
        float: ``correct / total * 100``, or ``0.0`` if the loader yields no samples.
    """
    # Run on the model's own device instead of a notebook-global ``device``,
    # which is not defined in a fresh kernel.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in data_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images).logits
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        # Nothing was evaluated; report 0% rather than dividing by zero.
        return 0.0
    return correct / total * 100
# Train the model for 3 more epochs. The same model and optimizer objects from
# the earlier training cell are reused, so this continues from those weights
# rather than starting over.
train(model, train_loader, optimizer, criterion, num_epochs=3)
# Training accuracy in percent, as returned by the evaluate() defined in this cell.
train_accuracy = evaluate(model, train_loader, label_mapping)
print(f'Training Accuracy: {train_accuracy:.2f}%')
# Test accuracy in percent on the held-out split.
test_accuracy = evaluate(model, test_loader, label_mapping)
print(f'Test Accuracy: {test_accuracy:.2f}%')


Epoch [1/3], Loss: 0.0164, Train Accuracy: 99.22%
Epoch [2/3], Loss: 0.0069, Train Accuracy: 99.61%
Epoch [3/3], Loss: 0.0054, Train Accuracy: 99.74%
Training Accuracy: 99.74%
Test Accuracy: 90.16%
