In [45]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [53]:
df = pd.read_csv('../input/galaxy-zoo-the-galaxy-challenge/training_solutions_rev1.zip',compression='zip')
df_train, df_test = train_test_split(df, test_size=.2)
df_train

Unnamed: 0,GalaxyID,Class1.1,Class1.2,Class1.3,Class2.1,Class2.2,Class3.1,Class3.2,Class4.1,Class4.2,...,Class9.3,Class10.1,Class10.2,Class10.3,Class11.1,Class11.2,Class11.3,Class11.4,Class11.5,Class11.6
5541,182213,0.314014,0.673168,0.012818,0.000000,0.673168,0.040196,0.632972,0.111451,0.561717,...,0.000000,0.000000,0.078435,0.033016,0.000000,0.000000,0.0,0.033347,0.0,0.078104
21767,417909,0.280160,0.719840,0.000000,0.380317,0.339523,0.000000,0.339523,0.000000,0.339523,...,0.087453,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
56620,927058,0.022799,0.976792,0.000410,0.272120,0.704672,0.135305,0.569368,0.633682,0.070991,...,0.029363,0.043902,0.205812,0.383967,0.022656,0.438874,0.0,0.000000,0.0,0.172151
49079,814508,0.072447,0.927553,0.000000,0.000000,0.927553,0.162812,0.764741,0.025051,0.902502,...,0.000000,0.000000,0.025051,0.000000,0.000000,0.025051,0.0,0.000000,0.0,0.000000
48760,809872,0.570357,0.391818,0.037825,0.009470,0.382348,0.000000,0.382348,0.000000,0.382348,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25061,466032,0.572583,0.400265,0.027152,0.000000,0.400265,0.000000,0.400265,0.000000,0.400265,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
13771,304298,0.913679,0.066109,0.020211,0.000000,0.066109,0.000000,0.066109,0.000000,0.066109,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
49399,819310,0.143202,0.856798,0.000000,0.856798,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.077832,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
10870,261316,0.908861,0.051148,0.039991,0.000000,0.051148,0.000000,0.051148,0.000000,0.051148,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000


In [61]:
!unzip -q -n ../input/galaxy-zoo-the-galaxy-challenge/images_training_rev1.zip -d ../temp/
!unzip -q -n ../input/galaxy-zoo-the-galaxy-challenge/images_test_rev1.zip -d ../temp/

In [5]:
class CustomDataset(Dataset):
    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx])
        label = self.labels[idx]
        if self.transform:
            image = self.transform(image)
        return image, torch.tensor(label, dtype=torch.float32)

In [6]:
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])

In [7]:
image_paths = [f'../temp/images_training_rev1/{id}.jpg' for id in df_train.GalaxyID]
labels = df_train.values[:,1:]

In [8]:
train_image_paths, val_image_paths, train_labels, val_labels = train_test_split(
    image_paths, labels, test_size=0.2, random_state=42)

In [9]:
train_dataset = CustomDataset(train_image_paths, train_labels, transform=transform)
val_dataset = CustomDataset(val_image_paths, val_labels, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [10]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(64 * 32 * 32, 128)
        self.fc2 = nn.Linear(128, 37)  

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.max_pool2d(x, kernel_size=2, stride=2)
        x = torch.relu(self.conv2(x))
        x = torch.max_pool2d(x, kernel_size=2, stride=2)
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = CNN().to(device) 

criterion = nn.MSELoss() 
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [11]:
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for images, labels in tqdm(train_loader):
        images, labels = images.to(device), labels.to(device)  # Transferir dados para GPU
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validar o Modelo
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for images, labels in (val_loader):
            images, labels = images.to(device), labels.to(device)  # Transferir dados para GPU
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}')

Epoch 1/10, Train Loss: 0.0229, Val Loss: 0.0185
Epoch 2/10, Train Loss: 0.0174, Val Loss: 0.0169
Epoch 3/10, Train Loss: 0.0156, Val Loss: 0.0155
Epoch 4/10, Train Loss: 0.0142, Val Loss: 0.0149
Epoch 5/10, Train Loss: 0.0131, Val Loss: 0.0148
Epoch 6/10, Train Loss: 0.0119, Val Loss: 0.0146
Epoch 7/10, Train Loss: 0.0110, Val Loss: 0.0147
Epoch 8/10, Train Loss: 0.0100, Val Loss: 0.0151
Epoch 9/10, Train Loss: 0.0092, Val Loss: 0.0153
Epoch 10/10, Train Loss: 0.0085, Val Loss: 0.0155


In [12]:
test_image_paths = [f'../temp/images_training_rev1/{id}.jpg' for id in df_test.GalaxyID]
test_labels = df_test.values[:,1:]

test_dataset = CustomDataset(test_image_paths, test_labels, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [13]:
model.eval()
true_labels = []
predicted_labels = []

with torch.no_grad():
    for images, labels in tqdm(test_loader):
        images, labels = images.to(device), labels.to(device)  # Transferir dados para GPU
        outputs = model(images)
        true_labels.extend(labels.cpu().numpy())  # Transferir para CPU e converter para numpy
        predicted_labels.extend(outputs.cpu().numpy())  # Transferir para CPU e converter para numpy

# Calcular RMSE
rmse = np.sqrt(mean_squared_error(true_labels, predicted_labels))
print(f'RMSE: {rmse:.4f}')

RMSE: 0.1254


In [67]:
predictions = []

for image in tqdm(sub_image):
    
    image = Image.open(f'../temp/images_test_rev1/{image}')
    image = transform(image).unsqueeze(0).to(device)
    
    with torch.no_grad():
        outputs = model(image)
        predictions.append(outputs.cpu().numpy())

100%|██████████| 79975/79975 [04:00<00:00, 332.52it/s]


In [70]:
results = np.vstack(predictions)
ids = np.array([image.split('.')[0] for image in sub_image]).reshape(len(sub_image),1)

In [71]:
submission_df = pd.DataFrame(np.hstack((ids, results)), columns=df.columns)
submission_df

Unnamed: 0,GalaxyID,Class1.1,Class1.2,Class1.3,Class2.1,Class2.2,Class3.1,Class3.2,Class4.1,Class4.2,...,Class9.3,Class10.1,Class10.2,Class10.3,Class11.1,Class11.2,Class11.3,Class11.4,Class11.5,Class11.6
0,147518,0.60341775,0.3923319,0.04699923,0.0035967827,0.37197778,0.021889295,0.35561067,0.028494693,0.34310257,...,-0.00017979741,0.079163976,-0.02275611,-0.018549772,0.020238195,-0.065236405,-0.00696285,0.003627223,0.0084882295,0.074638
1,878885,0.73695564,0.25120664,0.031072862,-0.007894658,0.25182652,0.028925687,0.22521822,0.05026113,0.20139469,...,0.0062233657,0.05294822,-0.0002432093,-0.00028851442,0.0063531725,-0.007439574,-0.006401783,0.001338019,0.0003002975,0.055948485
2,795253,0.21835218,0.7503642,0.03126426,0.06008044,0.6866957,0.21427307,0.4727007,0.26677775,0.41893423,...,0.019132275,0.10383281,0.09579985,0.06862358,0.035745006,0.1258452,-0.0028724694,0.0027348083,0.006049863,0.10379934
3,639030,0.31407064,0.70913076,0.0066841394,0.36421764,0.3355069,0.104114726,0.23378691,0.10451826,0.22903907,...,0.11166996,0.051819265,0.032024775,0.022631198,0.006870771,0.049373064,-0.007193111,0.0017117113,0.0062877433,0.054918453
4,954128,0.69083947,0.2974718,0.029150628,0.002166368,0.28816986,0.01194033,0.27832627,0.07493143,0.21343857,...,0.017537352,0.072330914,0.0076336786,-0.0037610214,0.0049967486,-0.018080281,0.0023128767,0.006407679,0.0031792019,0.074549705
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79970,320772,0.7385564,0.23948446,0.033939198,-0.05837129,0.2914592,0.007838912,0.28527337,0.081411146,0.20936808,...,-0.014421634,0.06546872,0.014244679,0.0054080747,0.018204942,0.022970658,-0.008550377,0.0007462185,0.0019122809,0.051902078
79971,100667,0.47654408,0.47964305,0.05248192,0.016748734,0.4535399,0.20691663,0.2489601,0.24557582,0.2085784,...,0.017353106,0.093683206,0.0904682,0.06729835,0.016410973,0.18051976,-0.0060551115,-0.0006534355,0.0013352279,0.05316134
79972,526907,0.3111856,0.59311724,0.04973662,0.0803811,0.52136105,0.112195976,0.40500575,0.2261785,0.29510313,...,0.029445702,0.12190879,0.07251358,0.030453367,0.021684876,0.04420991,0.0102213975,0.011646541,0.010189475,0.13014795
79973,839285,0.18051282,0.81715965,0.020294867,-0.008295678,0.81662977,0.04144335,0.778825,0.5508606,0.26620516,...,0.012326766,0.29661098,0.21650313,0.037416734,0.046565883,0.13681057,0.09373801,0.054372385,0.040323466,0.1882671


In [75]:
submission_df = submission_df.sort_values(by=['GalaxyID'])
submission_df.to_csv('submission.csv', index=False)