In [45]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
df = pd.read_csv('../input/galaxy-zoo-the-galaxy-challenge/training_solutions_rev1.zip',compression='zip')
df_train, df_test = train_test_split(df, test_size=.2)

In [4]:
# !unzip -q -n ../input/galaxy-zoo-the-galaxy-challenge/images_training_rev1.zip -d ../temp/
# !unzip -q -n ../input/galaxy-zoo-the-galaxy-challenge/images_test_rev1.zip -d ../temp/

In [5]:
class CustomDataset(Dataset):
    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx])
        label = self.labels[idx]
        if self.transform:
            image = self.transform(image)
        return image, torch.tensor(label, dtype=torch.float32)

In [6]:
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])

In [7]:
image_paths = [f'../temp/images_training_rev1/{id}.jpg' for id in df_train.GalaxyID]
labels = df_train.values[:,1:]

In [8]:
train_image_paths, val_image_paths, train_labels, val_labels = train_test_split(
    image_paths, labels, test_size=0.2, random_state=42)

In [9]:
train_dataset = CustomDataset(train_image_paths, train_labels, transform=transform)
val_dataset = CustomDataset(val_image_paths, val_labels, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [10]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(64 * 32 * 32, 128)
        self.fc2 = nn.Linear(128, 37)  

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.max_pool2d(x, kernel_size=2, stride=2)
        x = torch.relu(self.conv2(x))
        x = torch.max_pool2d(x, kernel_size=2, stride=2)
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = CNN().to(device) 

criterion = nn.MSELoss() 
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [11]:
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for images, labels in tqdm(train_loader):
        images, labels = images.to(device), labels.to(device)  # Transferir dados para GPU
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validar o Modelo
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for images, labels in (val_loader):
            images, labels = images.to(device), labels.to(device)  # Transferir dados para GPU
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}')

Epoch 1/10, Train Loss: 0.0229, Val Loss: 0.0185
Epoch 2/10, Train Loss: 0.0174, Val Loss: 0.0169
Epoch 3/10, Train Loss: 0.0156, Val Loss: 0.0155
Epoch 4/10, Train Loss: 0.0142, Val Loss: 0.0149
Epoch 5/10, Train Loss: 0.0131, Val Loss: 0.0148
Epoch 6/10, Train Loss: 0.0119, Val Loss: 0.0146
Epoch 7/10, Train Loss: 0.0110, Val Loss: 0.0147
Epoch 8/10, Train Loss: 0.0100, Val Loss: 0.0151
Epoch 9/10, Train Loss: 0.0092, Val Loss: 0.0153
Epoch 10/10, Train Loss: 0.0085, Val Loss: 0.0155


In [12]:
test_image_paths = [f'../temp/images_training_rev1/{id}.jpg' for id in df_test.GalaxyID]
test_labels = df_test.values[:,1:]

test_dataset = CustomDataset(test_image_paths, test_labels, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [13]:
model.eval()
true_labels = []
predicted_labels = []

with torch.no_grad():
    for images, labels in tqdm(test_loader):
        images, labels = images.to(device), labels.to(device)  # Transferir dados para GPU
        outputs = model(images)
        true_labels.extend(labels.cpu().numpy())  # Transferir para CPU e converter para numpy
        predicted_labels.extend(outputs.cpu().numpy())  # Transferir para CPU e converter para numpy

# Calcular RMSE
rmse = np.sqrt(mean_squared_error(true_labels, predicted_labels))
print(f'RMSE: {rmse:.4f}')

RMSE: 0.1254


In [42]:
sub_image = os.listdir(f'../temp/images_training_rev1/')

In [48]:
predictions = []

for image in tqdm(sub_image):
    
    image = Image.open(f'../temp/images_training_rev1/{image}')
    image = transform(image).unsqueeze(0).to(device)
    
    with torch.no_grad():
        outputs = model(image)
        predictions.append(outputs.cpu().numpy())

100%|██████████| 61578/61578 [03:07<00:00, 328.19it/s]


In [49]:
results = np.vstack(predictions)
ids = np.array([image.split('.')[0] for image in sub_image]).reshape(len(sub_image),1)

In [50]:
submission_df = pd.DataFrame(np.hstack((ids, results)), columns=df.columns)
submission_df

Unnamed: 0,GalaxyID,Class1.1,Class1.2,Class1.3,Class2.1,Class2.2,Class3.1,Class3.2,Class4.1,Class4.2,...,Class9.3,Class10.1,Class10.2,Class10.3,Class11.1,Class11.2,Class11.3,Class11.4,Class11.5,Class11.6
0,316396,0.25140926,0.73544526,0.021512754,-0.005046852,0.73622984,0.17490752,0.56289524,0.3972684,0.33886254,...,0.0036881417,0.19516891,0.15254283,0.04896471,0.032731473,0.15074605,0.02857461,0.01566343,0.007928741,0.16173345
1,309407,0.27799,0.61938334,0.05975744,0.19408818,0.43070728,0.14573798,0.28211164,0.16111428,0.27037263,...,0.06577088,0.09476547,0.03584744,0.030667888,0.0063716467,-0.026570346,0.01768407,0.021411162,0.023656953,0.11952995
2,918573,0.0865525,0.94110405,0.009008199,0.007858038,0.9203775,0.24745876,0.67919,0.76313835,0.15840581,...,0.0137341805,0.42113173,0.31372386,0.027039258,0.03751821,0.24074501,0.14301467,0.07811347,0.05468128,0.21595168
3,600583,0.4514381,0.53489673,0.036567137,0.026785381,0.50002503,0.113543525,0.38848013,0.16974342,0.33010566,...,0.019803613,0.082046345,0.052284453,0.036955252,0.02669957,0.050083537,0.0022677877,0.0038646208,0.0024500247,0.08588077
4,394852,0.22070347,0.76831174,0.026037782,0.05713151,0.7054758,0.28153786,0.42637676,0.46715325,0.23864347,...,0.0185281,0.19373788,0.19598621,0.07537415,0.033019584,0.20908518,0.050750613,0.028047383,0.019387491,0.1312769
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61573,260502,0.07633561,0.88038313,0.02595371,0.05191854,0.8287275,0.4239653,0.4041868,0.5658824,0.26281548,...,0.026837815,0.20902649,0.24210799,0.11320618,0.030238578,0.3613566,0.031590205,0.014961265,0.008993219,0.1157233
61574,571868,0.45025462,0.60259557,0.007774979,0.6537857,-0.06922579,-0.068014465,0.0039040148,-0.035109155,-0.03636819,...,0.1634573,0.0077272607,0.001251325,-0.039685577,-0.00434657,-0.03531822,0.01567223,0.012193995,0.011465375,-0.01657144
61575,507526,0.6346317,0.32117584,0.035936877,0.0022550076,0.31573614,0.047933854,0.26801634,0.07190273,0.24329188,...,0.008938387,0.049301766,0.013458952,0.01245893,0.009571936,0.009524677,-0.0073551224,0.0037848921,0.0062581906,0.053856958
61576,174930,0.2824834,0.646804,0.0508044,0.0007765591,0.64838254,0.13410157,0.5125501,0.3037531,0.3441794,...,-0.003860671,0.14559716,0.11480572,0.044228666,0.03920914,0.077269346,0.03081248,0.02052565,0.015832024,0.12736684


In [51]:
submission_df = submission_df.sort_values(by=['GalaxyID'])
submission_df.to_csv('sample_submission.csv', index=False)