# Imports

In [1]:
from typing import Tuple, List

# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

# import machine learning packages
import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import MNIST, FashionMNIST

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch
import torchvision




# Data loading

In [2]:
# Read every split from its CSV file and convert it straight to a NumPy array
X_train_all = pd.read_csv('train_set.csv').to_numpy()
X_val_all = pd.read_csv('val_set.csv').to_numpy()
X_test_all = pd.read_csv('test_set.csv').to_numpy()

# Report the shape of each split
for header, split in (
    ('Training set shape:', X_train_all),
    ('\nVal set shape:', X_val_all),
    ('\nTest set shape:', X_test_all),
):
    print(header)
    print(f'X: {split.shape} ')


Training set shape:
X: (12646, 111) 

Val set shape:
X: (1500, 111) 

Test set shape:
X: (3000, 110) 


# Data reshape and transformation

In [3]:
# Split the training array into inputs (X_train) and target (y_train):
# columns 0..109 are the features, column 110 is the target.
X_train = X_train_all[:, 0:110].astype(np.float32)
y_train = X_train_all[:, 110].astype(np.float32)

# Same split for the validation array (X_val / y_val).
# These data play the role of a test set in the usual conventions, but we use them
# to get an idea of the performance before having the test set evaluated on AIcrowd.
X_val = X_val_all[:, 0:110].astype(np.float32)
y_val = X_val_all[:, 110].astype(np.float32)

# The test array is used whole as model input (no target column is split off).
X_test = X_test_all.astype(np.float32)

# Wrap the NumPy arrays in TensorDatasets.
# The targets are unsqueezed to shape (N, 1) to avoid this warning from the loss:
#   UserWarning: Using a target size (torch.Size([1])) that is different to the input size
#   (torch.Size([1, 1])). This will likely lead to incorrect results due to broadcasting.
# The datasets are deliberately not named `train` / `validation`: functions with those
# names are defined further down, and re-running this cell would otherwise overwrite them.
train_dataset = torch.utils.data.TensorDataset(
    torch.from_numpy(X_train),
    torch.unsqueeze(torch.from_numpy(y_train), 1),
)
validation_dataset = torch.utils.data.TensorDataset(
    torch.from_numpy(X_val),
    torch.unsqueeze(torch.from_numpy(y_val), 1),
)

test = torch.from_numpy(X_test)

# Batch the datasets. Training batches are reshuffled on every pass over the loader;
# the validation loss does not depend on the sample order, so that loader is not shuffled.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=12, shuffle=True)
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=1, shuffle=False)



# Neural network

To reduce overfitting, the model applies dropout (p = 0.1) just before its output layer.

In [4]:
# define our model

class model(nn.Module):
    """Fully connected regression network: 110 inputs -> 66 -> 20 -> 1 output.

    Both hidden layers use the softsign activation; dropout with probability 0.1
    is applied to the second hidden representation before the output layer.
    """

    def __init__(self):
        super().__init__()
        # hidden layers
        self.fc1 = nn.Linear(in_features=110, out_features=66)
        self.fc2 = nn.Linear(in_features=66, out_features=20)
        # regularisation applied before the output layer
        self.dropout = nn.Dropout(p=0.1)
        # output layer
        self.fc3 = nn.Linear(in_features=20, out_features=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a batch of inputs (last dimension 110) to predictions (last dimension 1)."""
        hidden = F.softsign(self.fc2(F.softsign(self.fc1(x))))
        return self.fc3(self.dropout(hidden))


# from here on the name `model` refers to the network instance, not the class
model = model()

# Loss & optimizer

In [5]:
# Mean squared error averaged over the batch.
# The deprecated `size_average` / `reduce` arguments are no longer passed;
# `reduction='mean'` alone selects the same behaviour.
loss_fn = nn.MSELoss(reduction='mean')

# RMSprop over all model parameters.
optimizer = optim.RMSprop(model.parameters(), lr=0.0011)

# Alternative optimizer settings that used to be kept here as commented-out lines:
#   Adadelta(lr=0.005, rho=0.95), AdamW(lr=0.003), Adagrad(lr=0.004, lr_decay=0.00001),
#   RMSprop(lr=0.01, alpha=0.99), Adam(lr=0.003)


# Loss metric function 

The `LossMetric` class below is taken from a course exercise: https://github.com/vita-epfl/introML-2021/blob/main/exercises/06-neural-nets/metrics.py

In [6]:
# compute the loss and keeps track of the loss over an epoch
class LossMetric:
    """Sample-weighted running average of a loss.

    Each call to `update` adds a batch's mean loss weighted by the batch size;
    `compute` returns the accumulated loss divided by the number of samples seen.
    """

    def __init__(self) -> None:
        # start from an empty accumulator
        self.reset()

    def reset(self) -> None:
        """Forget everything accumulated so far."""
        self.running_loss, self.count = 0, 0

    def update(self, loss: float, batch_size: int) -> None:
        """Add one batch: `loss` is weighted by `batch_size`."""
        self.count += batch_size
        self.running_loss += loss * batch_size

    def compute(self) -> float:
        """Return the average loss per sample (fails with ZeroDivisionError if nothing was added)."""
        return self.running_loss / self.count

# Model training

In [7]:
def train(
    model: torch.nn.Module,
    train_loader : torch.utils.data.DataLoader,
    loss_fn: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    epochs: int,
):
    """Train `model` for `epochs` passes over `train_loader`.

    For every batch the gradients are cleared, the loss between the model output
    and the target is back-propagated and the optimizer takes one step. At the end
    of each epoch the sample-weighted mean training loss is printed.

    The model is switched to training mode and left in that mode; nothing is returned.
    """
    # put the module in training mode
    model.train()

    # running average of the batch losses, emptied after every epoch
    epoch_loss = LossMetric()

    for _ in range(epochs):

        for inputs, targets in train_loader:
            # clear the gradients left over from the previous step
            optimizer.zero_grad()

            # forward pass and loss
            batch_loss = loss_fn(model(inputs), targets)

            # backward pass followed by the parameter update
            batch_loss.backward()
            optimizer.step()

            # weight the batch loss by the number of samples in the batch
            epoch_loss.update(batch_loss.item(), inputs.shape[0])

        # end of epoch: report the loss, then start the next epoch from zero
        print("Train loss :"+ str(epoch_loss.compute()))
        epoch_loss.reset()

In [8]:
# Train for 50 epochs; train() prints the mean training loss at the end of every epoch.
train(model, train_loader, loss_fn, optimizer, epochs=50)

Train loss :0.241384721259617
Train loss :0.19489124605484293
Train loss :0.1864510549156656
Train loss :0.18483087034030238
Train loss :0.1795970274878702
Train loss :0.17786516900207167
Train loss :0.17602475121718653
Train loss :0.17366573872156918
Train loss :0.17273469367077054
Train loss :0.17077453625828107
Train loss :0.16885798047711328
Train loss :0.16878369662815845
Train loss :0.16732871333184582
Train loss :0.1662424870828709
Train loss :0.16480386687203163
Train loss :0.16553187980174444
Train loss :0.16302973325666975
Train loss :0.16382104033940384
Train loss :0.16116583564310027
Train loss :0.16106468824592832
Train loss :0.15868962058043642
Train loss :0.1598027742248504
Train loss :0.15897073837861161
Train loss :0.15845278814192268
Train loss :0.15830271278403282
Train loss :0.15744320984570997
Train loss :0.15810405183303644
Train loss :0.15650036613743093
Train loss :0.15534775525761
Train loss :0.15436825447053157
Train loss :0.15461603258286458
Train loss :0.155

# Model evaluation

In [9]:
def validation(model: torch.nn.Module, validation_loader : torch.utils.data.DataLoader) -> float:
    """Compute, print and return the mean squared error of `model` over a loader.

    The squared error of every predicted value is accumulated in float64 over all
    batches and divided by the number of values, so the result does not depend on
    the batch size or on the order of the samples. For a single-output model this
    is the mean loss per sample.

    Args:
        model: network to evaluate. It is switched to evaluation mode (dropout
            disabled) and is left in that mode afterwards.
        validation_loader: iterable yielding `(data, target)` batches.

    Returns:
        The mean squared error as a Python float (also printed).

    Raises:
        ZeroDivisionError: if the loader yields no samples.
    """
    squared_error = nn.MSELoss(reduction='none')
    model.eval()

    total_error = 0.0
    n_values = 0

    # gradients are not needed for evaluation
    with torch.no_grad():

        for data, target in validation_loader:
            errors = squared_error(model(data), target)
            # sum in float64 so that long loaders do not lose precision
            total_error += errors.sum(dtype=torch.float64).item()
            n_values += errors.numel()

    # mean of the loss
    final_loss = total_error / n_values
    print(final_loss)
    return final_loss
    

In [10]:
# Evaluate the trained model on the validation loader; validation() prints the mean loss.
validation(model, validation_loader) 


[0.14333816]


#  .csv submission file generation

In [11]:
# Predict on the test inputs.
# Evaluation mode is set explicitly so that dropout is disabled regardless of which
# cells were run before this one; no_grad() avoids tracking gradients for inference.
model.eval()
with torch.no_grad():
    predictions = model(test).numpy()

# One prediction per test row, written to the submission file without the index column.
df = pd.DataFrame(predictions, columns=["sat1_col"])
print(df)
print('***')
df.to_csv("sat1_col.csv", index = False)


      sat1_col
0     0.747528
1     1.285604
2     1.298330
3     1.413915
4     2.439812
...        ...
2995  0.747444
2996  1.139007
2997  1.251950
2998  1.160040
2999  1.999687

[3000 rows x 1 columns]
***


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=7069e5d5-d80e-47ef-99c8-07f7aae62cb7' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>