In [115]:
# !git clone https://github.com/OmdenaAI/milan-chapter-agrifoods.git

In [116]:
# %cd /content/milan-chapter-agrifoods/src/tasks/task_4 Modeling

In [117]:
# %cd Image\ modelling

In [118]:
import numpy as np
from pathlib import Path

# Location of the pre-computed histogram archive, relative to the working directory.
path_to_histogram = './data/histogram/histogram_all_full.npz'

# Pull every array out while the archive is still open; the names on the left
# line up one-to-one with the keys listed below.
with np.load(path_to_histogram) as hist:
    images, locations, yields, years, indices = (
        hist[key]
        for key in (
            "output_image",
            "output_locations",
            "output_yield",
            "output_year",
            "output_index",
        )
    )

In [130]:

def _normalize(train_images, val_images):
        """
        Find the mean values of the bands in the train images. Use these values
        to normalize both the training and validation images.

        """
        mean = np.mean(train_images, axis=(0, 2, 3))

        train_images = (train_images.transpose(0, 2, 3, 1) - mean).transpose(0, 3, 1, 2)
        val_images = (val_images.transpose(0, 2, 3, 1) - mean).transpose(0, 3, 1, 2)

        return train_images, val_images


In [131]:
# Year held out for testing; every earlier year is used for training.
pred_years = 2019 #range(2019, 2020)

# Derive both masks from `pred_years` so changing the held-out year in one
# place keeps the train/test split consistent.
train_idx = np.nonzero(years < pred_years)[0]
test_idx = np.nonzero(years == pred_years)[0]

# Normalisation statistics come from the training images only.
train_images, test_images = _normalize(images[train_idx], images[test_idx])


In [132]:
from collections import defaultdict, namedtuple


# Report how many samples landed on each side of the year split.
n_train, n_test = train_idx.shape[0], test_idx.shape[0]
print(f"Train set size: {n_train}, Test set size: {n_test}")

# Lightweight record bundling the per-sample arrays of one split.
Data = namedtuple("Data", "images yields locations indices years")


Train set size: 486, Test set size: 73


In [133]:
import torch
# Number of leading entries kept along axis 2 of the image arrays.
time = 32
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def _as_data(split_images, split_idx):
    """Bundle one split into a `Data` record of tensors.

    Images and yields are placed on `device` as float tensors (yields get a
    trailing dimension of size 1); locations, indices and years are converted
    without an explicit device.
    """
    image_tensor = torch.as_tensor(
        split_images[:, :, :time, :], device=device
    ).float()
    yield_tensor = (
        torch.as_tensor(yields[split_idx], device=device).float().unsqueeze(1)
    )
    return Data(
        images=image_tensor,
        yields=yield_tensor,
        locations=torch.as_tensor(locations[split_idx]),
        indices=torch.as_tensor(indices[split_idx]),
        years=torch.as_tensor(years[split_idx]),
    )


train_data = _as_data(train_images, train_idx)
test_data = _as_data(test_images, test_idx)


In [134]:
from torch.utils.data import TensorDataset, DataLoader, random_split

# NOTE: `train_images` is rebound here from the normalised NumPy array to the
# tensor stored in `train_data`.
train_images =train_data.images
train_yields =train_data.yields
train_steps = 10000
batch_size = 4
starter_learning_rate = 0.001
# Fixed seed so the held-out validation subset is identical on every run;
# otherwise checkpoint selection is not comparable between runs.
split_seed = 0

total_size = train_images.shape[0]
# "Learning rates and stopping criteria are tuned on a held-out
# validation set (10%)."
val_size = total_size // 10
train_size = total_size - val_size
print(
    f"After split, training on {train_size} examples, "
    f"validating on {val_size} examples"
)
train_dataset, val_dataset = random_split(
    TensorDataset(train_images, train_yields),
    (train_size, val_size),
    generator=torch.Generator().manual_seed(split_seed),
)

train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# The test set additionally carries locations, indices and years per sample.
test_dataset = TensorDataset(
    test_data.images, test_data.yields, test_data.locations, test_data.indices, test_data.years
)

test_dataloader = DataLoader(test_dataset, batch_size=1)

# NOTE(review): this uses the pre-split sample count (`total_size`), not
# `train_size`, and the training cell further down passes its own
# `num_epochs` value — confirm which one is intended.
num_epochs = int(train_steps / (train_images.shape[0] / batch_size))
print(f"Training for {num_epochs} epochs")

train_scores = defaultdict(list)
val_scores = defaultdict(list)

step_number = 0
min_loss = np.inf
# best_state = self.model.state_dict()

After split, training on 438 examples, validating on 48 examples
Training for 82 epochs


In [135]:
import torchvision
from torch import nn
class Model_(nn.Module):
    """ResNet-18 feature extractor followed by a small regression head.

    The torchvision ResNet-18 is loaded with ``pretrained=True``, its first
    convolution is replaced by a new 9-input-channel convolution, and its final
    fully connected layer is dropped. The pooled features are flattened and
    passed through ``Linear -> ReLU -> Linear`` to produce one value per sample.
    """

    def __init__(self):
        super().__init__()

        #===================================>
        self.resnet = torchvision.models.resnet18(pretrained=True)
        # New stem so the network accepts 9-channel input; this layer starts
        # from its default initialisation, not from the pretrained weights.
        self.resnet.conv1 = nn.Conv2d(9, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        # Remember the feature width before the classifier layer is removed.
        self.fc_in_features = self.resnet.fc.in_features
        # Keep every child module except the last one (the original `fc`).
        self.resnet = torch.nn.Sequential(*(list(self.resnet.children())[:-1]))
        #===================================>
        self.fc = nn.Sequential(
            nn.Linear(self.fc_in_features, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, 1),
        )

        # Not used by `forward`; kept so the attribute remains available.
        self.sigmoid = nn.Sigmoid()

        # `init_weights` only touches nn.Linear modules (see below).
        self.resnet.apply(self.init_weights)
        self.fc.apply(self.init_weights)

    def init_weights(self, m):
        """Xavier-uniform weights and a 0.01 bias for every ``nn.Linear``.

        Any other module type is left untouched.
        """
        if isinstance(m, nn.Linear):
            # In-place initialiser; the un-suffixed `xavier_uniform` is deprecated.
            nn.init.xavier_uniform_(m.weight)
            nn.init.constant_(m.bias, 0.01)

    def forward_once(self, x):
        """Run the backbone and flatten everything after the batch dimension."""
        output = self.resnet(x)
        return torch.flatten(output, 1)

    def forward(self, input1):
        """Return the regression output of shape ``(batch, 1)`` for `input1`."""
        features = self.forward_once(input1)
        return self.fc(features)

In [136]:
# Sanity check: print the yield labels of the first five test batches.
# Loop names are chosen so the builtin `id` is not rebound at notebook scope.
for batch_idx, batch in enumerate(test_dataloader):
    # Element 1 of each test batch is the yield tensor.
    labels = batch[1]
    print(labels)

    if batch_idx > 3:
        break

tensor([[58.7970]])
tensor([[43.8031]])
tensor([[26.]])
tensor([[47.1855]])
tensor([[34.7500]])


In [137]:
import torch.nn.functional as F


def l1_l2_loss(pred, true, l1_weight, scores_dict):
    """
    MSE (L2) loss with an optional weighted L1 term.

    Parameters
    ----------
    pred: tensor of predictions
    true: tensor of targets, compared element-wise against `pred`
    l1_weight: factor applied to the L1 term; the term is skipped entirely
        when the value is not greater than 0
    scores_dict: mapping whose "l2", "l1" and "loss" entries support
        `.append` (e.g. a `defaultdict(list)`); the unweighted component
        values and the combined loss are appended on every call

    Returns
    ----------
    loss: `mse + l1_weight * l1` (just `mse` when the L1 term is skipped)
    scores_dict: the same mapping that was passed in, updated in place
    """
    l2 = F.mse_loss(pred, true)
    scores_dict["l2"].append(l2.item())

    loss = l2
    if l1_weight > 0:
        l1 = F.l1_loss(pred, true)
        # Scale the L1 term by its weight and build a new tensor rather than
        # modifying the MSE tensor in place.
        loss = l2 + l1_weight * l1
        scores_dict["l1"].append(l1.item())
    scores_dict["loss"].append(loss.item())

    return loss, scores_dict

In [127]:
!pip install tqdm


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [128]:
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR
from torch import optim
import pandas as pd
from tqdm import tqdm

# Module-level flag read by `validation`, `teste` and `visualize_predict_target`
# to decide whether batches are moved to CUDA before the forward pass.
use_gpu =torch.cuda.is_available()
# Mean-squared-error criterion shared by `validation` and `teste`.
criterion = nn.MSELoss()

def _mean_batch_loss(Model, batches):
    """Average the module-level `criterion` over every batch in `batches`.

    The model is switched to eval mode (and left there) and the forward passes
    run under `torch.no_grad()` so no autograd graph is built. Elements 0 and 1
    of each batch are taken as inputs and labels; any further elements are
    ignored. The result is the mean of the per-batch losses.

    Raises ZeroDivisionError if `batches` yields nothing.
    """
    Model.eval()
    running_loss = 0.0
    counter = 0
    with torch.no_grad():
        for data in batches:
            counter += 1
            input1, labels = data[0], data[1]

            if use_gpu:
                input1 = input1.cuda()
                labels = labels.cuda()

            outputs = Model(input1)
            running_loss += criterion(outputs, labels).item()

    return running_loss / counter


def validation(Model,val_dataloader):
    """Return the mean per-batch MSE of `Model` on `val_dataloader`.

    Shows a tqdm progress bar while iterating. The model is left in eval mode.
    """
    return _mean_batch_loss(Model, tqdm(val_dataloader))


def teste(Model,test_dataloader):
    """Return the mean per-batch MSE of `Model` on `test_dataloader`.

    Same computation as `validation`, without a progress bar. The model is
    left in eval mode.
    """
    return _mean_batch_loss(Model, test_dataloader)
def training_step(MODEL,
                  train_dataloader,
                  val_dataloader,
                  test_dataloader,
                  num_epochs=10,
                  model_name='reg_resnet',
                  root_save='./content/'):
    """
    Train a freshly constructed model with Adam and MSE loss, keeping the
    weights that achieve the lowest validation loss.

    Parameters
    ----------
    MODEL: callable returning an `nn.Module` when called with no arguments
    train_dataloader: yields `(inputs, labels)` batches used for optimisation
    val_dataloader: passed to `validation` for model selection
    test_dataloader: passed to `teste`; used for monitoring only
    num_epochs: number of passes over `train_dataloader`
    model_name: stem of every file written by this function
    root_save: string prefixed to every output path; its directory part is
        created if it does not exist

    Every 10th epoch (0, 10, 20, ...) the validation and test losses are
    computed, the loss histories are written to
    `<root_save><model_name>_{train,val,test}_loss.csv`, and the weights are
    saved to `<root_save><model_name>_.pt` when the validation loss improved.
    After the last epoch one more validation/checkpoint pass is made unless
    that epoch was already validated.

    Returns
    ----------
    dict with keys 'test_loss', 'val_loss', 'train_loss' (lists of floats) and
    'model' (the trained module, loaded with the best-validation weights when
    at least one checkpoint was taken).
    """
    import copy
    import os

    criterion = nn.MSELoss()
    model_s = MODEL()
    use_gpu = torch.cuda.is_available()
    if use_gpu:
        model_s = model_s.cuda()

    optimizer = optim.Adam(model_s.parameters(), lr=1e-3, weight_decay=0.0005)
    # Multiply the learning rate by 0.7 every 20 epochs.
    scheduler = StepLR(optimizer, step_size=20, gamma=0.7)

    prefixe = ''
    out_prefix = root_save + prefixe + model_name
    # `root_save` is used as a plain string prefix, so only its directory part
    # (if any) needs to exist before files are written.
    out_dir = os.path.dirname(out_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    checkpoint_path = out_prefix + '_.pt'

    train_loss = []
    val_loss = []
    test_loss = []
    oldLoss = np.inf
    best_state = None
    last_validated_epoch = None

    def _validate_and_checkpoint():
        """Validate the current weights; snapshot and save them if they are the best so far."""
        nonlocal oldLoss, best_state
        current = validation(model_s, val_dataloader)
        if oldLoss > current:
            oldLoss = current
            # Deep copy: `state_dict()` returns references to the live
            # parameters, which keep changing as training continues.
            best_state = copy.deepcopy(model_s.state_dict())
            torch.save(best_state, checkpoint_path)
        return current

    for epoch in range(num_epochs):
        # `validation`/`teste` leave the model in eval mode, so training mode
        # must be restored at the start of every epoch.
        model_s.train()
        running_loss = 0.0
        counter = 0
        for data in tqdm(train_dataloader):
            input1, labels = data
            counter += 1
            if use_gpu:
                input1 = input1.cuda()
                labels = labels.cuda()

            optimizer.zero_grad()
            outputs = model_s(input1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        epoch_loss = running_loss / counter
        train_loss.append(epoch_loss)
        scheduler.step()

        if epoch % 10 == 0:
            loss_val = _validate_and_checkpoint()
            last_validated_epoch = epoch
            val_loss.append(loss_val)

            pd.DataFrame(data=train_loss).to_csv(out_prefix + '_train_loss.csv')
            pd.DataFrame(data=val_loss).to_csv(out_prefix + '_val_loss.csv')

            tloss = teste(model_s, test_dataloader)
            test_loss.append(tloss)
            pd.DataFrame(data=test_loss).to_csv(out_prefix + '_test_loss.csv')

            print('Epoch {}/{}'.format(epoch, num_epochs - 1))
            print('-' * 10)
            print('epoch: {} Loss: {:.4f}  val_loss: {} test_loss: {}'.format(
                epoch, epoch_loss, loss_val, tloss))

    # Give the final weights a chance to become the checkpoint; skipped when
    # the last epoch was already validated inside the loop.
    if num_epochs > 0 and last_validated_epoch != num_epochs - 1:
        _validate_and_checkpoint()

    # Return the best-validation weights rather than whatever the last epoch
    # left behind.
    if best_state is not None:
        model_s.load_state_dict(best_state)

    out = {}
    out['test_loss'] = test_loss
    out['val_loss'] = val_loss
    out['train_loss'] = train_loss
    out['model'] = model_s
    return out

In [None]:
# Train the ResNet-based regressor. `training_step` receives the class itself
# (not an instance) and constructs the model internally.
# NOTE(review): the `num_epochs` value computed in the data-loader cell is not
# used here; 100 is passed explicitly — confirm which one is intended.
MODEL = Model_
result=training_step(MODEL,
                  train_dataloader,
                  val_dataloader,
                  test_dataloader,
                  num_epochs=100,
                  model_name='reg_resnet',
                  root_save='./')

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_percentage_error
def plots(predict, target, title=None, root='./'):
    """
    Draw predicted and target values as two lines on one figure and save it.

    Both sequences are plotted against their sample position. The image is
    written to ``f"{root}{title}.png"``; the figure is left open.
    """
    fig, ax = plt.subplots(figsize=(10, 7))

    # (values, line colour, legend label) for each curve, drawn in this order.
    curves = (
        (predict, 'orange', 'Predict'),
        (target, 'red', 'Target'),
    )
    for values, colour, legend_label in curves:
        ax.plot(values, color=colour, linestyle='-', label=legend_label)

    ax.set_xlabel('No Sample')
    ax.set_ylabel('weight')
    ax.set_title(title)
    ax.legend()
    fig.savefig(f"{root}{title}.png")
def visualize_predict_target(model,data,title=None,root='./'):
    """
    Run `model` over `data`, plot predictions against targets and score them.

    Parameters
    ----------
    model: module called on element 0 of every batch
    data: iterable of batches; element 1 of every batch holds the targets
    title, root: forwarded to `plots`, which saves `f"{root}{title}.png"`

    Every sample of every batch is collected, so loaders with a batch size
    above 1 are handled as well. Inference runs in eval mode under
    `torch.no_grad()`.

    Returns
    ----------
    predict: list of predicted values
    target: list of target values
    score: dict with 'mse', 'rmse', 'r2score' and 'mape'; also printed
    """
    predict = []
    target = []
    model.eval()
    with torch.no_grad():
        # A distinct loop name keeps the `data` argument intact.
        for batch in data:
            input1, labels = batch[0], batch[1]

            if use_gpu:
                input1 = input1.cuda()
                labels = labels.cuda()

            outputs = model(input1)
            # Flatten so each sample in the batch contributes one value.
            predict.extend(outputs.reshape(-1).tolist())
            target.extend(labels.reshape(-1).tolist())

    plots(predict, target, title, root)

    score = {}
    score['mse'] = mean_squared_error(target, predict)
    # Derived from the MSE directly so it does not rely on the `squared`
    # keyword, which newer scikit-learn releases no longer accept.
    score['rmse'] = np.sqrt(score['mse'])
    score['r2score'] = r2_score(target, predict)
    score['mape'] = mean_absolute_percentage_error(target, predict)
    print(score)
    return predict, target, score

In [None]:
# Evaluate the model returned by `training_step` on the test loader; this also
# saves the prediction-vs-target plot to ./test.png and prints the metric dict.
predict, target,metrics= visualize_predict_target(result['model'],test_dataloader,title='test',root='./')