# Welcome to PyTorch!
With almost everyone using Keras to get good scores in the competition, I took on the challenge of using the beloved PyTorch! <br>
My previous attempts were good, but they took **4 hours to run**.
Now, after making changes inspired by the great francescopochetti, my execution time has **come down to 10 minutes** (WOHOOOO!) <br>
Have a look at Francesco's original work: http://francescopochetti.com/pytorch-for-tabular-data-predicting-nyc-taxi-fares/


#### upvote if you find it useful. Sharing is the best way to learn!

Note: I will soon add all other references and will make the submission possible. <br>
**Can someone please comment on why I hit a local minimum in the end?**

Ideas to improve:
* Add an early stopping callback
* Add a learning rate scheduler
* Make the network deeper

## Simple imports

In [None]:
%matplotlib inline
import pathlib
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import seaborn as sns
pd.set_option('display.max_columns', 500)
from collections import defaultdict


from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error


## PyTorch imports

In [None]:
from torch.nn import init
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils import data
from torch.optim import lr_scheduler

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

from tqdm import tqdm # , # tqdm_notebook, # tnrange
from tqdm.notebook import trange as tnrange # will change this to trange later 
from tqdm.notebook import tqdm as tqdm_notebook # will change this to tqdm later
tqdm.pandas(desc='Progress')

In [None]:
import gc
gc.collect()

In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

## Import dataset

In [None]:
df = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
df.head(2)

### Setting investment id as categorical feature (just trying out!)

In [None]:
n_features = 300
features = [f'f_{i}' for i in range(n_features)]

In [None]:
# setting as category feature
df['investment_id'] = df['investment_id'].astype('category')

## Defining some helper functions to make life easy later

In [None]:
def split_features(df):
    """Split the columns of ``df`` into categorical and numerical names.

    The categorical columns are label-encoded in place: each one is cast to
    an ordered pandas category and replaced by its integer category code
    shifted up by one.

    Args:
        df: DataFrame containing an ``investment_id`` column. It is modified
            in place.

    Returns:
        A ``(catf, numf)`` tuple: the list of categorical column names and
        the list of all remaining column names, in ``df.columns`` order.
    """
    catf = ['investment_id']
    categorical = set(catf)
    numf = [name for name in df.columns if name not in categorical]

    for name in catf:
        ordered = df[name].astype('category').cat.as_ordered()
        df[name] = ordered.cat.codes + 1

    return catf, numf

In [None]:
def emb_init(x):
    """Re-initialise the weights of ``x`` uniformly in ``[-bound, bound]``, in place.

    ``bound`` is ``2 / (width + 1)``, where ``width`` is the size of
    dimension 1 of ``x.weight``.

    Args:
        x: a module exposing a 2-D ``weight`` tensor (used here with
            ``nn.Embedding``).
    """
    weights = x.weight.data
    bound = 2 / (weights.size(1) + 1)
    weights.uniform_(-bound, bound)

In [None]:
df = df.loc[df['time_id']>400] # filter out old data

## make use of helper functions!

In [None]:
y = df['target']
df = df.drop(columns = ['target'], axis = 1)

In [None]:
catf, numf = split_features(df)

print(len(catf))
print(catf)

print(len(numf))
# numf

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.20, random_state=1)
print(X_train.shape, X_test.shape)

In [None]:
# Smallest and largest training target, each scaled by 1.2.
y_range = tuple(bound * 1.2 for bound in (y_train.min(), y_train.max()))
print(y_range)

# (column name, largest encoded value + 1) for every categorical column.
cat_sz = [(name, df[name].max() + 1) for name in catf]
print(cat_sz)

# (cardinality, embedding width): the width is half the cardinality, capped at 50.
emb_szs = [(n_codes, min(50, (n_codes + 1) // 2)) for _, n_codes in cat_sz]
print(emb_szs)

## Define the Dataset by subclassing `data.Dataset`

In [None]:
class RegressionColumnarDataset(data.Dataset):
    """In-memory dataset yielding ``[categorical, continuous, target]`` rows.

    Args:
        df: DataFrame holding both the categorical and the continuous columns.
        cats: list of the categorical column names in ``df``; every other
            column is treated as continuous.
        y: pandas object with a ``.values`` array of targets.
    """

    def __init__(self, df, cats, y):
        self.dfcats = df[cats]
        self.dfconts = df.drop(cats, axis=1)

        # Categorical values become int64, continuous values and targets float32.
        self.cats = self._stack_columns(self.dfcats, np.int64)
        self.conts = self._stack_columns(self.dfconts, np.float32)
        self.y = y.values.astype(np.float32)

    @staticmethod
    def _stack_columns(frame, dtype):
        """Stack the columns of ``frame`` side by side into one ``dtype`` array."""
        columns = [series.values for _, series in frame.items()]
        return np.stack(columns, axis=1).astype(dtype)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return [self.cats[idx], self.conts[idx], self.y[idx]]

In [None]:
trainds = RegressionColumnarDataset(X_train, catf, y_train)
valds = RegressionColumnarDataset(X_test, catf, y_test)

In [None]:
del X_train, X_test, y_train, y_test

In [None]:
# Shuffle the training batches only: reshuffling every epoch keeps consecutive
# mini-batches from always containing the same rows, whereas the validation
# metrics do not depend on batch order.
traindl = data.DataLoader(trainds, batch_size=1024, shuffle=True, num_workers=2, pin_memory=True)
valdl = data.DataLoader(valds, batch_size=2048, shuffle=False, num_workers=2, pin_memory=True)

In [None]:
n_cont = len(df.columns)-len(catf)
n_cont

In [None]:
del df,trainds, valds

# Training!

## The Neural Network!

This may look complex (it does to me!), but it is actually quite simple. Have a read and check it out. <br>
The model uses an Embedding layer for the categorical variable (`investment_id`) and simple dense layers for everything else.

In [None]:
class MixedInputModel(nn.Module):
    """Feed-forward regressor over embedded categorical and continuous inputs.

    Every categorical column is looked up in its own ``nn.Embedding``. The
    embeddings are concatenated with the batch-normalised continuous
    features and passed through a stack of
    ``Linear -> SiLU -> [BatchNorm] -> Dropout`` layers followed by a linear
    output layer.

    Args:
        emb_szs: iterable of ``(cardinality, embedding_dim)`` pairs, one per
            categorical column. Every cardinality must be at least 2.
        n_cont: number of continuous input features.
        emb_drop: dropout probability applied to the concatenated embeddings.
        out_sz: width of the output layer.
        szs: list of hidden layer widths.
        drops: dropout probabilities for the hidden layers. The hidden stack
            is built by zipping the layers with ``drops``, so it should hold
            one entry per element of ``szs``.
        y_range: stored on the instance as ``self.y_range``; it is not
            currently applied to the output.
        use_bn: whether to apply batch normalisation after each hidden
            activation.
    """

    def __init__(self, emb_szs, n_cont, emb_drop, out_sz, szs, drops, y_range, use_bn=True):
        super().__init__()

        for i, (c, s) in enumerate(emb_szs):
            assert c > 1, f"cardinality must be >=2, got emb_szs[{i}]: ({c},{s})"

        self.embs = nn.ModuleList([nn.Embedding(c, s) for c, s in emb_szs])

        for emb in self.embs:
            emb_init(emb)
        n_emb = sum(e.embedding_dim for e in self.embs)
        self.n_emb, self.n_cont = n_emb, n_cont

        # The first hidden layer consumes the concatenated embeddings and
        # continuous features.
        szs = [n_emb + n_cont] + szs
        self.lins = nn.ModuleList([nn.Linear(szs[i], szs[i+1]) for i in range(len(szs)-1)])
        self.bns = nn.ModuleList([nn.BatchNorm1d(sz) for sz in szs[1:]])

        # Kaiming-normal initialisation for the hidden and output weights.
        for o in self.lins:
            nn.init.kaiming_normal_(o.weight.data)

        self.outp = nn.Linear(szs[-1], out_sz)
        nn.init.kaiming_normal_(self.outp.weight.data)

        # Dropout layers.
        self.emb_drop = nn.Dropout(emb_drop)
        self.drops = nn.ModuleList([nn.Dropout(drop) for drop in drops])

        # Batch normalisation for the raw continuous inputs.
        self.bn = nn.BatchNorm1d(n_cont)
        self.use_bn, self.y_range = use_bn, y_range

    def forward(self, x_cat, x_cont):
        """Compute predictions for one batch.

        Args:
            x_cat: integer tensor whose column ``i`` indexes the ``i``-th
                embedding.
            x_cont: tensor of continuous features, fed to
                ``BatchNorm1d(n_cont)``.

        Returns:
            The output-layer activations with a trailing dimension of size 1
            removed, i.e. one value per row when ``out_sz == 1``.
        """
        if self.n_emb != 0:
            embedded = [emb(x_cat[:, i]) for i, emb in enumerate(self.embs)]
            x = self.emb_drop(torch.cat(embedded, 1))

        if self.n_cont != 0:
            x_cont = self.bn(x_cont)
            x = torch.cat([x, x_cont], 1) if self.n_emb != 0 else x_cont

        for lin, drop, bn in zip(self.lins, self.drops, self.bns):
            x = F.silu(lin(x))
            if self.use_bn:
                x = bn(x)
            x = drop(x)

        # Squeeze only the output dimension. A bare squeeze() would also drop
        # a batch dimension of size 1 and return a 0-d tensor that no longer
        # lines up with the (1,)-shaped target.
        return self.outp(x).squeeze(-1)

In [None]:
# Build the network on the selected device.
m = MixedInputModel(
    emb_szs=emb_szs,
    n_cont=n_cont,
    emb_drop=0.04,
    out_sz=1,
    szs=[1000, 500, 250],
    drops=[0.001, 0.01, 0.01],
    y_range=y_range,
).to(device)

opt = optim.Adam(m.parameters(), lr=5e-2)
num_epochs = 10

# Histories keyed by epoch index: learning rate, training loss and validation
# loss, one entry per batch.
lr = defaultdict(list)
tloss = defaultdict(list)
vloss = defaultdict(list)

### check if the model looks good

In [None]:
m

### overfit on one batch

In [None]:
device

In [None]:
for cat, cont, y in traindl:
    # Tensor.to() returns a new tensor instead of moving the tensor in place,
    # so the result has to be captured to see the transfer. Moved copies are
    # used here; `cat`, `cont` and `y` themselves stay on the CPU because the
    # overfit-on-one-batch cell feeds them to a model that is never moved off
    # the CPU.
    cat_dev, cont_dev, y_dev = cat.to(device), cont.to(device), y.to(device)
    print(cat_dev.device, cont_dev.device, y_dev.device)

    break

In [None]:
cat.device

In [None]:
# compile the neural net
network = MixedInputModel(emb_szs=emb_szs, 
                    n_cont=n_cont, 
                    emb_drop=0.04, 
                    out_sz=1, 
                    szs=[1000,500,250], 
                    drops=[0.001,0.01,0.01], 
                    y_range=y_range)

optimizer = optim.Adam(network.parameters(), lr=1e-1)

total_loss = []

for i in range(100):
    
    # loss
    loss = F.mse_loss(network(cat, cont), y)
    total_loss.append(loss)
    if (i%10 == 0):
        print("Step", i," loss:", loss.item())

    optimizer.zero_grad()
    # backprop
    loss.backward()  # update gradients
    optimizer.step() # update weights using gradients to minimize loss

In [None]:
plt.plot(total_loss)

In [None]:
1+1

## Fitting loop

In [None]:
def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between two equally long arrays."""
    diff = np.asarray(y_true, dtype=np.float64) - np.asarray(y_pred, dtype=np.float64)
    return float(np.sqrt(np.mean(diff ** 2)))


def fit(model, train_dl, val_dl, loss_fn, opt, epochs=3):
    """Train ``model`` for ``epochs`` epochs, optionally validating after each.

    Per-batch learning rates and losses are appended to the module-level
    ``lr``, ``tloss`` and ``vloss`` dictionaries (keyed by epoch index), which
    must exist before this function is called.

    Args:
        model: module called as ``model(cat, cont)``.
        train_dl: iterable of ``(cat, cont, y)`` training batches that
            supports ``len()``.
        val_dl: iterable of ``(cat, cont, y)`` validation batches, or a falsy
            value to skip validation.
        loss_fn: callable ``loss_fn(pred, y)`` returning a scalar tensor.
        opt: optimizer bound to the parameters of ``model``.
        epochs: number of passes over ``train_dl``.

    Returns:
        The ``(lr, tloss, vloss)`` history dictionaries.
    """
    # Run on whatever device the model lives on rather than hard-coding CUDA,
    # so the loop also works on CPU-only machines.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    num_batch = len(train_dl)
    for epoch in tnrange(epochs):

        model.train()
        y_true_train = []
        y_pred_train = []
        total_loss_train = 0

        t = tqdm_notebook(iter(train_dl), leave=False, total=num_batch)
        t.set_description(f'Epoch {epoch}')

        for cat, cont, y in t:
            cat = cat.to(run_device)
            cont = cont.to(run_device)
            y = y.to(run_device)

            opt.zero_grad()
            pred = model(cat, cont)
            loss = loss_fn(pred, y)
            loss.backward()
            batch_loss = loss.item()
            lr[epoch].append(opt.param_groups[0]['lr'])
            tloss[epoch].append(batch_loss)

            opt.step()

            t.set_postfix(loss=batch_loss)

            # reshape(-1) keeps 0-d outputs (single-row batches) concatenable.
            y_true_train.append(y.detach().cpu().numpy().reshape(-1))
            y_pred_train.append(pred.detach().cpu().numpy().reshape(-1))
            total_loss_train += batch_loss

        train_acc = _rmse(np.concatenate(y_true_train), np.concatenate(y_pred_train))
        train_loss = total_loss_train/len(train_dl)

        if val_dl:
            model.eval()
            y_true_val = []
            y_pred_val = []
            total_loss_val = 0
            # No gradients are needed for validation; skipping them avoids
            # building autograd graphs for every validation batch.
            with torch.no_grad():
                for cat, cont, y in tqdm_notebook(val_dl, leave=False):
                    cat = cat.to(run_device)
                    cont = cont.to(run_device)
                    y = y.to(run_device)
                    pred = model(cat, cont)
                    loss = loss_fn(pred, y)

                    y_true_val.append(y.cpu().numpy().reshape(-1))
                    y_pred_val.append(pred.cpu().numpy().reshape(-1))
                    total_loss_val += loss.item()
                    vloss[epoch].append(loss.item())

            valacc = _rmse(np.concatenate(y_true_val), np.concatenate(y_pred_val))
            # Average over the loader that was passed in, not a global one.
            valloss = total_loss_val/len(val_dl)
            print(f'Epoch {epoch}: train_loss: {train_loss:.4f} train_rmse: {train_acc:.4f} | val_loss: {valloss:.4f} val_rmse: {valacc:.4f}')
        else:
            print(f'Epoch {epoch}: train_loss: {train_loss:.4f} train_rmse: {train_acc:.4f}')

    return lr, tloss, vloss

## Loop through training

In [None]:
lr, tloss, vloss = fit(model=m, train_dl=traindl, val_dl=valdl, loss_fn=F.mse_loss, opt=opt, epochs=num_epochs)

## Plot the results

In [None]:
# Average the per-batch losses of every epoch, then plot both curves.
t = [np.mean(batch_losses) for batch_losses in tloss.values()]
v = [np.mean(batch_losses) for batch_losses in vloss.values()]
p = pd.DataFrame({'Train Loss': t, 'Validation Loss': v, 'Epochs': range(num_epochs)})

_ = p.plot(
    x='Epochs',
    y=['Train Loss', 'Validation Loss'],
    title='Train and Validation Loss over Epochs',
)

# Can someone please comment on why the loss has hit a minimum?

# Upvote if useful!