In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from pathlib import Path

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.inspection import plot_partial_dependence
from sklearn.tree import export_graphviz

In [2]:
def _sigmoid_range(x, low, high):
    return torch.sigmoid(x) * (high - low) + low

In [3]:
class myModule(torch.nn.Module):
    """Dot-product collaborative-filtering model with per-user and per-movie biases.

    The prediction for a (user, movie) pair is the dot product of their factor
    vectors plus both bias terms, squashed into ``y_range`` by ``_sigmoid_range``.
    """

    @staticmethod
    def _create_params(*size):
        """Return a trainable parameter of shape ``size`` initialised from N(mean=0, std=0.01)."""
        return torch.nn.Parameter(torch.zeros(size).normal_(0, 0.01))

    def __init__(self, n_users, n_movies, n_factors, y_range=(0, 5.5)):
        """Create the factor and bias tables.

        Args:
            n_users: number of rows in the user tables.
            n_movies: number of rows in the movie tables.
            n_factors: length of each latent factor vector.
            y_range: ``(low, high)`` bounds of the prediction; defaults to the
                previously hard-coded ``(0, 5.5)``.
        """
        super().__init__()
        self.user_factors = self._create_params(n_users, n_factors)
        self.user_bias = self._create_params(n_users, 1)
        self.movie_factors = self._create_params(n_movies, n_factors)
        self.movie_bias = self._create_params(n_movies, 1)
        self.y_range = y_range

    def forward(self, x):
        """Predict a score for each (user, movie) index pair.

        Args:
            x: integer index matrix; column 0 holds user indices and column 1
                holds movie indices.

        Returns:
            Tensor with one row per row of ``x`` and a single column
            (``keepdim=True`` keeps the reduced dimension), bounded by ``y_range``.
        """
        user_idx, movie_idx = x[:, 0], x[:, 1]
        users = self.user_factors[user_idx]
        movies = self.movie_factors[movie_idx]
        pred = (users * movies).sum(dim=1, keepdim=True)
        # The movie bias must be read from the module: the original referenced a
        # bare `movie_bias`, which raised NameError on every forward pass.
        pred = pred + (self.user_bias[user_idx] + self.movie_bias[movie_idx])
        return _sigmoid_range(pred, *self.y_range)
        

In [4]:
class CollabNN(nn.Module):
    """Collaborative-filtering network: user and item embeddings feed a small MLP.

    Args:
        user_size: positional arguments for the user ``Embedding``; element 1 is
            used as the user embedding width.
        item_size: positional arguments for the item ``Embedding``; element 1 is
            used as the item embedding width.
        n_act: number of units in the hidden linear layer.
        y_range: ``(low, high)`` bounds unpacked into ``_sigmoid_range``.
    """

    def __init__(self, user_size, item_size, n_act=100, y_range=(0,5.5)):
        super().__init__()
        self.user_factors = nn.Embedding(*user_size)
        self.item_factors = nn.Embedding(*item_size)

        # The MLP input is the two embeddings laid side by side.
        concat_width = user_size[1] + item_size[1]
        hidden = nn.Linear(concat_width, n_act)
        head = nn.Linear(n_act, 1)
        self.layers = nn.Sequential(hidden, nn.ReLU(), head)

        self.y_range = y_range

    def forward(self, x):
        """Score each row of ``x`` (column 0 = user index, column 1 = item index)."""
        user_vecs = self.user_factors(x[:, 0])
        item_vecs = self.item_factors(x[:, 1])
        joined = torch.cat((user_vecs, item_vecs), dim=1)
        scores = self.layers(joined)
        return _sigmoid_range(scores, *self.y_range)

In [5]:
# Notebook-wide settings: the random seed and the folder (under the current
# working directory) that holds the input data.
RAND_STATE = 0
DATA_PATH = Path.cwd().joinpath('Data')

In [6]:
# Load the complete dataset from the data folder into one DataFrame.
full_df = pd.read_csv(DATA_PATH.joinpath('Final_data.csv'), low_memory=False)

#### Splitting

In [7]:
# Bin AGE into AGE_GROUP_AMOUNT integer-labelled groups so the train/test split
# can be stratified on age.
AGE_GROUP_AMOUNT = 8
full_df['AGE_GROUP'] = pd.cut(
    full_df['AGE'],
    bins=AGE_GROUP_AMOUNT,
    labels=range(AGE_GROUP_AMOUNT),
)

# Single stratified shuffle split (10% test) that keeps the age-group
# proportions the same in both parts; n_splits=1 yields exactly one index pair.
split = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.1,
    random_state=RAND_STATE,
)
train_index, test_index = next(split.split(full_df, full_df['AGE_GROUP']))
strat_train_set = full_df.iloc[train_index]
strat_test_set = full_df.iloc[test_index]

In [8]:
#TODO: implement StandardScaler in PyTorch (take a look at nn.Flatten), and maybe the whole preprocessing pipeline in PyTorch

In [9]:
# Columns left out of the model input, and columns treated as categorical.
drop_cols = ['SEQN', 'AGE_GROUP', 'AGE', 'AlkPhos_UL']
cat_cols = ["GENDER"]

# Every remaining column is standardised.
excluded_cols = drop_cols + cat_cols
scaled_cols = [col for col in strat_train_set.columns if col not in excluded_cols]

# Categorical columns are ordinal-encoded, the rest are scaled; columns named in
# neither transformer (the drop_cols) are discarded by ColumnTransformer's default.
full_transform = ColumnTransformer(transformers=[
    ('encoder', OrdinalEncoder(), cat_cols),
    ('scaler', StandardScaler(), scaled_cols),
])
x = full_transform.fit_transform(strat_train_set)

In [10]:
# ColumnTransformer lays out its result in transformer order (the encoder's
# cat_cols first, then the scaled columns), not in the source frame's column
# order. Label the columns in that same order so the headers always match the
# data, even if a categorical column is not the first kept column in the frame.
pd.DataFrame(
    x,
    columns=cat_cols + [col for col in strat_train_set.columns
                        if col not in drop_cols + cat_cols],
)

Unnamed: 0,GENDER,Albumin_mgl,Glucose_mmolL,Urea_mmolL,Cholesterol_mmolL,Protein_gdL,Sodium_mmolL,Creatinine_mgdl,Hemoglobin_gdl,Bilirubin_umolL,Triglyceride_mmolL,HDL_mmolL,LDL_mmolL,Calcium_mmolL,Potassium_mmolL,Hematocrit_%,MCHC_gdl,MCV_fL,Platelet_TuL,RBC_MuL
0,1.0,-0.095714,-0.121959,0.416988,-1.246429,-0.400086,0.730129,0.440837,1.179858,2.960441,-0.203676,1.178325,-1.886586,-0.168364,0.748710,0.583150,2.122749,1.723772,0.444329,-0.442279
1,1.0,-0.125455,0.770896,0.080677,-0.050799,-2.239679,-0.544053,-0.877601,-0.366375,0.346476,-0.484852,0.116005,0.069395,-1.258543,-0.394229,-0.823202,1.822813,-0.085715,1.423558,-0.773718
2,1.0,-0.125187,-0.301066,-0.596682,-0.173186,-1.013283,1.154856,-0.967204,0.793300,-0.960507,-0.454773,0.743740,-0.348817,-0.985998,-0.108494,0.651200,0.623068,0.887311,0.236613,0.064627
3,0.0,-0.110451,-0.035087,0.251201,1.135417,0.417512,-0.544053,-0.916002,-0.495227,-0.307016,-0.677097,0.116005,1.522873,-0.440909,-1.365726,-0.482956,-0.176761,0.136203,0.429492,-0.578754
4,1.0,-0.127331,1.181663,0.923823,-0.370889,0.621911,-0.119326,-0.019976,0.664447,0.019730,0.282822,-0.391011,-0.348817,-0.440909,-1.251432,0.809982,-0.276740,0.511757,-0.742616,0.415562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20076,1.0,-0.108307,-0.181483,-0.089847,1.182489,-1.217683,-0.119326,-0.147980,0.857726,0.673221,-0.160519,-0.753166,1.774665,0.104181,1.891648,0.855348,0.223153,-0.307634,-0.505227,0.980957
20077,1.0,-0.129474,-0.617453,-0.089847,-0.276745,0.008713,0.730129,-1.415217,0.600021,0.019730,-1.107361,0.429872,-0.093783,2.284538,1.320179,0.651200,-0.076783,0.819029,-0.579411,0.123116
20078,1.0,-0.117685,-0.151989,0.587512,1.333120,-0.195686,0.730129,-0.877601,-0.237522,-0.633761,-0.189291,0.623021,1.327275,0.921815,1.320179,0.061439,-1.176548,-1.519649,-1.187720,1.058943
20079,1.0,-0.041858,-0.807822,-0.089847,0.335192,-0.400086,0.730129,-0.083978,0.020184,0.673221,0.723549,-1.574049,0.824771,1.739449,0.748710,0.129488,-0.276740,0.597110,-0.327185,-0.266811
