In [127]:
import torch
import torch.nn as nn
import torchviz
import sys; sys.path.insert(0, '../')
from exp import nb_d2l_utils, callback
import numpy as np

In [4]:
torch.__version__

'1.3.1'

In [28]:
### Build Model

In [18]:
class FeaturesLinear(torch.nn.Module):
    """First-order (linear) term over label-encoded categorical fields.

    A one-dimensional embedding table stands in for the weight vector of a
    linear model: each (field, value) pair owns one row of ``self.fc``; the
    rows selected by a sample are summed over the fields and ``self.bias`` is
    added.
    """

    def __init__(self, field_dims, output_dim=1):
        """
        Emulate a linear function with a 1-d embedding and compute the start
        offset of every field inside the shared table.

        :param field_dims: sequence holding the number of distinct values of each field
        :param output_dim: number of outputs produced per sample
        """
        super().__init__()
        self.fc = torch.nn.Embedding(sum(field_dims), output_dim)
        self.bias = torch.nn.Parameter(torch.zeros((output_dim,)))
        # np.int64 instead of the ``np.long`` alias: the alias was deprecated
        # in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int64)

    def forward(self, x):
        """
        The input holds the label-encoded index of every field.

        :param x: Long tensor of size ``(batch_size, num_fields)``
        :return: Float tensor of size ``(batch_size, output_dim)``
        """
        # Shift each per-field index into its own slice of the shared table.
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return torch.sum(self.fc(x), dim=1) + self.bias


class FeaturesEmbedding(torch.nn.Module):
    """Dense embedding lookup for label-encoded categorical fields.

    All fields share a single embedding table; ``self.offsets`` stores the
    first row belonging to each field so per-field indices can be mapped to
    table rows.
    """

    def __init__(self, field_dims, embed_dim):
        """
        :param field_dims: sequence holding the number of distinct values of each field
        :param embed_dim: size of every embedding vector
        """
        super().__init__()
        self.embedding = torch.nn.Embedding(sum(field_dims), embed_dim)
        # np.int64 instead of the ``np.long`` alias: the alias was deprecated
        # in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int64)
        torch.nn.init.xavier_uniform_(self.embedding.weight.data)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        :return: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        """
        # Shift each per-field index into its own slice of the shared table.
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return self.embedding(x)


class FactorizationMachine(torch.nn.Module):
    """Second-order interaction term of a factorization machine.

    Computes ``0.5 * ((sum_f x_f) ** 2 - sum_f (x_f ** 2))`` over the field
    axis, optionally summed over the embedding axis as well.
    """

    def __init__(self, reduce_sum=True):
        """
        :param reduce_sum: when true, also sum the result over the embedding
            axis (keeping that axis with size 1)
        """
        super().__init__()
        self.reduce_sum = reduce_sum

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        :return: ``(batch_size, 1)`` when ``reduce_sum`` is set, otherwise
            ``(batch_size, embed_dim)``
        """
        squared_total = x.sum(dim=1).pow(2)
        total_of_squares = x.pow(2).sum(dim=1)
        interaction = squared_total - total_of_squares
        if not self.reduce_sum:
            return 0.5 * interaction
        return 0.5 * interaction.sum(dim=1, keepdim=True)


class MultiLayerPerceptron(torch.nn.Module):
    """Feed-forward stack of Linear -> [BatchNorm1d] -> ReLU -> Dropout blocks."""

    def __init__(self, input_dim, hidden_dims, dropout, batch_norm=False, output_layer=True):
        """
        :param input_dim: number of input features of the first linear layer
        :param hidden_dims: iterable with the width of each hidden layer
        :param dropout: drop probability used after every hidden layer
        :param batch_norm: insert ``BatchNorm1d`` after each hidden linear layer
        :param output_layer: append a final ``Linear(<last width>, 1)`` layer
        """
        super().__init__()
        stack = []
        in_features = input_dim
        for width in hidden_dims:
            block = [torch.nn.Linear(in_features, width)]
            if batch_norm:
                block.append(torch.nn.BatchNorm1d(width))
            block += [torch.nn.ReLU(), torch.nn.Dropout(p=dropout)]
            stack.extend(block)
            # The next block consumes what this one produced.
            in_features = width
        if output_layer:
            stack.append(torch.nn.Linear(in_features, 1))
        self.mlp = torch.nn.Sequential(*stack)

    def forward(self, x):
        """
        :param x: Float tensor whose last dimension is ``input_dim``,
            e.g. ``(batch_size, input_dim)``
        :return: output of the layer stack applied to ``x``
        """
        return self.mlp(x)



In [23]:
class WideAndDeepModel(torch.nn.Module):
    """
    Wide & Deep network: a linear ("wide") term plus an MLP over the
    concatenated field embeddings ("deep"), combined through a sigmoid.

    Reference:
        HT Cheng, et al. Wide & Deep Learning for Recommender Systems, 2016.
    """

    def __init__(self, field_dims, embed_dim, hidden_dims, dropout):
        """
        :param field_dims: sequence holding the number of distinct values of each field
        :param embed_dim: embedding size used for every field
        :param hidden_dims: widths of the hidden layers of the deep part
        :param dropout: drop probability used inside the deep part
        """
        super().__init__()
        self.embedding = FeaturesEmbedding(field_dims, embed_dim)
        self.linear = FeaturesLinear(field_dims)
        # Width of one sample once all field embeddings are laid side by side.
        self.concat_embed_dim = embed_dim * len(field_dims)
        self.mlp = MultiLayerPerceptron(self.concat_embed_dim, hidden_dims, dropout)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        :return: Float tensor of size ``(batch_size,)`` with values in (0, 1)
        """
        # Wide part: first-order term, one logit per sample.
        wide_logit = self.linear(x)
        # Deep part: flatten the per-field embeddings and feed them to the MLP.
        flat_embed = self.embedding(x).view(-1, self.concat_embed_dim)
        deep_logit = self.mlp(flat_embed)
        return torch.sigmoid((wide_logit + deep_logit).squeeze(1))
    

In [24]:
# Toy configuration for a smoke test: ten fields with 100 distinct values each,
# 8-dim embeddings, two hidden layers of width 16 and dropout 0.5.
field_dims = [100 for _ in range(10)]
model = WideAndDeepModel(field_dims=field_dims, embed_dim=8, hidden_dims=[16, 16], dropout=0.5)

In [25]:
model(torch.randint(0, 100, (1,10)))

tensor([0.5468], grad_fn=<SigmoidBackward>)

In [41]:
### Train Model

In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [176]:
# Columns kept from the joined ml-100k table: ids, the rating (used as the
# target), user attributes and the 0/1 genre indicator columns.
used_feature = ['user_id', 'item_id', 'rating',
                'age', 'gender', 'occupation', 'zipcode',
                'unknown', 'Action', 'Adventure', 'Animation', "Children's",
                'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
                'Sci-Fi', 'Thriller', 'War', 'Western']

# ``.copy()`` makes the subset an independent frame, so the ``pop`` below and
# later column assignments modify ``data`` itself without pandas raising
# SettingWithCopyWarning about writing to a slice of the loaded table.
data = pd.read_csv('data/ml-100k-joined.csv')[used_feature].copy()
# Separate the label column; ``data`` keeps only the feature columns.
target = data.pop('rating')

data.head()

Unnamed: 0,user_id,item_id,age,gender,occupation,zipcode,unknown,Action,Adventure,Animation,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,49,M,writer,55105,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,186,302,39,F,executive,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
2,22,377,25,M,writer,40206,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,244,51,28,M,technician,80525,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
4,166,346,47,M,educator,55113,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [112]:
from collections import Counter

In [113]:
Counter(data['zipcode'])

Counter({'55105': 539,
         '00000': 129,
         '40206': 128,
         '80525': 678,
         '55113': 219,
         '01581': 127,
         '17110': 92,
         '22903': 385,
         '94086': 271,
         '98101': 211,
         '97214': 232,
         '15217': 288,
         '93402': 216,
         '03060': 132,
         '43512': 141,
         '14853': 484,
         '22206': 81,
         '02154': 342,
         '44106': 296,
         '94702': 480,
         '67401': 181,
         'L9G2B': 69,
         '63108': 448,
         '95076': 397,
         '10707': 278,
         '54467': 121,
         '30220': 216,
         '75240': 93,
         '66215': 120,
         '52245': 24,
         '55439': 275,
         '58202': 27,
         '93550': 150,
         '98006': 264,
         '70808': 51,
         '21218': 493,
         '30033': 23,
         '95064': 518,
         '91344': 473,
         '90703': 184,
         '92629': 68,
         'E2A4H': 386,
         '31211': 62,
         '28734': 195

In [108]:
def sparse_feat_map_process(data):
    """Placeholder for mapping-based sparse-feature processing.

    Not implemented yet: ``data`` is ignored and ``None`` is returned.
    """
    return None

In [109]:
def sparse_feat_bin_process(data, bins=None, args=None):
    """Placeholder for bin-based sparse-feature processing.

    Not implemented yet: all arguments are ignored and ``None`` is returned.
    """
    return None

In [None]:
def dense_feat_norm_process(data):
    """Placeholder for dense-feature normalisation.

    Not implemented yet: ``data`` is ignored and ``None`` is returned. The
    header previously lacked its colon and body, so the cell could not even be
    parsed; it is now a valid no-op stub like the other ``*_process``
    placeholders.
    """
    pass

In [177]:
# Categorical columns that are replaced in place by integer codes.
sparse_feat = ['user_id', 'item_id', 'age', 'gender', 'occupation', 'zipcode']
# fname -> (list of original labels, {original label: integer code})
label_encoders = {}
for fname in sparse_feat:
    encoder = LabelEncoder()
    # Fit on the column and overwrite it with the encoded values in one step.
    data[fname] = encoder.fit_transform(data[fname])
    classes = encoder.classes_
    label_encoders[fname] = (classes.tolist(), dict(zip(classes, encoder.transform(classes))))
    # NOTE(review): because ``data`` is overwritten, re-running this cell fits
    # on the integer codes and replaces the stored original labels.

In [178]:
label_encoders['gender']

(['F', 'M'], {'F': 0, 'M': 1})

In [179]:
data.head()

Unnamed: 0,user_id,item_id,age,gender,occupation,zipcode,unknown,Action,Adventure,Animation,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,195,241,39,1,20,415,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,185,301,29,0,6,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
2,21,376,15,1,20,311,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,243,50,18,1,19,591,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
4,165,345,37,1,3,420,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [180]:
# One entry per column of ``data``: the number of encoder classes for
# label-encoded columns, and 1 for every other column.
# NOTE(review): the remaining columns hold 0/1 values; if they are meant to go
# through the offset-based embedding lookup, a width of 1 leaves no slot for
# the value 1 -- confirm the intended handling of these columns.
feat_dims = [
    len(label_encoders[col][0]) if col in label_encoders else 1
    for col in data.columns
]
feat_dims

[943,
 1682,
 61,
 2,
 21,
 795,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [181]:
sum(feat_dims)

3523

In [182]:
data.max()

user_id         942
item_id        1681
age              60
gender            1
occupation       20
zipcode         794
unknown           1
Action            1
Adventure         1
Animation         1
Children's        1
Comedy            1
Crime             1
Documentary       1
Drama             1
Fantasy           1
Film-Noir         1
Horror            1
Musical           1
Mystery           1
Romance           1
Sci-Fi            1
Thriller          1
War               1
Western           1
dtype: int64

In [168]:
# Split the index labels once so features and targets stay aligned row for row.
# A fixed ``random_state`` makes the split identical after a kernel restart.
train_idx, test_idx = train_test_split(data.index, test_size=0.2, random_state=42)

# The split yields index *labels*, so select with label-based ``.loc``;
# positional ``.iloc`` is only equivalent while the index is the default
# 0..n-1 range.
train_data = data.loc[train_idx]
test_data = data.loc[test_idx]

train_target = target.loc[train_idx]
test_target = target.loc[test_idx]

In [125]:
torch.utils.data.TensorDataset??

[0;31mInit signature:[0m [0mtorch[0m[0;34m.[0m[0mutils[0m[0;34m.[0m[0mdata[0m[0;34m.[0m[0mTensorDataset[0m[0;34m([0m[0;34m*[0m[0mtensors[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mTensorDataset[0m[0;34m([0m[0mDataset[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34mr"""Dataset wrapping tensors.[0m
[0;34m[0m
[0;34m    Each sample will be retrieved by indexing tensors along the first dimension.[0m
[0;34m[0m
[0;34m    Arguments:[0m
[0;34m        *tensors (Tensor): tensors that have the same size of the first dimension.[0m
[0;34m    """[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m    [0;32mdef[0m [0m__init__[0m[0;34m([0m[0mself[0m[0;34m,[0m [0;34m*[0m[0mtensors[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;32massert[0m [0mall[0m[0;34m([0m[0mtensors[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m.[0m[0msize[0m[0;34m([0m[0;36m0[0m[0;34m)[0m [0;34m==

In [131]:
from torch.utils.data import Dataset, DataLoader

In [140]:
data.values.shape

(100000, 26)

In [147]:
data['rating'].values

array([3, 3, 1, ..., 1, 2, 3])

In [None]:
data.drop

In [135]:
class CSVDataSet(Dataset):
    """Map-style dataset pairing the rows of a feature table with their labels."""

    def __init__(self, data_df, lables):
        """
        :param data_df: pandas DataFrame of features, one row per sample
        :param lables: labels aligned with the rows of ``data_df`` (a pandas
            Series or any array-like); the parameter spelling is kept as-is
        :raises ValueError: if features and labels differ in length
        """
        self.data = data_df.values
        self.label = np.asarray(lables)
        if len(self.data) != len(self.label):
            raise ValueError(
                'features and labels must have the same length, got '
                '{} and {}'.format(len(self.data), len(self.label)))

    def __len__(self):
        """Number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """
        :param idx: integer position, or a tensor holding one
        :return: ``(features, label)`` for the requested sample as NumPy values
        """
        if torch.is_tensor(idx):
            # Convert tensor indices to plain Python values before NumPy indexing.
            idx = idx.tolist()
        return self.data[idx], self.label[idx]

In [None]:
# Placeholder loaders built from random tensors: 1000 samples with 10 float
# features each and integer labels drawn from [0, 10).
train_loader = torch.utils.data.DataLoader(
    dataset=torch.utils.data.TensorDataset(
        torch.randn(1000, 10),
        torch.randint(low=0, high=10, size=(1000,)),
    ),
    batch_size=16,
    shuffle=True,
)
vali_loader = torch.utils.data.DataLoader(
    dataset=torch.utils.data.TensorDataset(
        torch.randn(1000, 10),
        torch.randint(low=0, high=10, size=(1000,)),
    ),
    batch_size=16,
)

# Use the GPU when one is available, otherwise the CPU.
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pull a single batch to check the tensor shapes coming out of the loader.
X, y = next(iter(train_loader))
print(X.shape, y.shape)


### Plot Model

In [27]:
# Serialise the whole ``model`` object (not only its ``state_dict``) to disk.
# NOTE(review): the "It won't be checked" warnings printed below come from
# saving the module this way; saving ``model.state_dict()`` is the usual
# alternative -- confirm which artifact downstream code expects before changing.
torch.save(model, 'models/w&d.pt')

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [26]:
# Example input used only to trace the model for export: one sample with ten
# field indices drawn from [0, 100), matching the toy ``field_dims = [100]*10``.
dummy_input = torch.randint(0, 100, (1,10))
# Write the traced graph to an ONNX file; ``verbose=True`` also prints the graph.
torch.onnx.export(model, dummy_input, "models/w&d.onnx", verbose=True)

graph(%0 : Long(1, 10),
      %embedding.embedding.weight : Float(1000, 8),
      %linear.bias : Float(1),
      %linear.fc.weight : Float(1000, 1),
      %mlp.mlp.0.weight : Float(16, 80),
      %mlp.mlp.0.bias : Float(16),
      %mlp.mlp.3.weight : Float(16, 16),
      %mlp.mlp.3.bias : Float(16),
      %mlp.mlp.6.weight : Float(1, 16),
      %mlp.mlp.6.bias : Float(1)):
  %10 : Long(1, 10) = onnx::Constant[value=   0  100  200  300  400  500  600  700  800  900 [ Variable[CPULongType]{1,10} ]]()
  %11 : Long(1, 10) = onnx::Add(%0, %10), scope: WideAndDeepModel/FeaturesLinear[linear] # <ipython-input-18-4d85377e88f2>:19:0
  %12 : Float(1, 10, 1) = onnx::Gather(%linear.fc.weight, %11), scope: WideAndDeepModel/FeaturesLinear[linear]/Embedding[fc] # /Users/ring/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/functional.py:1484:0
  %13 : Float(1, 1) = onnx::ReduceSum[axes=[1], keepdims=0](%12), scope: WideAndDeepModel/FeaturesLinear[linear] # <ipython-input-18-4d85377e88f2>:2