In [39]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
import numpy as np
import pandas as pd

In [10]:
class simpleNet(nn.Module):  # 继承nn.Module
    def __init__(self):
        super(simpleNet, self).__init__()
        self.l1 = nn.Linear(32,5)
        self.l2 = nn.Linear(5,2)
 

    def forward(self, x):
        x = self.l1(x)
        x = self.l2(x)
        output = F.log_softmax(x, dim=1)
        return output

In [11]:
x=np.ones((3,32))
y=np.ones(3)

In [12]:
mynet=simpleNet()

In [14]:
a=[1,2,3,"4"]
list(filter(lambda x: isinstance(x, str), a)) if len(a) else []

['4']

In [16]:
df=pd.DataFrame({
    "a":[1,2,3,4],
    "b":["1","2","3","4"],
    "c":["man","woman","woman","man"]
})
df

Unnamed: 0,a,b,c
0,1,1,man
1,2,2,woman
2,3,3,woman
3,4,4,man


In [26]:
import sklearn
from sklearn.preprocessing import MinMaxScaler, LabelEncoder  # 数值特征最好也归一化一下。

In [30]:
cal_names=['b','c']
for c in cal_names:
    le = LabelEncoder()
    df[c] = le.fit_transform(df[c]) 
df

Unnamed: 0,a,b,c
0,1,0,0
1,2,1,1
2,3,2,1
3,4,3,0


In [46]:
dataset = TensorDataset(torch.tensor(df.values).float(), torch.tensor(np.ones(4)).float())

loader = DataLoader(dataset, shuffle=False, batch_size=32) 

for x,y in loader:
    print(x,y)

tensor([[1., 0., 0.],
        [2., 1., 1.],
        [3., 2., 1.],
        [4., 3., 0.]]) tensor([1., 1., 1., 1.])


In [64]:
# 建自己的dataset:
class Mydata(Dataset):
    def __init__(self,df):
        super(Mydata, self).__init__()
        self.df=df
    def __len__(self):
        return len(self.df)
    def __getitem__(self, index):
        return torch.tensor(self.df.iloc[index,:].values)
    def get_col(self,col_name):                                 # 返回df的某列数据
        return torch.tensor(self.df[col_name].values)
    
mydata=Mydata(df)
mydata[2]

tensor([3, 2, 1])

In [65]:
mydata.get_col('a')

tensor([1, 2, 3, 4])

In [66]:
loader = DataLoader(mydata, shuffle=False, batch_size=32)
for x in loader:  # 只轮询一遍。对应一个epoch。 用的时候需要在外边加一个epoch的for循环。把数据遍历n_epoch次
    print(x)

tensor([[1, 0, 0],
        [2, 1, 1],
        [3, 2, 1],
        [4, 3, 0]])


In [35]:
# 每个离散特征维护该域对应的C个embeeding
cal_feature_list=[]
for name in cal_names:
    cal_feature_list.append(
      {"name":name,
       "n_unique":df[name].nunique(),
       "embed_size":5
      }
    )
cal_feature_list

[{'name': 'b', 'n_unique': 4, 'embed_size': 5},
 {'name': 'c', 'n_unique': 2, 'embed_size': 5}]

In [36]:
embedding_dict = nn.ModuleDict(
    {feat['name']: nn.Embedding(feat['n_unique'], feat['embed_size'])
     for feat in cal_feature_list}
)
embedding_dict

ModuleDict(
  (b): Embedding(4, 5)
  (c): Embedding(2, 5)
)

In [71]:
embedding_dict['b'].weight                                # 离散特征b对应的C个embedding

Parameter containing:
tensor([[-0.2252,  1.9799,  0.7338,  0.7225,  1.0330],
        [-1.2191, -0.4461, -0.3038,  1.0229, -1.0642],
        [-0.9499, -0.0350,  1.2186,  1.0014, -1.5774],
        [ 1.3050,  1.5378,  0.3975,  1.2144,  0.7354]], requires_grad=True)

In [72]:
embedding_dict['c'].weight                               # 离散特征c对应的2个embedding

Parameter containing:
tensor([[ 2.8380,  0.4838, -0.0523, -0.6702, -1.1655],
        [ 0.1504, -0.4948, -0.2001, -0.0315, -0.9504]], requires_grad=True)

In [67]:
# 把样本的每个离散特征，映射为对应的embedding
sparse_embedding_list = [embedding_dict[feat['name']] (mydata.get_col(feat['name']).long()  ) for feat in cal_feature_list]
sparse_embedding_list   # 分别对应样本的第一个特征、第二个特征

[tensor([[-0.2252,  1.9799,  0.7338,  0.7225,  1.0330],
         [-1.2191, -0.4461, -0.3038,  1.0229, -1.0642],
         [-0.9499, -0.0350,  1.2186,  1.0014, -1.5774],
         [ 1.3050,  1.5378,  0.3975,  1.2144,  0.7354]],
        grad_fn=<EmbeddingBackward>),
 tensor([[ 2.8380,  0.4838, -0.0523, -0.6702, -1.1655],
         [ 0.1504, -0.4948, -0.2001, -0.0315, -0.9504],
         [ 0.1504, -0.4948, -0.2001, -0.0315, -0.9504],
         [ 2.8380,  0.4838, -0.0523, -0.6702, -1.1655]],
        grad_fn=<EmbeddingBackward>)]

In [74]:
df[["b",'c']]  # 对应每个样本的2个离散特征embedding

Unnamed: 0,b,c
0,0,0
1,1,1
2,2,1
3,3,0


In [77]:
fm_input=torch.cat(sparse_embedding_list, dim=1)  # 每个样本的每列离散特征，经过embedding后，concat到一起
fm_input

tensor([[-0.2252,  1.9799,  0.7338,  0.7225,  1.0330,  2.8380,  0.4838, -0.0523,
         -0.6702, -1.1655],
        [-1.2191, -0.4461, -0.3038,  1.0229, -1.0642,  0.1504, -0.4948, -0.2001,
         -0.0315, -0.9504],
        [-0.9499, -0.0350,  1.2186,  1.0014, -1.5774,  0.1504, -0.4948, -0.2001,
         -0.0315, -0.9504],
        [ 1.3050,  1.5378,  0.3975,  1.2144,  0.7354,  2.8380,  0.4838, -0.0523,
         -0.6702, -1.1655]], grad_fn=<CatBackward>)

In [90]:
fm_input= fm_input.detach().numpy()                # embedding后，每个位置对应的元素22相乘。捕捉类别共现信号

In [91]:
# 离散特征embedding后。按元素22交互： [x]
import tensorflow as tf
summ=tf.reduce_sum(fm_input, axis=1, keepdims=True)  # 第一项是和的平方
square_of_sum=tf.square(summ)
square_of_sum

<tf.Tensor: shape=(4, 1), dtype=float32, numpy=
array([[32.238087 ],
       [12.506633 ],
       [ 3.4921005],
       [43.8768   ]], dtype=float32)>

In [93]:
# 第二项是平方的和
sum_of_square=tf.reduce_sum( fm_input * fm_input, axis=1, keepdims=True)
sum_of_square

<tf.Tensor: shape=(4, 1), dtype=float32, numpy=
array([[16.197065],
       [ 5.167873],
       [ 7.091295],
       [16.340336]], dtype=float32)>

In [95]:
y_FM_order2=0.5* (square_of_sum-sum_of_square)
y_FM_order2

<tf.Tensor: shape=(4, 1), dtype=float32, numpy=
array([[ 8.020511 ],
       [ 3.66938  ],
       [-1.7995971],
       [13.768232 ]], dtype=float32)>