In [1]:
from pyfm import pylibfm
from sklearn.feature_extraction import DictVectorizer
import numpy as np

In [2]:
# Toy interactions as (user, item, age) tuples; expanded below into the
# dict-per-sample form that DictVectorizer consumes.
toy_rows = [
    ("1", "5", 19),
    ("2", "43", 33),
    ("3", "20", 55),
    ("4", "10", 20),
]
train = [{"user": user, "item": item, "age": age} for user, item, age in toy_rows]

# String-valued fields ("user", "item") become one-hot columns, while the
# numeric "age" field is kept as a single numeric column.
v = DictVectorizer()
X = v.fit_transform(train)
print(X.toarray())

[[19.  0.  0.  0.  1.  1.  0.  0.  0.]
 [33.  0.  0.  1.  0.  0.  1.  0.  0.]
 [55.  0.  1.  0.  0.  0.  0.  1.  0.]
 [20.  1.  0.  0.  0.  0.  0.  0.  1.]]


In [3]:
# Toy target: one label of 1.0 for every row of the feature matrix.
n_rows = X.shape[0]
y = np.repeat(1.0, n_rows)
y

array([1., 1., 1., 1.])

In [4]:
# Fit a factorization machine with the library's default settings on the toy
# matrix, then score one (user, item, age) combination that is not in `train`.
fm = pylibfm.FM()
fm.fit(X, y)
new_sample = {"user": "1", "item": "10", "age": 24}
fm.predict(v.transform(new_sample))

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training log loss: 0.38094


array([0.99204031])

In [5]:
## 电影评分数据集实战

In [2]:
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from pyfm import pylibfm

# Read in data
def loadData(filename,path="ml-100k/"):
    """Parse a tab-separated ratings file into records for DictVectorizer.

    Every non-blank line must contain exactly four tab-separated fields:
    user id, movie id, rating and timestamp. The timestamp is read but not
    used. Blank or whitespace-only lines (for example a trailing empty line)
    are skipped instead of aborting the load.

    Args:
        filename: Name of the ratings file; it is appended directly to `path`.
        path: Prefix concatenated in front of `filename` (normally a directory
            name that ends with a path separator).

    Returns:
        A tuple `(data, y, users, items)` where
        `data` is a list of `{"user_id": str, "movie_id": str}` dicts, one per
        rating; `y` is a numpy array of the ratings as floats, in file order;
        `users` is the set of distinct user id strings; and `items` is the set
        of distinct movie id strings.

    Raises:
        ValueError: If a non-blank line does not have exactly four fields or
            its rating cannot be converted to float.
    """
    data = []
    y = []
    users = set()
    items = set()
    with open(path + filename) as f:
        for line in f:
            # Drop only the line terminator; tabs must survive so that an
            # empty trailing field still counts as a field.
            line = line.rstrip("\r\n")
            if not line.strip():
                continue
            user, movieid, rating, _timestamp = line.split('\t')
            data.append({"user_id": user, "movie_id": movieid})
            y.append(float(rating))
            users.add(user)
            items.add(movieid)

    return (data, np.array(y), users, items)

In [3]:
# Read the two rating files; loadData returns the records, the ratings, and
# the sets of user and item ids seen in each file.
train_data, y_train, train_users, train_items = loadData("ua.base")
test_data, y_test, test_users, test_items = loadData("ua.test")

# The vectorizer is fitted on the training records only and then reused,
# unchanged, to encode the test records.
v = DictVectorizer()
X_train = v.fit_transform(train_data)
X_test = v.transform(test_data)

In [4]:
# Hyperparameters for the rating-regression factorization machine, kept in
# one place so they are easy to read and tweak.
fm_regression_params = {
    "num_factors": 10,
    "num_iter": 100,
    "verbose": True,
    "task": "regression",
    "initial_learning_rate": 0.001,
    "learning_rate_schedule": "optimal",
}

# Build the model and train it on the vectorized training ratings.
fm = pylibfm.FM(**fm_regression_params)
fm.fit(X_train, y_train)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.59486
-- Epoch 2
Training MSE: 0.51817
-- Epoch 3
Training MSE: 0.49050
-- Epoch 4
Training MSE: 0.47471
-- Epoch 5
Training MSE: 0.46431
-- Epoch 6
Training MSE: 0.45681
-- Epoch 7
Training MSE: 0.45109
-- Epoch 8
Training MSE: 0.44651
-- Epoch 9
Training MSE: 0.44281
-- Epoch 10
Training MSE: 0.43963
-- Epoch 11
Training MSE: 0.43680
-- Epoch 12
Training MSE: 0.43445
-- Epoch 13
Training MSE: 0.43237
-- Epoch 14
Training MSE: 0.43040
-- Epoch 15
Training MSE: 0.42869
-- Epoch 16
Training MSE: 0.42710
-- Epoch 17
Training MSE: 0.42548
-- Epoch 18
Training MSE: 0.42407
-- Epoch 19
Training MSE: 0.42277
-- Epoch 20
Training MSE: 0.42143
-- Epoch 21
Training MSE: 0.42023
-- Epoch 22
Training MSE: 0.41898
-- Epoch 23
Training MSE: 0.41777
-- Epoch 24
Training MSE: 0.41652
-- Epoch 25
Training MSE: 0.41533
-- Epoch 26
Training MSE: 0.41423
-- Epoch 27
Training MSE: 0.41298
-- Epoch 28
Tra

In [5]:
from sklearn.metrics import mean_squared_error

# Predict ratings for the test matrix and report the mean squared error
# against the true test ratings.
preds = fm.predict(X_test)
test_mse = mean_squared_error(y_test, preds)
print("FM MSE: %.4f" % test_mse)

FM MSE: 0.8909


In [6]:
# 分类任务实战

In [10]:
# 搞数据
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from pyfm import pylibfm

from sklearn.datasets import make_classification

# Synthetic binary-classification data. The generator is seeded (like the
# split below) so that re-running the notebook reproduces the same dataset.
X, y = make_classification(
    n_samples=1000,
    n_features=100,
    n_clusters_per_class=1,
    random_state=42,
)

# One {column_index: value} dict per sample, the input form DictVectorizer
# expects. Keying by column index keeps every feature even when two columns
# of a row happen to hold the same value.
data = [dict(enumerate(row)) for row in X]

train_rows, test_rows, y_train, y_test = train_test_split(
    data, y, test_size=0.1, random_state=42
)

# Fit the vectorizer on the training rows only, then encode the test rows
# with the same fitted vocabulary.
v = DictVectorizer()
X_train = v.fit_transform(train_rows)
X_test = v.transform(test_rows)

In [11]:
# Build the model: hyperparameters for the classification factorization
# machine are collected in one dict.
fm_classification_params = {
    "num_factors": 50,
    "num_iter": 10,
    "verbose": True,
    "task": "classification",
    "initial_learning_rate": 0.0001,
    "learning_rate_schedule": "optimal",
}

# Train on the vectorized training split.
fm = pylibfm.FM(**fm_classification_params)
fm.fit(X_train, y_train)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training log loss: 2.12467
-- Epoch 2
Training log loss: 1.74185
-- Epoch 3
Training log loss: 1.42232
-- Epoch 4
Training log loss: 1.16085
-- Epoch 5
Training log loss: 0.94964
-- Epoch 6
Training log loss: 0.78052
-- Epoch 7
Training log loss: 0.64547
-- Epoch 8
Training log loss: 0.53758
-- Epoch 9
Training log loss: 0.45132
-- Epoch 10
Training log loss: 0.38187


In [12]:
from sklearn.metrics import log_loss

# Score the held-out split. The model's predictions are handed straight to
# log_loss (presumably class-1 probabilities for task="classification" - TODO confirm).
held_out_scores = fm.predict(X_test)
print("Validation log loss: %.4f" % log_loss(y_test, held_out_scores))

Validation log loss: 1.3678
