In [1]:
# References:
# - cmfrec MovieLens side-information example notebook:
#   https://nbviewer.org/github/david-cortes/cmfrec/blob/master/example/cmfrec_movielens_sideinfo.ipynb
# - cmfrec `CMF` API documentation:
#   https://cmfrec.readthedocs.io/en/latest/#cmf

In [2]:
import pandas as pd
import numpy as np
from cmfrec import CMF
import torch
import itertools
import pickle

In [3]:
# Input locations: the dataset directory plus the two embedding files read below.
cnn_file = "inceptionNet_cnn_emb_tensor.pkl"
word2vec_emb_file = "Word2VecAverageArray.npy"
dataset_folder = "../dataset/"

# The text before the first underscore of the CNN file name tags this run; it is
# reused in the name of the pickle that stores the learned embeddings.
# NOTE(review): the recorded output of this cell shows a different tag than the
# current file name produces, so the output looks stale — re-run to confirm.
method = cnn_file.partition("_")[0]
print(method)

googleNet


In [4]:
# Read the array stored in `word2vec_emb_file` from the dataset folder and
# display its shape as the cell output.
word2vec_emb = np.load(f"{dataset_folder}{word2vec_emb_file}")
word2vec_emb.shape

(5000, 100)

In [5]:
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load files that come from a trusted source.
# map_location="cpu" lets a tensor that was saved from a GPU load on a CPU-only
# machine (and .numpy() only works on CPU tensors).
cnn_emb = torch.load(dataset_folder + cnn_file, map_location="cpu")
# detach() drops any autograd history so .numpy() cannot fail on a tensor that
# requires grad; for a plain tensor it is a no-op view.
cnn_emb_np = cnn_emb.detach().numpy()

In [6]:
# Start the long-format frame: one row per (embedding row, feature column) pair.
cmf_df = pd.DataFrame(columns = ["UserId", "ItemId"])

# Number of feature columns per row once the two embedding matrices are placed
# side by side.
n_features = word2vec_emb.shape[1] + cnn_emb_np.shape[1]

# UserId is the row index of word2vec_emb, repeated once per feature column:
# 0, 0, ..., 0, 1, 1, ..., 1, ...
# np.repeat replaces the former list comprehension that was executed only for
# its side effect on `left`; .tolist() keeps `left` a plain list of Python ints.
left = np.repeat(np.arange(word2vec_emb.shape[0], dtype=np.int64), n_features).tolist()
print(len(left))
cmf_df["UserId"] = left

5500000


In [7]:
# ItemId labels the feature columns. The ids start at n_rows, so they are
# disjoint from the UserId range 0 .. n_rows - 1, and the same block of ids is
# repeated once for every row.
n_rows = word2vec_emb.shape[0]
n_features = word2vec_emb.shape[1] + cnn_emb_np.shape[1]
right = list(range(n_rows, n_rows + n_features)) * n_rows
# (The former bare `len(right)` expression was dropped: it was not the last
# expression of the cell, so its value was never displayed.)
cmf_df["ItemId"] = right
cmf_df.head()

Unnamed: 0,UserId,ItemId
0,0,5000
1,0,5001
2,0,5002
3,0,5003
4,0,5004


In [8]:
# Place the two embedding matrices side by side (same rows, columns appended).
# Use the NumPy copy `cnn_emb_np` rather than the torch tensor `cnn_emb`, so
# np.concatenate does not depend on implicit tensor-to-array conversion.
emb_concat = np.concatenate([word2vec_emb, cnn_emb_np], axis=1)

In [9]:
# Flatten the concatenated matrix in row-major order so that the k-th value
# lines up with the k-th (UserId, ItemId) pair: all columns of row 0, then all
# columns of row 1, and so on.
cmf_df["Rating"] = list(emb_concat.ravel())
cmf_df.head()

Unnamed: 0,UserId,ItemId,Rating
0,0,5000,-0.062224
1,0,5001,0.091295
2,0,5002,-0.135811
3,0,5003,0.040227
4,0,5004,0.474552


In [10]:
# Fit a CMF model with k=50 on the long-format frame; the fitted model is kept
# in `m_classic` and its A_ / B_ matrices are read in the following cells.
m_classic = CMF(k=50).fit(cmf_df)

Starting ALS optimization routine

Updating B ... done
Updating A ... done
	Completed ALS iteration  1

Updating B ... done
Updating A ... done
	Completed ALS iteration  2

Updating B ... done
Updating A ... done
	Completed ALS iteration  3

Updating B ... done
Updating A ... done
	Completed ALS iteration  4

Updating B ... done
Updating A ... done
	Completed ALS iteration  5

Updating B ... done
Updating A ... done
	Completed ALS iteration  6

Updating B ... done
Updating A ... done
	Completed ALS iteration  7

Updating B ... done
Updating A ... done
	Completed ALS iteration  8

Updating B ... done
Updating A ... done
	Completed ALS iteration  9

Updating B ... done
Updating A ... done
	Completed ALS iteration 10

ALS procedure terminated successfully


In [11]:
# Inspect the shape of the fitted model's A_ matrix.
m_classic.A_.shape

(5000, 50)

In [12]:
# Inspect the shape of the fitted model's B_ matrix.
m_classic.B_.shape

(1100, 50)

In [13]:
# Collect the learned factor matrices:
#   E0 - the full A_ matrix
#   E1 - the first word2vec_emb.shape[1] rows of B_
#   E2 - the remaining rows of B_
# NOTE(review): this assumes the rows of B_ follow ascending ItemId order (the
# word2vec columns first, then the CNN columns) — confirm against cmfrec's
# internal ID mapping.
embeddings = {
    "E0": m_classic.A_,
    "E1": m_classic.B_[:word2vec_emb.shape[1], :],
    "E2": m_classic.B_[word2vec_emb.shape[1]:, :],
}

# Persist the dictionary; the file name carries the `method` tag.
with open(f'embeddings_cmf_{method}.pkl', 'wb') as handle:
    pickle.dump(embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### Downstream tasks

In [14]:
# Load the train / test id tensors from the configured dataset folder (instead
# of repeating the literal path) and print their sizes.
# NOTE(review): torch.load unpickles the files, which can execute arbitrary
# code; only load files that come from a trusted source.
# NOTE(review): in the cells visible here these ids are only printed — the
# train/test split further down slices by position instead. Confirm that the
# positional split matches these ids.
train_ids = torch.load(dataset_folder + "train_book_id.pkl")
test_ids = torch.load(dataset_folder + "test_book_id.pkl")
print(train_ids.size())
print(test_ids.size())

torch.Size([4000])
torch.Size([1000])


In [15]:
# Read the book metadata from the configured dataset folder (instead of
# repeating the literal path).
df = pd.read_csv(dataset_folder + "books_with_genres.csv")
# Binary classification target: 1 where binary_category equals 'fiction',
# 0 otherwise (missing values compare unequal and therefore map to 0).
# A vectorised comparison replaces the former per-row lambda.
df['is_fiction'] = (df['binary_category'] == 'fiction').astype('int64')
df.head()

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,"mystery, thriller, crime",poetry,romance,non-fiction,children,young-adult,"comics, graphic",category,binary_category,is_fiction
0,,7,['189911'],US,eng,"[{'count': '58', 'name': 'to-read'}, {'count':...",B00071IKUY,False,4.03,,...,1.0,1.0,,,,,,"fantasy, paranormal",fiction,1
1,842379428.0,566,[],US,eng,"[{'count': '6393', 'name': 'to-read'}, {'count...",,False,4.26,B000FCKCJC,...,,,,163.0,,,,non-fiction,non-fiction,0
2,590417010.0,193,[],US,eng,"[{'count': '450', 'name': 'to-read'}, {'count'...",,False,4.43,B017RORXNI,...,,,,2.0,109.0,1.0,,children,non-fiction,0
3,1400041694.0,44,[],US,en-US,"[{'count': '362', 'name': 'to-read'}, {'count'...",,False,3.75,B002OTKEP6,...,,,,37.0,,,,non-fiction,non-fiction,0
4,,1,[],US,eng,"[{'count': '8', 'name': 'to-read'}, {'count': ...",,False,3.83,,...,,3.0,,,,,,poetry,non-fiction,0


In [16]:
# Positional split of the E0 embeddings: rows 0..3999 train, the rest test.
# NOTE(review): the split point 4000 is hard-coded and the train/test id
# tensors loaded earlier are not used to select rows — confirm that the first
# 4000 rows really are the training examples.
X_train, X_test = embeddings["E0"][:4000], embeddings["E0"][4000:]
print(X_train.shape)
print(X_test.shape)

(4000, 50)
(1000, 50)


#### Regression

In [17]:
# Regression target: the average_rating column, split at row position 4000,
# the same split point used for X_train / X_test.
y_train, y_test = df["average_rating"].iloc[:4000], df["average_rating"].iloc[4000:]
print(y_train.shape)
print(y_test.shape)

(4000,)
(1000,)


In [18]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Support-vector regression with default hyper-parameters on the embeddings.
svr = SVR()
svr.fit(X_train, y_train)

# Score on the held-out rows: mean squared error and R^2.
preds = svr.predict(X_test)
error, r2 = mean_squared_error(y_test, preds), r2_score(y_test, preds)

In [19]:
# Show the metrics from the previous cell: MSE on the first line, R^2 on the second.
print(error, r2, sep="\n")

0.19294816055738817
-0.10659926113924434


In [20]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline on the same features and target.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Score on the held-out rows: mean squared error and R^2.
preds = lr.predict(X_test)
error, r2 = mean_squared_error(y_test, preds), r2_score(y_test, preds)

In [21]:
# Show the metrics from the previous cell: MSE on the first line, R^2 on the second.
print(error, r2, sep="\n")

0.17456905518169777
-0.0011911330161977762


In [22]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron regressor; random_state fixes the initialisation and
# max_iter caps the number of training iterations at 500.
regr = MLPRegressor(random_state=1, max_iter=500)
regr.fit(X_train, y_train)

# Score on the held-out rows and report both metrics.
preds = regr.predict(X_test)
error, r2 = mean_squared_error(y_test, preds), r2_score(y_test, preds)
print('error: ', error)
print('r2 score: ', r2)

error:  0.3244524811665002
r2 score:  -0.8608048653920986


In [23]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

# Bagging ensemble of 10 decision trees with a fixed seed.
# The wrapped estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot is the same in both.
regr = BaggingRegressor(DecisionTreeRegressor(), n_estimators=10, random_state=0).fit(X_train, y_train)

# Score on the held-out rows and report both metrics.
preds = regr.predict(X_test)
error = mean_squared_error(y_test, preds)
r2 = r2_score(y_test, preds)
print('error: ', error)
print('r2 score: ', r2)

error:  0.198301981
r2 score:  -0.13730457457136858


#### Classification

In [24]:
# Classification target: the is_fiction flag, split at row position 4000,
# the same split point used for X_train / X_test.
y_train, y_test = df["is_fiction"].iloc[:4000], df["is_fiction"].iloc[4000:]
print(y_train.shape)
print(y_test.shape)

(4000,)
(1000,)


In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,accuracy_score, precision_score, recall_score

# Shallow random forest (max_depth=2) with a fixed seed.
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

# NOTE(review): the recorded output of this cell (recall 1.0, precision equal
# to accuracy) is what one would see if every test row were predicted as the
# positive class — worth checking before trusting the F1 score.
score = f1_score(y_test, preds)
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
print('f1_score: ', score)
# Label spelling corrected (was 'accuarcy').
print('accuracy: ', accuracy)
print('precision: ', precision)
print('recall: ', recall)

f1_score:  0.7782529016493586
accuarcy:  0.637
precision:  0.637
recall:  1.0


In [26]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 10 support-vector classifiers with a fixed seed.
# The wrapped estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot is the same in both.
clf = BaggingClassifier(SVC(), n_estimators=10, random_state=0).fit(X_train, y_train)
preds = clf.predict(X_test)

# Binary classification metrics on the held-out rows.
score = f1_score(y_test, preds)
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
print('f1_score: ', score)
# Label spelling corrected (was 'accuarcy').
print('accuracy: ', accuracy)
print('precision: ', precision)
print('recall: ', recall)

f1_score:  0.76410998552822
accuarcy:  0.674
precision:  0.7087248322147651
recall:  0.8288854003139717


In [27]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 100 depth-1 trees (stumps), learning rate 1.0 and a
# fixed seed.
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train, y_train)
preds = clf.predict(X_test)

# Binary classification metrics on the held-out rows.
score = f1_score(y_test, preds)
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
print('f1_score: ', score)
# Label spelling corrected (was 'accuarcy').
print('accuracy: ', accuracy)
print('precision: ', precision)
print('recall: ', recall)

f1_score:  0.7263157894736841
accuarcy:  0.636
precision:  0.696969696969697
recall:  0.7582417582417582
