In [1]:
%install-location $cwd/swift-install
%install-swiftpm-flags -c release
%install '.package(url: "https://github.com/tensorflow/swift-models", .branch("master"))' Batcher ModelSupport Datasets

Installing packages:
	.package(url: "https://github.com/tensorflow/swift-models", .branch("master"))
		Batcher
		ModelSupport
		Datasets
With SwiftPM flags: ['-c', 'release']
Working in: /tmp/tmpg98c6xze/swift-install
[1/2] Compiling jupyterInstalledPackages jupyterInstalledPackages.swift
Initializing Swift...


In [0]:
import Foundation
import TensorFlow
import Datasets

# Dataset

In [0]:
extension Sequence where Element : Collection {
    subscript(column column : Element.Index) -> [ Element.Iterator.Element ] {
        return map { $0[ column ] }
    }
}
extension Sequence where Iterator.Element: Hashable {
    func unique() -> [Iterator.Element] {
        var seen: Set<Iterator.Element> = []
        return filter { seen.insert($0).inserted }
    }
}

In [4]:
print(_ExecutionContext.global.deviceNames)

["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:XLA_CPU:0", "/job:localhost/replica:0/task:0/device:GPU:0", "/job:localhost/replica:0/task:0/device:XLA_GPU:0"]


In [0]:
public struct MovieLens {

    public let users: [Float]
    public let items: [Float]
    public let num_users: Int
    public let num_items: Int
    public let user_item_rating: [TensorPair<Int32,Float>]
    public let rating: [Float]
    public let user2id: [Float:Int]
    public let id2user: [Int:Float]
    public let item2id: [Float:Int]
    public let id2item: [Int:Float]
    public let neg_sampling: Tensor<Float>

    static func downloadMovieLensDatasetIfNotPresent() -> String{
        let localURL = URL(fileURLWithPath: FileManager.default.currentDirectoryPath)
        let dataFolder = DatasetUtilities.downloadResource(
            filename: "ml-100k",
            fileExtension: "zip",
            remoteRoot: URL(string: "http://files.grouplens.org/datasets/movielens/")!,
            localStorageDirectory: localURL.appendingPathComponent("data/", isDirectory: true))

        return try! String(contentsOf: dataFolder.appendingPathComponent("u1.base"), encoding: .utf8)}

    public init() {
        let dataFiles  = MovieLens.downloadMovieLensDatasetIfNotPresent()
        let data: [[Float]] = dataFiles.split(separator: "\n").map{ String($0).split(separator: "\t").compactMap{ Float(String($0)) } }

        // let data = datad[0...5000]
        let users = data[column: 0].unique()
        let items = data[column: 1].unique()
        let rating = data[column: 2]

        let user_index = 0...users.count-1
        let user2id = Dictionary(uniqueKeysWithValues: zip(users,user_index))
        let id2user = Dictionary(uniqueKeysWithValues: zip(user_index,users))

        let item_index = 0...items.count-1
        let item2id = Dictionary(uniqueKeysWithValues: zip(items,item_index))
        let id2item = Dictionary(uniqueKeysWithValues: zip(item_index,items))

        var neg_sampling = Tensor<Float>(zeros: [users.count,items.count])

        var dataset:[TensorPair<Int32,Float>] = []

        for element in data{
            let u_index = user2id[element[0]]!
            let i_index = item2id[element[1]]!
            let rating = element[2]
            if (rating > 0){
              neg_sampling[u_index][i_index] = Tensor(1.0)
            }
        }

        for element in data{
            let u_index = user2id[element[0]]!
            let i_index = item2id[element[1]]!

            let x = Tensor<Int32>([Int32(u_index), Int32(i_index)])
            dataset.append(TensorPair<Int32, Float>(first:x, second: [1]))

            for i in 0...3{
              var i_index = Int.random(in:item_index)
              while(neg_sampling[u_index][i_index].scalarized() == 1.0){
                i_index = Int.random(in:item_index)
              }
              let x = Tensor<Int32>([Int32(u_index), Int32(i_index)])
              dataset.append(TensorPair<Int32, Float>(first: x, second: [0]))
            }
        }

        self.num_users = users.count
        self.num_items = items.count
        self.users = users
        self.items = items
        self.rating = rating
        self.user2id = user2id
        self.id2user = id2user
        self.item2id = item2id
        self.id2item = id2item
        self.user_item_rating = dataset
        self.neg_sampling = neg_sampling
    }
}

# Model

In [0]:
public struct NeuMF: Module {

    public typealias Scalar = Float

    @noDerivative public let num_users: Int
    @noDerivative public let num_items: Int
    @noDerivative public let mf_dim: Int
    @noDerivative public let mf_reg: Scalar
    @noDerivative public var mlp_layer_sizes : [Int] = [64,32,16,8]
    @noDerivative public var mlp_layer_regs: [Scalar] = [0,0,0,0]

    public var mf_user_embed: Embedding<Scalar>
    public var mf_item_embed: Embedding<Scalar>
    public var mlp_user_embed: Embedding<Scalar>
    public var mlp_item_embed: Embedding<Scalar>
    public var dense1: Dense<Scalar>
    public var dense2: Dense<Scalar>
    public var dense3: Dense<Scalar>
    public var final_dense: Dense<Scalar>


    public init(
        num_users: Int,
        num_items: Int,
        mf_dim: Int,
        mf_reg: Float,
        mlp_layer_sizes: [Int],
        mlp_layer_regs: [Float]
    ) {
        self.num_users = num_users
        self.num_items = num_items
        self.mf_dim = mf_dim
        self.mf_reg = mf_reg
        self.mlp_layer_sizes = mlp_layer_sizes
        self.mlp_layer_regs = mlp_layer_regs

        // precondition(self.mlp_layer_sizes[0]%2 == 0, "u dummy, mlp_layer_sizes[0] % 2 != 0")
        // precondition(self.mlp_layer_sizes.count == self.mlp_layer_regs.count, "u dummy, layer_sizes != layer_regs!")

        //TODO: regularization
        self.mf_user_embed = Embedding<Scalar>(vocabularySize: self.num_users, embeddingSize: self.mf_dim)
        self.mf_item_embed = Embedding<Scalar>(vocabularySize: self.num_items, embeddingSize: self.mf_dim)
        self.mlp_user_embed = Embedding<Scalar>(vocabularySize: self.num_users, embeddingSize: self.mlp_layer_sizes[0]/2)
        self.mlp_item_embed = Embedding<Scalar>(vocabularySize: self.num_items, embeddingSize: self.mlp_layer_sizes[0]/2)

        //TODO: Extend it for n layers by using for loop
        //Currently only for 3 layers
        dense1 = Dense(inputSize: self.mlp_layer_sizes[0], outputSize: self.mlp_layer_sizes[1], activation: relu)
        dense2 = Dense(inputSize: self.mlp_layer_sizes[1], outputSize: self.mlp_layer_sizes[2], activation: relu)
        dense3 = Dense(inputSize: self.mlp_layer_sizes[2], outputSize: self.mlp_layer_sizes[3], activation: relu)
        final_dense = Dense(inputSize: (self.mlp_layer_sizes[3] + self.mf_dim), outputSize: 1)
    }
        @differentiable
        public func callAsFunction(_ input: Tensor<Int32>) -> Tensor<Scalar>{
            let user_indices = input.unstacked(alongAxis:1)[0]
            let item_indices = input.unstacked(alongAxis:1)[1]

            let user_embed_mlp = self.mlp_user_embed(user_indices)
            let item_embed_mlp = self.mlp_item_embed(item_indices)
            let user_embed_mf = self.mf_user_embed(user_indices)
            let item_embed_mf = self.mf_item_embed(item_indices)

            // let mf_vector = matmul(user_embed_mf,item_embed_mf)
            let mf_vector = user_embed_mf*item_embed_mf
            var mlp_vector = user_embed_mlp.concatenated(with:item_embed_mlp,alongAxis:-1)
            //
            // print(mlp_vector.shape)
            mlp_vector = mlp_vector.sequenced(through: dense1, dense2, dense3)
            let vector = mlp_vector.concatenated(with:mf_vector,alongAxis:-1)

            return final_dense(vector)
            // return mf_vector
        }
    // }
}


# Training

In [0]:
import Batcher

In [8]:
let dataset = MovieLens()

Loading resource: ml-100k


In [0]:
let num_users = dataset.num_users
let num_items = dataset.num_items

In [10]:
print("Number of datapoints", dataset.user_item_rating.count)
print("Number of users", num_users)
print("Number of items", num_items)

Number of datapoints 400000
Number of users 943
Number of items 1650


In [0]:
let batcher = Batcher(on: dataset.user_item_rating, batchSize: 1024, shuffle: true)

In [0]:
var size:[Int] = [64, 32, 16, 8]
var regs:[Float] = [0, 0, 0, 0]
var model = NeuMF(num_users: num_users, num_items: num_items, mf_dim: 8, mf_reg: 0.0, mlp_layer_sizes: size, mlp_layer_regs: regs)

In [0]:
let optimizer = Adam(for: model, learningRate: 0.005)

In [0]:
withDevice(.gpu){
  for epoch in 1...100{
    var avg_loss: Float = 0.0
    Context.local.learningPhase = .training
    for data in batcher.sequenced(){
            let user_id = data.first
            let rating = data.second
            let (loss, grad) = valueWithGradient(at: model){model -> Tensor<Float> in
            let logits = model(user_id)
            return sigmoidCrossEntropy(logits: logits, labels: rating)}

            optimizer.update(&model, along: grad)
            avg_loss = avg_loss + loss.scalarized()
    }
    print("Epoch: \(epoch)", "Current loss: \(avg_loss/1024.0)")
  }
}

Epoch: 1 Current loss: 0.14999056
Epoch: 2 Current loss: 0.13836882
Epoch: 3 Current loss: 0.13333568


## Testing the result and then verifying it on Test dataset

In [22]:
let localURL = URL(fileURLWithPath: FileManager.default.currentDirectoryPath)
let dataFolder = DatasetUtilities.downloadResource(
    filename: "ml-100k",
    fileExtension: "zip",
    remoteRoot: URL(string: "http://files.grouplens.org/datasets/movielens/")!,
    localStorageDirectory: localURL.appendingPathComponent("data/", isDirectory: true))

Loading resource: ml-100k


In [0]:
let test =  String(contentsOf: dataFolder.appendingPathComponent("u1.test"), encoding: .utf8)

In [0]:
let data: [[Float]] = test.split(separator: "\n").map{ String($0).split(separator: "\t").compactMap{ Float(String($0)) } }

In [0]:
let count_item = Array(repeating: 0.0, count: data[column: 0].unique().count )

In [0]:
var item_count = Dictionary(uniqueKeysWithValues: zip(data[column: 0].unique(),count_item))

In [0]:
var test_neg_sampling = Tensor<Float>(zeros: [num_users,num_items])
for element in data{
    let rating = element[2]
    if (rating > 0 && dataset.item2id[element[1]] != nil){
      let u_index = dataset.user2id[element[0]]!
      let i_index = dataset.item2id[element[1]]!
      test_neg_sampling[u_index][i_index] = Tensor(1.0)
      item_count[element[0]] = item_count[element[0]]! + 1.0
    }
}

In [64]:
var correct = 0.0
var temp_correct = 0.0
var count = 0
for user in data[column: 0].unique(){
    var negative_item: [Float] = []
    var output: [Float] = []
    let user_index = dataset.user2id[user]!
    for item in dataset.items{
        let item_index = dataset.item2id[item]!
        if dataset.neg_sampling[user_index][item_index].scalarized() == 0{
            let input =  Tensor<Int32>(shape: [1, 2],scalars: [Int32(user_index),Int32(item_index)])
            output.append(model(input).scalarized())
            negative_item.append(item)
        }
    }

    let item_score = Dictionary(uniqueKeysWithValues: zip(negative_item,output))
    let sorted_item_score = item_score.sorted {$0.1 > $1.1}
    let top_10 = sorted_item_score.prefix(min(10, Int(item_count[user]!)))

    temp_correct = 0.0
    for (key,_) in top_10{
        if(test_neg_sampling[user_index][dataset.item2id[key]!] == Tensor(1.0)){
          correct = correct + 1.0
          temp_correct = temp_correct + 1.0
        }
        count = count + 1
    }
    print("User:",user, "Accuracy:", temp_correct/Double(top_10.count))
}

User: 1.0 Accuracy: 0.6
User: 2.0 Accuracy: 0.2
User: 3.0 Accuracy: 0.6
User: 4.0 Accuracy: 0.3
User: 5.0 Accuracy: 0.6
User: 6.0 Accuracy: 0.5
User: 7.0 Accuracy: 0.7
User: 8.0 Accuracy: 0.5
User: 9.0 Accuracy: 0.0
User: 10.0 Accuracy: 0.1
User: 11.0 Accuracy: 0.2
User: 12.0 Accuracy: 0.4
User: 13.0 Accuracy: 0.7
User: 14.0 Accuracy: 0.4
User: 15.0 Accuracy: 0.4
User: 16.0 Accuracy: 0.4
User: 17.0 Accuracy: 0.2222222222222222
User: 18.0 Accuracy: 0.6
User: 19.0 Accuracy: 0.1
User: 20.0 Accuracy: 0.3
User: 21.0 Accuracy: 0.3
User: 22.0 Accuracy: 0.4
User: 23.0 Accuracy: 0.4
User: 24.0 Accuracy: 0.1
User: 25.0 Accuracy: 0.3
User: 26.0 Accuracy: 0.8
User: 27.0 Accuracy: 0.2
User: 28.0 Accuracy: 0.4
User: 29.0 Accuracy: 0.0
User: 30.0 Accuracy: 0.1
User: 31.0 Accuracy: 0.1
User: 32.0 Accuracy: 0.1
User: 33.0 Accuracy: 0.4
User: 34.0 Accuracy: 0.3
User: 35.0 Accuracy: 0.125
User: 36.0 Accuracy: 0.0
User: 37.0 Accuracy: 0.4
User: 38.0 Accuracy: 0.2
User: 39.0 Accuracy: 0.2222222222222222
Us

In [71]:
correct/Double(count)

0.3074599116895189
