https://www.oreilly.com/catalog/errata.csp?isbn=0636920035091
https://github.com/sryza/aas

In [1]:
%%init_spark
launcher.driver_memory = '4g'
// http://localhost:8888/notebooks/Downloads/backUp/Notebooks/AdvancedAnalyticswithSpark/PracNotes/Ch3.ipynb

Intitializing Scala interpreter ...

Spark Web UI available at http://172.16.8.92:4041
SparkContext available as 'sc' (version = 2.4.4, master = local[*], app id = local-1582769049048)
SparkSession available as 'spark'


res0: org.apache.spark.SparkContext = org.apache.spark.SparkContext@79353f55


In [3]:
// Local path to the play-count file; each line holds three space-separated
// integers (parsed later as user ID, artist ID, play count).
val datapath_user = "/Users/soda/Downloads/dataSet/aas/profiledata_06-May-2005/user_artist_data.txt"
// Read the file as a Dataset[String] with one element per line.
val rawUserArtistData = spark.read.textFile(datapath_user)

datapath_user: String = /Users/soda/Downloads/dataSet/aas/profiledata_06-May-2005/user_artist_data.txt
rawUserArtistData: org.apache.spark.sql.Dataset[String] = [value: string]


In [4]:
// Print the first five raw lines to check the file layout.
for (line <- rawUserArtistData.take(5)) println(line)

1000002 1 55
1000002 1000006 33
1000002 1000007 8
1000002 1000009 144
1000002 1000010 314


In [5]:
// Keep only the first two space-separated fields (user ID, artist ID) of each
// line as Int columns; any remaining fields are ignored.
val userArtistDF = rawUserArtistData.map { line =>
  line.split(" ") match {
    case Array(user, artist, _*) => (user.toInt, artist.toInt)
  }
}.toDF("user", "artist")

userArtistDF: org.apache.spark.sql.DataFrame = [user: int, artist: int]


Spark MLlib 的 ALS 算法实现并不严格要求用户和产品的 ID 必须是数值型，不过当 ID 为 32 位非负整数时，效率会更高。使用 Int 表示 ID 是有好处的，但同时意味着 ID 不能超过 Int 的最大值（Int.MaxValue），即 2147483647。我们的数据集是否已经满足了这个要求？
最大的用户 ID 和艺术家 ID 分别是 2443548 和 10794401，而它们的最小值分别是 90 和 1，并没有出现负值。这些远比 2147483647 要小，所以在使用这些 ID 之前，没有必要进行额外的转换。

In [6]:
// Show the smallest and largest user and artist IDs so they can be checked
// against the Int range.
userArtistDF.agg(
  min("user"),
  max("user"),
  min("artist"),
  max("artist")
).show()

+---------+---------+-----------+-----------+
|min(user)|max(user)|min(artist)|max(artist)|
+---------+---------+-----------+-----------+
|       90|  2443548|          1|   10794401|
+---------+---------+-----------+-----------+



In [7]:
// Build the mapping: load the artist file, whose lines are a numeric artist ID
// and an artist name separated by a tab.
val datapath_artist = "/Users/soda/Downloads/dataSet/aas/profiledata_06-May-2005/artist_data.txt"
val rawArtistData = spark.read.textFile(datapath_artist)

datapath_artist: String = /Users/soda/Downloads/dataSet/aas/profiledata_06-May-2005/artist_data.txt
rawArtistData: org.apache.spark.sql.Dataset[String] = [value: string]


In [8]:
// Print the first five artist lines to check the "id<TAB>name" layout.
for (line <- rawArtistData.take(5)) println(line)

1134999	06Crazy Life
6821360	Pang Nakarin
10113088	Terfel, Bartoli- Mozart: Don
10151459	The Flaming Sidebur
6826647	Bodenstandig 3000


In [9]:
// Parse each "id<TAB>name" line into an (id, name) row.
// Lines with no tab (nothing after the ID part) and lines whose ID is not a
// valid Int are dropped instead of failing the job.
val artistByID = rawArtistData.flatMap { line =>
  line.span(_ != '\t') match {
    case (_, rest) if rest.isEmpty => None
    case (rawId, rest) =>
      try {
        // `rest` still starts with the tab separator; trim removes it.
        Some((rawId.toInt, rest.trim))
      } catch {
        case _: NumberFormatException => None
      }
  }
}.toDF("id", "name")

artistByID: org.apache.spark.sql.DataFrame = [id: int, name: string]


In [10]:
// Fetch the first two parsed rows to the driver as a sanity check.
artistByID.head(2)

res4: Array[org.apache.spark.sql.Row] = Array([1134999,06Crazy Life], [6821360,Pang Nakarin])


In [11]:
// Local path to the alias file; each line is a tab-separated pair of artist IDs.
val datapath_alias = "/Users/soda/Downloads/dataSet/aas/profiledata_06-May-2005/artist_alias.txt"
// Read the file as a Dataset[String] with one element per line.
val rawArtistAlias = spark.read.textFile(datapath_alias)

datapath_alias: String = /Users/soda/Downloads/dataSet/aas/profiledata_06-May-2005/artist_alias.txt
rawArtistAlias: org.apache.spark.sql.Dataset[String] = [value: string]


In [13]:
// Print the first five alias lines to check the "id<TAB>id" layout.
for (line <- rawArtistAlias.take(5)) println(line)

1092764	1000311
1095122	1000557
6708070	1007267
10088054	1042317
1195917	1042317


In [14]:
// Build a driver-side Map from an alias artist ID to the ID it stands for.
//
// Each input line is expected to be "artist<TAB>alias". Malformed lines are
// skipped rather than aborting the whole job:
//   - lines that do not split into exactly two fields (e.g. a missing alias,
//     since split drops trailing empty strings) -- previously a MatchError;
//   - lines whose first field is empty (already skipped before);
//   - lines whose fields are not valid Ints -- previously a
//     NumberFormatException; skipping matches how artistByID treats bad IDs.
val artistAlias = rawArtistAlias.flatMap { line =>
  line.split('\t') match {
    case Array(artist, alias) if artist.nonEmpty =>
      try {
        Some((artist.toInt, alias.toInt))
      } catch {
        case _: NumberFormatException => None
      }
    case _ => None
  }
}.collect().toMap

artistAlias: scala.collection.immutable.Map[Int,Int] = Map(1208690 -> 1003926, 2012757 -> 4569, 6949139 -> 1085752, 1109727 -> 1239120, 6772751 -> 1244705, 2070533 -> 1021544, 1157679 -> 2194, 9969617 -> 5630, 2034496 -> 1116214, 6764342 -> 40, 1272489 -> 1278238, 2108744 -> 1009267, 10349857 -> 1000052, 2145319 -> 1020463, 2126338 -> 2717, 10165456 -> 1001169, 6779368 -> 1239506, 10278137 -> 1001523, 9939075 -> 1329390, 2037201 -> 1274155, 1248585 -> 2885, 1106945 -> 1399, 6811322 -> 1019016, 9978396 -> 1784, 6676961 -> 1086433, 2117821 -> 2611, 6863616 -> 1277013, 6895480 -> 1000993, 6831632 -> 1246136, 1001719 -> 1009727, 10135633 -> 4250, 7029291 -> 1034635, 6967939 -> 1002734, 6864694 -> 1017311, 1237279 -> 1029752, 6793956 -> 1283231, 1208609 -> 1000699, 6693428 -> 1100258, 685174...

In [15]:
// Look up the names recorded for an alias ID and the ID it maps to.
artistByID.filter($"id".isin(1208690, 1003926)).show()

+-------+----------------+
|     id|            name|
+-------+----------------+
|1208690|Collective Souls|
|1003926| Collective Soul|
+-------+----------------+



In [16]:
// https://www.jianshu.com/p/c7b240cabec7
// Map.get returns an Option: Some(target ID) when the key is a known alias.
artistAlias.get(1208690)

res7: Option[Int] = Some(1003926)


3.4　构建第一个模型

In [17]:
import org.apache.spark.sql._
import org.apache.spark.broadcast._

/** Parses raw play-count lines into a (user, artist, count) DataFrame.
  *
  * Every line must consist of exactly three space-separated integers
  * (user ID, artist ID, play count). An artist ID that appears as a key in the
  * broadcast alias map is replaced by the ID it maps to; all other artist IDs
  * are kept as they are.
  *
  * @param rawUserArtistData raw text lines, one play-count record per line
  * @param bArtistAlias      broadcast map from alias artist ID to replacement ID
  * @return DataFrame with Int columns "user", "artist" and "count"
  */
def buildCounts(
    rawUserArtistData: Dataset[String],
    bArtistAlias: Broadcast[Map[Int,Int]]): DataFrame = {
  val resolvedCounts = rawUserArtistData.map { line =>
    line.split(" ").map(_.toInt) match {
      case Array(userID, artistID, count) =>
        // Fall back to the original ID when no alias is registered for it.
        (userID, bArtistAlias.value.getOrElse(artistID, artistID), count)
    }
  }
  resolvedCounts.toDF("user", "artist", "count")
}

import org.apache.spark.sql._
import org.apache.spark.broadcast._
buildCounts: (rawUserArtistData: org.apache.spark.sql.Dataset[String], bArtistAlias: org.apache.spark.broadcast.Broadcast[Map[Int,Int]])org.apache.spark.sql.DataFrame


In [18]:
// Broadcast the alias map so the map function inside buildCounts can read it
// through bArtistAlias.value.
val bArtistAlias = spark.sparkContext.broadcast(artistAlias)
// Training set: (user, artist, count) rows with alias artist IDs resolved.
val trainData = buildCounts(rawUserArtistData,bArtistAlias)

bArtistAlias: org.apache.spark.broadcast.Broadcast[scala.collection.immutable.Map[Int,Int]] = Broadcast(17)
trainData: org.apache.spark.sql.DataFrame = [user: int, artist: int ... 1 more field]


In [None]:
import org.apache.spark.ml.recommendation._
import scala.util.Random

// Fit an ALS model on the (user, artist, count) training data, treating the
// counts as implicit preferences. The seed is drawn at random, so repeated
// runs are not seeded identically.
val model = {
  val estimator = new ALS().
    setUserCol("user").
    setItemCol("artist").
    setRatingCol("count").
    setPredictionCol("prediction").
    setImplicitPrefs(true).
    setRank(10).
    setRegParam(0.01).
    setAlpha(1.0).
    setMaxIter(5).
    setSeed(Random.nextLong())
  estimator.fit(trainData)
}

In [None]:
// Evaluate the fitted model so the notebook echoes it.
model