In [1]:
/** A single line item inside an [[Order]]: which product was bought, how many, and at what price. */
case class OrderLine(
    sku: String,            // product identifier; used as the grouping key in later cells
    productName: String,
    thumbnailImage: String,
    quantity: Double,
    unitPrice: Double,
    totalPrice: Double
)

/** An order as read from Cassandra: its identifiers, date, line items and order total. */
case class Order(
    customerId: java.util.UUID,
    orderId: java.util.UUID,
    date: java.util.Date,
    OrderLines_ : List[OrderLine], // field name kept as-is; presumably tied to the Cassandra column mapping — confirm before renaming
    totalPrice: Double
)

/**
 * One entry in a product's recommendation list.
 * `regular_price` is filled from the order line's unit price when the list is built.
 */
case class RecommendedProduct(
    sku: String,
    product_name: String,
    regular_price: Double,
    thumbnail_image: String
)

/** A product (by SKU) together with the products recommended alongside it. */
case class ProductRecommendations(
    sku: String,
    product_name: String,
    recommended_products: List[RecommendedProduct]
)

/**
 * Aggregated sales figures for one product.
 * `saleCount` holds the summed order-line quantities and `saleValue` the summed
 * order-line totals, as computed in the `soldproducts` cell.
 */
case class Top50SellingProducts(
    sku: String,
    productName: String,
    saleCount: Double,
    saleValue: Double,
    thumbnailImage: String
)



In [2]:
// Load table `orders` from keyspace `retail_ks` as an RDD of Order and cache it,
// because the cells below derive more than one dataset from it.
val orders = sc.cassandraTable[Order]("retail_ks","orders").cache

In [3]:
// Number of orders loaded from Cassandra.
orders.count

Long = 2106

In [4]:
// Flatten every order into its individual lines and key each line by SKU:
// (sku, (productName, thumbnailImage, quantity, unitPrice, totalPrice)).
val orderlines = orders.
    flatMap(_.OrderLines_).
    map { line =>
        val details = (line.productName, line.thumbnailImage, line.quantity, line.unitPrice, line.totalPrice)
        (line.sku, details)
    }

In [5]:
// Peek at the first ten (sku, details) pairs to sanity-check the flattening.
orderlines.take(10)

Array[(String, (String, String, Double, Double, Double))] = Array((4204502,(Insignia™ - 40" Class (40" Diag.) - LED - 1080p - Smart - HDTV Roku TV - Black,http://images.bestbuy.com/BestBuy_US/images/pac/products/1313/1313521268/1313521268_s.gif,73.0,329.99,24089.27)), (1654884,(CorLiving - Full-Motion TV Wall Mount for Most 23" - 42" Flat-Panel TVs - Black,http://images.bestbuy.com/BestBuy_US/images/products/1654/1654884_s.gif,9.0,39.99,359.91)), (5957224,(Cambridge Audio - Topaz CD10 CD Player - Black,http://images.bestbuy.com/BestBuy_US/images/products/5957/5957224_s.gif,9.0,349.99,3149.91)), (2750462,(BIC America - 5-1/4" 2-Way Center-Channel Speaker - Black,http://images.bestbuy.com/BestBuy_US/images/products/2750/2750462_s.gif,9.0,89.99,809.91)), (5009300,(Bose® - Solo Soundb...

In [6]:
// Aggregate all order lines of the same SKU: quantities and total prices are summed,
// while name, thumbnail and unit price are taken from the left-hand record of each merge.
// The unit price is not carried into the resulting Top50SellingProducts row.
val soldproducts = orderlines.
    reduceByKey { (left, right) =>
        val (name, thumb, leftQty, price, leftValue) = left
        val (_, _, rightQty, _, rightValue) = right
        (name, thumb, leftQty + rightQty, price, leftValue + rightValue)
    }.
    map { case (sku, (name, thumb, soldQty, _, soldValue)) =>
        Top50SellingProducts(sku, name, soldQty, soldValue, thumb)
    }

In [7]:
// Keep the 50 products with the highest saleValue: sort descending (by negating the key),
// number the rows, and retain indices 0..49.
// NOTE(review): the name says "Count" but the ranking key is saleValue, not saleCount — confirm intent.
val Top50CountSellingProducts = soldproducts.
    sortBy(product => -product.saleValue).
    zipWithIndex().
    filter(_._2 < 50).
    map(_._1)

In [8]:
// Inspect the top-ranked product.
Top50CountSellingProducts.take(1)

Array[Top50SellingProducts] = Array(Top50SellingProducts(7739048,Samsung - 78" Class (78" Diag.) - LED - Curved - 2160p - Smart - 3D - 4K Ultra HD TV - Black,1190.0,1.18999762E7,http://images.bestbuy.com/BestBuy_US/images/products/7739/7739048_s.gif))

In [9]:
// Persist the top-50 rows to table `top50_selling_products` in keyspace `retail_ks`.
Top50CountSellingProducts.saveToCassandra("retail_ks","top50_selling_products" )

In [52]:
// Build per-product recommendations from co-occurrence within orders.
//  1. For every line of every order, emit (sku, all lines of the same order with a different sku).
//  2. Concatenate those companion lists per sku across all orders.
//  3. Per sku: group companions by their own sku, sum quantity and total price,
//     sort by summed total price (descending) and keep at most 50.
// NOTE(review): product_name of the resulting ProductRecommendations is always the empty string — confirm this is intended.
val ProductCoOccurance = orders.
    flatMap { order =>
        val lines = order.OrderLines_
        lines.map { anchor =>
            // every other product bought in the same order as `anchor`
            val companions = lines.
                map(l => (l.sku, (l.productName, l.thumbnailImage, l.quantity, l.unitPrice, l.totalPrice))).
                filter { case (companionSku, _) => companionSku != anchor.sku }
            (anchor.sku, companions)
        }
    }.
    reduceByKey(_ ++ _). // merge the companion lists of each anchor sku
    mapValues { companions =>
        // one bucket per companion sku
        val perCompanion = companions.groupBy(_._1).values.toList
        // collapse each bucket: sum quantity and total price, keep the first record's descriptive fields
        val summed = perCompanion.map { sameSku =>
            sameSku.reduce { (acc, next) =>
                val (sku, (name, thumb, accQty, price, accValue)) = acc
                val (_, (_, _, nextQty, _, nextValue)) = next
                (sku, (name, thumb, accQty + nextQty, price, accValue + nextValue))
            }
        }
        // highest summed value first, capped at 50 entries
        val best = summed.sortBy { case (_, (_, _, _, _, value)) => -value }.take(50)
        best.map { case (sku, (name, thumb, _, price, _)) =>
            RecommendedProduct(sku, name, price, thumb)
        }
    }.
    map { case (sku, recommendations) => ProductRecommendations(sku, "", recommendations) }

In [11]:
// Create a HiveContext on the existing SparkContext and import its implicits,
// which provide the `.toDF` conversion used on ProductCoOccurance below.
val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)
import sqlContext.implicits._

In [12]:
// Convert the recommendations RDD to a DataFrame and print the schema derived from the case classes.
val pcodf = ProductCoOccurance.toDF()
pcodf.printSchema()

root
 |-- sku: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- recommended_products: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- sku: string (nullable = true)
 |    |    |-- product_name: string (nullable = true)
 |    |    |-- regular_price: double (nullable = false)
 |    |    |-- thumbnail_image: string (nullable = true)



In [13]:
// Write the recommendations DataFrame to retail_ks.product_recommendations through the
// Cassandra data source, using save mode "overwrite".
pcodf.write.
    format("org.apache.spark.sql.cassandra").
    option("table", "product_recommendations").
    option("keyspace", "retail_ks").
    mode("overwrite").
    save()


In [None]:
// Pull every recommendation record to the driver and print it, one per line.
ProductCoOccurance.collect().foreach(record => println(record))

In [None]:
// RDD-based alternative to the DataFrame write: save the recommendations directly
// to table `product_recommendations` in keyspace `retail_ks`.
ProductCoOccurance.saveToCassandra("retail_ks","product_recommendations")

In [None]:
// Read the saved recommendations back as ProductRecommendations rows and collect them to the driver.
// The type argument must match the case class name exactly (Scala identifiers are case-sensitive).
sc.cassandraTable[ProductRecommendations]("retail_ks","product_recommendations").collect

In [53]:
// Load retail_ks.product_recommendations back as a DataFrame through the Cassandra data source.
val pcodf_read = sqlContext.read.
    format("org.apache.spark.sql.cassandra").
    option("table", "product_recommendations").
    option("keyspace", "retail_ks").
    load()


In [54]:
// Number of recommendation rows read back from Cassandra.
pcodf_read.count

Long = 301

In [23]:
// Inspect one recommendation record.
ProductCoOccurance.take(1)

Array[ProductRecommendations] = Array(ProductRecommendations(9300273,,List(RecommendedProduct(4920300,LG - 65" Class (64.5" Diag.) - OLED - 2160p - Smart - 3D - 4K Ultra HD TV - Black,7999.98,http://images.bestbuy.com/BestBuy_US/images/products/4920/4920300_s.gif), RecommendedProduct(5034600,Samsung - 65" Class - (64.5" Diag.) - LED - Curved - 4K SUHD (2160p) - Smart - 4K Ultra HD TV - Black,3399.99,http://images.bestbuy.com/BestBuy_US/images/products/5034/5034600_s.gif), RecommendedProduct(3429111,LG - 65" Class (64.5" Diag.) - LED - 2160p - Smart - 3D - 4K Ultra HD TV - Black,2499.99,http://images.bestbuy.com/BestBuy_US/images/products/3429/3429111_s.gif), RecommendedProduct(5182036,Samsung - 75" Class (74.5" Diag.) - LED - 2160p - Smart - 4K Ultra HD TV - Black,3999.99,http://...

In [32]:
// Print one recommendation record via println, so it is not shortened the way the echoed result above is.
ProductCoOccurance.take(1).foreach(println)

ProductRecommendations(9300273,,List(RecommendedProduct(4920300,LG - 65" Class (64.5" Diag.) - OLED - 2160p - Smart - 3D - 4K Ultra HD TV - Black,7999.98,http://images.bestbuy.com/BestBuy_US/images/products/4920/4920300_s.gif), RecommendedProduct(5034600,Samsung - 65" Class - (64.5" Diag.) - LED - Curved - 4K SUHD (2160p) - Smart - 4K Ultra HD TV - Black,3399.99,http://images.bestbuy.com/BestBuy_US/images/products/5034/5034600_s.gif), RecommendedProduct(3429111,LG - 65" Class (64.5" Diag.) - LED - 2160p - Smart - 3D - 4K Ultra HD TV - Black,2499.99,http://images.bestbuy.com/BestBuy_US/images/products/3429/3429111_s.gif), RecommendedProduct(5182036,Samsung - 75" Class (74.5" Diag.) - LED - 2160p - Smart - 4K Ultra HD TV - Black,3999.99,http://images.bestbuy.com/BestBuy_US/images/pac/products/1312/1312458046/1312458046_s.gif), RecommendedProduct(5035005,Samsung - 55" Class - (54.6" Diag.) - LED - Curved - 4K SUHD (2160p) - Smart - 4K Ultra HD TV - Black,2999.98,http://images.bestbuy.com/

In [48]:
// Print the length of every product's recommendation list; each is capped at 50 by construction.
ProductCoOccurance.
    map(record => record.recommended_products.size).
    collect().
    foreach(size => println(size))

50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
2
50
50
50
50
50
50
50
48
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
3
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
2
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
3
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
50
45
50
50
50
50
50
50
50
50
47
50
50
50
50
50
50
41
50
