## Basic Data Ingestion, Aggregations, and Linear Models

In [1]:
library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib")))


Attaching package: 'SparkR'

The following objects are masked from 'package:stats':

    cov, filter, lag, na.omit, predict, sd, var

The following objects are masked from 'package:base':

    colnames, colnames<-, intersect, rank, rbind, sample, subset,
    summary, table, transform



### Initialize SparkContext, SQLContext, and HiveContext

In [2]:
sc <- sparkR.init(sparkJars="/usr/share/java/mysql-connector-java.jar", 
  sparkPackages="com.databricks:spark-csv_2.10:1.4.0")
sqlContext <- sparkRSQL.init(sc)
hiveContext <- sparkRHive.init(sc)

Launching java with spark-submit command /root/spark-1.6.1-bin-fluxcapacitor/bin/spark-submit --jars /usr/share/java/mysql-connector-java-5.1.28.jar --packages com.databricks:spark-csv_2.10:1.4.0 sparkr-shell /tmp/RtmpScARtp/backend_port4b7c16ad5660 


### Read Movie Ratings CSV

In [3]:
movieRatingsCsvDF <- read.df(sqlContext, 
  "/root/pipeline/datasets/movielens/ml-latest/ratings.csv", 
  "com.databricks.spark.csv", header="true") 
head(movieRatingsCsvDF)

Unnamed: 0,userId,movieId,rating,timestamp
1,1,50,4.0,1329753504
2,1,296,4.0,1329753602
3,1,318,4.5,1329753494
4,1,527,4.5,1329753507
5,1,541,3.0,1329753607
6,1,608,4.0,1329753638


### Read Movie Ratings From Hive

In [4]:
movieRatingsHiveDF <- sql(hiveContext, "SELECT * FROM movie_ratings")
head(results)

ERROR: Error in head(results): object 'results' not found


### Show Only Ratings == 5

In [5]:
head(filter(movieRatingsHiveDF, movieRatingsHiveDF$rating == 5))

Unnamed: 0,userId,movieId,rating,timestamp
1,1,3897,5,1329753716
2,1,4783,5,1329754027
3,1,4979,5,1329753751
4,1,4995,5,1329753888
5,1,6380,5,1329753988
6,1,7361,5,1329753448


### Aggregate and Count By UserId

In [6]:
userIdCounts <- 
  summarize(groupBy(movieRatingsHiveDF, movieRatingsHiveDF$userId), 
  count = n(movieRatingsHiveDF$userId))
head(arrange(userIdCounts, desc(userIdCounts$count)))

Unnamed: 0,userId,count
1,92302,9269
2,142788,7515
3,8452,6779
4,165005,5679
5,221498,5644
6,216632,5601


### Train Linear Regression Model

In [7]:
linearRegressionModel <- glm(rating ~ userId + movieId, 
  data = movieRatingsHiveDF, family = "gaussian")

### Predict Using Trained Linear Regression Model

In [8]:
predictionsDF <- predict(linearRegressionModel, movieRatingsHiveDF)

### Calculate Errors

In [9]:
errorsDF <- select(
    predictionsDF, predictionsDF$label, predictionsDF$prediction, 
    predictionsDF$userId, predictionsDF$movieId, 
    alias(predictionsDF$label - predictionsDF$prediction, "error"))
head(errorsDF)

Unnamed: 0,label,prediction,userId,movieId,error
1,4.0,3.522174,1,50,0.4778256
2,4.0,3.522209,1,296,0.4777909
3,4.5,3.522212,1,318,0.9777878
4,4.5,3.522242,1,527,0.9777584
5,3.0,3.522244,1,541,-0.5222436
6,4.0,3.522253,1,608,0.4777469
