## Basic Data Ingestion, Aggregations, and Linear Models

In [None]:
library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib")))

### Initialize SparkContext, SQLContext, and HiveContext

In [None]:
# Jar and package handed to Spark at start-up: the MySQL connector jar and
# the spark-csv data source (used below as "com.databricks.spark.csv").
mysqlConnectorJar <- "/usr/share/java/mysql-connector-java.jar"
sparkCsvPackage <- "com.databricks:spark-csv_2.10:1.4.0"

# Start the Spark context first; both SQL entry points are built on top of it.
sc <- sparkR.init(
  sparkJars = mysqlConnectorJar,
  sparkPackages = sparkCsvPackage
)

# One context for plain Spark SQL, one for queries against Hive tables.
sqlContext <- sparkRSQL.init(sc)
hiveContext <- sparkRHive.init(sc)

### Read Movie Ratings CSV

In [None]:
# Load the ratings CSV through the spark-csv data source; header = "true"
# is forwarded to that data source as an option.
movieRatingsCsvDF <- read.df(
  sqlContext,
  path = "/root/pipeline/datasets/movielens/ml-latest/ratings.csv",
  source = "com.databricks.spark.csv",
  header = "true"
)

# Preview the first rows of the loaded DataFrame.
head(movieRatingsCsvDF)

### Read Movie Ratings From Hive

In [None]:
# Query every column of the Hive table `movie_ratings` into a SparkR
# DataFrame via the Hive-enabled context.
movieRatingsHiveDF <- sql(hiveContext, "SELECT * FROM movie_ratings")

# Preview the first rows of the DataFrame that was just loaded.
head(movieRatingsHiveDF)

### Show Only Ratings == 5

In [None]:
# Keep only the rows whose rating equals 5, then preview the first few.
fiveStarRatingsDF <- filter(
  movieRatingsHiveDF,
  movieRatingsHiveDF$rating == 5
)
head(fiveStarRatingsDF)

### Aggregate and Count By UserId

In [None]:
# Group the ratings by userId and count the rows in each group.
ratingsByUser <- groupBy(movieRatingsHiveDF, movieRatingsHiveDF$userId)
userIdCounts <- summarize(
  ratingsByUser,
  count = n(movieRatingsHiveDF$userId)
)

# Preview the groups with the largest counts first.
userIdCountsDescending <- arrange(userIdCounts, desc(userIdCounts$count))
head(userIdCountsDescending)

### Train Linear Regression Model

In [None]:
# Fit a gaussian-family GLM with rating as the response and userId and
# movieId as the predictors, using the Hive-backed ratings DataFrame.
linearRegressionModel <- glm(
  formula = rating ~ userId + movieId,
  family = "gaussian",
  data = movieRatingsHiveDF
)

### Predict Using Trained Linear Regression Model

In [None]:
# Apply the fitted model to the same DataFrame it was trained on.
predictionsDF <- predict(
  object = linearRegressionModel,
  newData = movieRatingsHiveDF
)

### Calculate Errors

In [None]:
# Per-row error column: label minus prediction, named "error".
errorColumn <- alias(
  predictionsDF$label - predictionsDF$prediction,
  "error"
)

# Keep the label, the prediction, the two id columns and the error.
errorsDF <- select(
  predictionsDF,
  predictionsDF$label,
  predictionsDF$prediction,
  predictionsDF$userId,
  predictionsDF$movieId,
  errorColumn
)
head(errorsDF)