In [31]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.{VectorAssembler, MinMaxScaler}
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.sql.expressions.Window
import scala.collection.mutable.ListBuffer

// Obtain the Spark session for this analysis (an existing session is reused if present)
val spark = {
  val sessionBuilder = SparkSession.builder.appName("Stock Analysis")
  sessionBuilder.getOrCreate()
}
import spark.implicits._

// Read the CSV: first row is the header, column types are inferred by Spark
val filePath = "updated_data_with_close_open.csv"
val data = {
  val csvReader = spark.read.option("header", "true").option("inferSchema", "true")
  csvReader.csv(filePath)
}

// Parse 'Date' into a DateType column, then order rows by date
val dataWithDate = {
  val withParsedDate = data.withColumn("Date", to_date(col("Date")))
  withParsedDate.orderBy("Date")
}

// Distinct values of the 'Stock' column, collected to the driver as strings
val stocks = {
  val distinctStocks = dataWithDate.select("Stock").distinct()
  distinctStocks.as[String].collect()
}

// Feature pipeline stages shared by every per-stock model:
// the three input columns are assembled into "features", which the scaler maps to "scaledFeatures"
val featureCols = Array("avg_articles_count", "avg_subjectivity", "Momentum")
val assembler = {
  val stage = new VectorAssembler()
  stage.setInputCols(featureCols).setOutputCol("features")
}
val scaler = {
  val stage = new MinMaxScaler()
  stage.setInputCol("features").setOutputCol("scaledFeatures")
}

// Accumulates one (Stock, MSE, RMSE, R2) tuple per evaluated stock
val stockMetrics = ListBuffer.empty[(String, Double, Double, Double)]

// Train and evaluate one linear-regression model per stock.
// For each stock: assemble the feature vector, index rows by date, split 80/20 in date order,
// fit the scaler and the regression on the first 80%, evaluate on the last 20%,
// and append (stock, MSE, RMSE, R2) to stockMetrics.
stocks.foreach { stock =>
  val stockData = dataWithDate.filter($"Stock" === stock)
  val assembledData = assembler.transform(stockData)

  // Add a row index column following date order.
  // NOTE(review): the window has no partitionBy; it is applied to one stock's rows at a time here.
  val windowSpec = Window.orderBy("Date")
  // Cached because the indexed data is reused by the count and by both split filters below;
  // this also keeps the two filters looking at the same row numbering.
  val dataWithIndex = assembledData.withColumn("index", row_number().over(windowSpec)).cache()

  try {
    // Calculate the split index for an 80-20 split
    val totalRows = dataWithIndex.count()
    val splitIndex = (totalRows * 0.8).toInt

    if (splitIndex < 1 || splitIndex >= totalRows) {
      // Either the training or the test side would be empty, so there is nothing to fit or evaluate
      println(s"Skipping stock $stock: $totalRows row(s) cannot be split into non-empty training and test sets")
    } else {
      // Split the (still unscaled) data into training and test sets based on the index
      val rawTraining = dataWithIndex.filter($"index" <= splitIndex).drop("index")
      val rawTest = dataWithIndex.filter($"index" > splitIndex).drop("index")

      // Fit the scaler on the training rows only, so that statistics of the held-out rows
      // never influence preprocessing; the same fitted model is then applied to both sets.
      val scalerModel = scaler.fit(rawTraining)
      val trainingData = scalerModel.transform(rawTraining)
      val testData = scalerModel.transform(rawTest)

      // Linear Regression on the scaled features, predicting the "Close-Open" column
      val lr = new LinearRegression().setFeaturesCol("scaledFeatures").setLabelCol("Close-Open")
      val model = lr.fit(trainingData)

      // Make predictions on the held-out rows
      val predictions = model.transform(testData)

      // Evaluate the model; RMSE is derived from MSE rather than evaluated separately
      val evaluator = new RegressionEvaluator().setLabelCol("Close-Open")
      val mse = evaluator.setMetricName("mse").evaluate(predictions)
      val rmse = math.sqrt(mse)
      val r2 = evaluator.setMetricName("r2").evaluate(predictions)

      // Store metrics
      stockMetrics += ((stock, mse, rmse, r2))
    }
  } finally {
    // Release the cached per-stock data before moving on to the next stock
    dataWithIndex.unpersist()
  }
}

// Rank the collected metrics from highest to lowest R2 and report one line per stock
val sortedMetrics = stockMetrics.sortBy(metric => metric._4)(Ordering[Double].reverse)
for ((stock, mse, rmse, r2) <- sortedMetrics) {
  println(s"Stock: $stock, MSE: $mse, RMSE: $rmse, R2: $r2")
}

// Shut down the Spark session now that reporting is complete
spark.stop()


Stock: DLR, MSE: 0.12994894251402278, RMSE: 0.36048431659924235, R2: 0.9106539832053797
Stock: O, MSE: 0.015609941406060803, RMSE: 0.12493975110452558, R2: 0.9012438883141239
Stock: LBRDK, MSE: 0.23019201462180486, RMSE: 0.479783299648711, R2: 0.8820932455817201
Stock: SPG, MSE: 0.2469118230498271, RMSE: 0.49690222685134655, R2: 0.8625024337043639
Stock: AMT, MSE: 0.2515872186245875, RMSE: 0.5015847073272743, R2: 0.8508008469755741
Stock: AVB, MSE: 0.24586020155772847, RMSE: 0.4958429202456444, R2: 0.8491493230502711
Stock: JNJ, MSE: 0.15370409730733386, RMSE: 0.39205114118866413, R2: 0.8487700014530455
Stock: EQIX, MSE: 3.58634547393603, RMSE: 1.8937648940499532, R2: 0.8378349682302924
Stock: WELL, MSE: 0.047748813467035564, RMSE: 0.21851501885919777, R2: 0.8308587049881202
Stock: PLD, MSE: 0.048643700151851585, RMSE: 0.22055316853732024, R2: 0.830254577788712
Stock: SPGI, MSE: 0.5568656604292322, RMSE: 0.7462343200558603, R2: 0.8181494947723809
Stock: CVX, MSE: 0.14457457938427898, R

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.{VectorAssembler, MinMaxScaler}
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.sql.expressions.Window
import scala.collection.mutable.ListBuffer
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@a9264c5
import spark.implicits._
filePath: String = updated_data_with_close_open.csv
data: org.apache.spark.sql.DataFrame = [Date: date, Category: string ... 14 more fields]
dataWithDate: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Date: date, Category: string ... 14 more fields]
stocks: Array[String] = Array(AXP, AAPL, BMY, BP, AMGN, ABBV, AMT, AVB, C,...
