# Analytical Goal: Predict the daily stock price from the number of tweets posted on a given date (tweet volume) -- TINA

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# Build (or reuse) the notebook's Spark session. The MongoDB Spark connector
# is requested through spark.jars.packages so the "mongo" data source used
# further down can be resolved.
spark = (
    SparkSession.builder
    .appName("day6")
    .config(
        "spark.jars.packages",
        "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1",
    )
    .getOrCreate()
)

# Start from a clean slate: drop anything still cached in this session.
spark.catalog.clearCache()

#GET DF FROM MONGODB

In [0]:
# Load the tweets collection from MongoDB into a Spark DataFrame.
#
# The password is deliberately NOT stored in the notebook: export
# MONGODB_PASSWORD (and optionally MONGODB_USER) before running this cell.
import os
from urllib.parse import quote_plus

database = 'MSDS697'
collection = 'Tweets'
address = 'msds697.us6ly.mongodb.net'  # cluster address copied from MongoDB

user_name = os.environ.get('MONGODB_USER', 'qliu46')
password = os.environ.get('MONGODB_PASSWORD')
if not password:
    raise RuntimeError(
        "Set the MONGODB_PASSWORD environment variable before loading the "
        "Tweets collection; credentials must not be hard-coded in the notebook."
    )

# Percent-encode the credentials so characters such as '@', ':' or '/' in a
# user name or password cannot corrupt the connection URI.
connection_string = (
    f"mongodb+srv://{quote_plus(user_name)}:{quote_plus(password)}"
    f"@{address}/{database}.{collection}"
)

# Rows containing a null in any column are dropped immediately after loading.
df = (
    spark.read.format("mongo")
    .option("uri", connection_string)
    .load()
    .dropna(how='any')
)

# DATA PREPROCESSING

In [0]:
df_new = df.distinct().groupBy('Adj Close').count()

In [0]:
df_new.count()

In [0]:
# Build the modelling table in a single projection instead of a chain of
# withColumn / drop / rename steps (each of which adds its own projection to
# the query plan). The resulting columns, in order, are:
#   count_      - the per-group row count cast to double (the feature)
#   Adj Close_  - 'Adj Close' cast to double and multiplied by 1000 (the label)
# NOTE(review): the reason for the x1000 scaling is not stated anywhere in the
# notebook; it is preserved as-is.
df_new = df_new.select(
    df_new["count"].cast("double").alias("count_"),
    (df_new["Adj Close"].cast("double") * 1000).alias("Adj Close_"),
)

# Create Training and Test data

In [0]:
# Random 80/20 split with a fixed seed (1). Both parts are marked for caching;
# nothing is materialised until an action runs on them.
indexes = df_new.randomSplit(weights=[0.8, 0.2], seed=1)
df_train, df_test = (part.cache() for part in indexes)

# Create dataframe with a feature vector and label

In [0]:
from pyspark.ml.feature import VectorAssembler

# The only predictor is the row count per 'Adj Close' group.
input_ = ['count_']
va = VectorAssembler(inputCols=input_, outputCol="features")

# Release the cache marks on the raw splits before replacing them with the
# assembled versions.
df_train.unpersist(blocking=False)
df_test.unpersist(blocking=False)

# For each split: assemble the feature vector, keep only features + target,
# expose the target under the conventional name "label", and cache the result.
df_train, df_test = (
    va.transform(split)
    .select("features", "Adj Close_")
    .withColumnRenamed("Adj Close_", "label")
    .cache()
    for split in (df_train, df_test)
)

#Create a LinearRegression model and train the model using training Dataset

In [0]:
from pyspark.ml.regression import LinearRegression

# Linear regression estimator with default hyper-parameters. A regularised
# variant (maxIter=10, regParam=0.3, elasticNetParam=0.8) is left disabled.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
)

In [0]:
%%time
# Fit the linear regression on the training split. The %%time cell magic must
# stay on the first line of the cell; it reports how long the fit takes.
lr_model = lr.fit(df_train)

### Evaluate the model

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the held-out test split with the fitted linear model and report R^2.
lr_predictions = lr_model.transform(df_test)
lr_evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="r2",
)
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

#Create a Random Forest model and train the model using training Dataset

In [0]:
import numpy as np
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder

# Random forest regressor wrapped in a single-stage pipeline.
rf = RandomForestRegressor(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[rf])

# 3 x 3 hyper-parameter grid: three evenly spaced values of numTrees between
# 10 and 50, and three evenly spaced values of maxDepth between 5 and 25.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, list(map(int, np.linspace(start=10, stop=50, num=3))))
    .addGrid(rf.maxDepth, list(map(int, np.linspace(start=5, stop=25, num=3))))
    .build()
)

# Candidates are compared by R^2 using 5-fold cross-validation.
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="r2",
)
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
)


In [0]:
%%time
# Run the cross-validated grid search over the random-forest pipeline on the
# training split. %%time must remain the first line of the cell.
cvmodel = cv.fit(df_train)

### Evaluate the Model

In [0]:
# Apply the best model selected by cross-validation to the held-out test split.
prediction = cvmodel.bestModel.transform(df_test)

In [0]:
# NOTE(review): MulticlassClassificationEvaluator is not used in this cell
# (this is a regression task); the import is kept only so that nothing else
# relying on the name breaks.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Import the evaluator this cell actually uses so it does not depend on an
# earlier cell having been executed first.
from pyspark.ml.evaluation import RegressionEvaluator

# R^2 of the cross-validated random forest on the test split.
metrics = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="r2",
)

# Left as the final bare expression so the notebook displays the score.
metrics.evaluate(prediction)