In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import Row
# appName= "hive_pyspark"
# master= "local"
import matplotlib.pyplot as plt
from textblob import TextBlob

import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()
import numpy as np
from pyspark.sql.functions import udf,col, lower
from pyspark.sql.types import FloatType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.mllib.tree import RandomForestModel, RandomForest
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import col, greatest
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, FloatType, StructField, StringType
from operator import itemgetter
from pyspark.sql.types import StructType

## POSTS

In [None]:
# SOURCE never creates `spark`; presumably the kernel injects it. getOrCreate()
# hands back the already-running session when there is one and builds one
# otherwise, so this cell also works after Restart & Run All on a kernel
# that does not predefine `spark`.
spark = SparkSession.builder.getOrCreate()

df_post_1_spark = spark.read.csv(
    "hdfs://cluster-a0d6-m/user/mmop/df_posts_to_use.csv", header=True, inferSchema=True)
testingIdx = spark.read.csv(
    "hdfs://cluster-a0d6-m/user/mmop/test_indices.csv", header=True, inferSchema=True)

# Inner join: keep only the posts rows whose `index` is listed in test_indices.csv.
# NOTE(review): a join gives no row-order guarantee and `index` is dropped below,
# while predict_label later pairs posts and stock rows by position — confirm the
# two test sets really come out in the same order.
testingData_post = df_post_1_spark.join(testingIdx, df_post_1_spark["index"] == testingIdx["index"])

# "mean_neutal" is kept verbatim; presumably it mirrors the CSV header — confirm
# against the file before "fixing" the spelling.
vectorAssembler = VectorAssembler(
    inputCols=["nr_neg", "nr_pos", "nr_neutral", "mean_neg", "mean_pos", "mean_neutal"],
    outputCol='features')

# Only the assembled feature vector and the label are needed for scoring.
post_final_test = vectorAssembler.transform(testingData_post).select(['features', 'label'])


In [None]:
# Score the posts test set with the previously trained random forest.
posts_model = RandomForestClassificationModel.load("gs://dataproc-staging-europe-west4-375495060785-ncrgfyir/notebooks/jupyter/ml_models/rf_posts_model")
predictions_posts = posts_model.transform(post_final_test)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(predictions_posts)
# `accuracy` is reassigned when the stock model is evaluated further down, so
# keep the posts score under its own name and show it instead of discarding it.
accuracy_posts = accuracy
print("Posts model test accuracy:", accuracy_posts)

# Per-class probability vectors, consumed later by predict_label.
probab_posts = predictions_posts.select('probability')

In [None]:
# Collect the posts predictions to the driver and write them out as CSV.
# The pandas row index is written as the first column (to_csv default).
posts_labels_path = "gs://dataproc-staging-europe-west4-375495060785-ncrgfyir/notebooks/jupyter/posts_labels.csv"
posts_predicted_labels = predictions_posts.select("prediction").toPandas()
posts_predicted_labels.to_csv(posts_labels_path)


## STOCK

In [None]:
# Read the stock dataset and the index file (presumably the rows of the test
# split — the file name suggests so).
stock_dataset = spark.read.csv(
    "hdfs://cluster-a0d6-m/user/mmop/stock_dataset.csv",
    header=True,
    inferSchema=True,
)
testData_tmp = spark.read.csv(
    "hdfs://cluster-a0d6-m/user/mmop/test_indices.csv",
    header=True,
    inferSchema=True,
)

# Inner join: keep the stock rows whose `id` appears as an `index` in the index file.
stock_id_in_test_indices = stock_dataset["id"] == testData_tmp["index"]
testData = stock_dataset.join(testData_tmp, stock_id_in_test_indices)

In [None]:
def parse_dataset(dataset, exclude=("index", "label", "id")):
    """Assemble the feature columns of ``dataset`` into one vector column.

    Every column whose name is not listed in ``exclude`` is treated as a model
    input and packed into a ``features`` column by a ``VectorAssembler``.
    Nothing is loaded here; ``dataset`` must already be a DataFrame.

    Args:
        dataset: DataFrame containing the input columns and a ``label`` column.
        exclude: Names of columns that must not be used as features. Defaults
            to the bookkeeping columns ``index``, ``label`` and ``id``.

    Returns:
        A DataFrame with exactly two columns: ``features`` and ``label``.
    """
    excluded = set(exclude)
    # Filter rather than list.remove(): remove() drops only the first match, so
    # a key column that appears twice after a join would leak into the features.
    feature_columns = [name for name in dataset.columns if name not in excluded]

    assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
    return assembler.transform(dataset).select(['features', 'label'])

In [None]:
# Turn the joined stock test rows into (features, label) and score them with
# the previously trained random forest.
testDataStock = parse_dataset(testData)
ml_models_path = "gs://dataproc-staging-europe-west4-375495060785-ncrgfyir/notebooks/jupyter/ml_models/rf_stock_model"
rf2 = RandomForestClassificationModel.load(ml_models_path)

predictions_stock = rf2.transform(testDataStock)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions_stock)
# `accuracy` is also assigned by the posts evaluation cell; keep the stock
# score under its own name and show it instead of silently discarding it.
accuracy_stock = accuracy
print("Stock model test accuracy:", accuracy_stock)

# Per-class probability vectors, consumed later by predict_label.
probab_stock = predictions_stock.select('probability')

In [None]:
# Collect the stock predictions to the driver and write them out as CSV.
# The pandas row index is written as the first column (to_csv default).
stock_labels_path = "gs://dataproc-staging-europe-west4-375495060785-ncrgfyir/notebooks/jupyter/stock_labels.csv"
stock_predicted_labels = predictions_stock.select("prediction").toPandas()
stock_predicted_labels.to_csv(stock_labels_path)


In [None]:
def _with_row_number(probabilities, array_name, view_name, num_classes):
    """Expand the probability vector into per-class columns and number the rows.

    The returned frame has the columns ``num`` (1-based row number), one
    ``<array_name>[i]`` column per class, and ``id``. The frame holding the
    per-class columns and ``id`` is also registered as temp view ``view_name``.
    """
    per_class = probabilities.withColumn(
        array_name, vector_to_array("probability")
    ).select([col(array_name)[i] for i in range(num_classes)])
    per_class = per_class.withColumn("id", monotonically_increasing_id())
    per_class.createOrReplaceTempView(view_name)
    # `id` is deliberately unquoted: Spark SQL reads "id" in double quotes as a
    # string literal, which orders by a constant and leaves row_number() arbitrary.
    return spark.sql(
        'select row_number() over (order by id) as num, * from ' + view_name)


def predict_label(probab_stock, probab_posts, num_classes=3):
    """Combine the stock and posts class probabilities into one label per row.

    For every row the two models' probabilities are added class by class and
    the class with the largest sum wins (the lowest class index on a tie).

    Rows of the two inputs are paired by position: each frame gets a
    monotonically increasing id, that id is turned into a consecutive row
    number, and the frames are joined on the row number.
    NOTE(review): this assumes both inputs hold the same test rows in the same
    order; nothing here verifies it — confirm upstream.

    Args:
        probab_stock: DataFrame with a ``probability`` vector column from the
            stock model.
        probab_posts: DataFrame with a ``probability`` vector column from the
            posts model.
        num_classes: Number of entries read from each probability vector.

    Returns:
        DataFrame with a single string column ``label`` holding the winning
        class index ("0", "1", ...).

    Side effects:
        Registers (or replaces) the temp views ``proba_stock`` and
        ``proba_posts`` and relies on the module-level ``spark`` session.
    """
    proba_stock = _with_row_number(probab_stock, "prob_label_s", "proba_stock", num_classes)
    proba_posts = _with_row_number(probab_posts, "prob_label_p", "proba_posts", num_classes)

    proba_all = proba_stock.join(proba_posts, proba_stock["num"] == proba_posts["num"])

    # One column per class, named after the class index, holding the summed probability.
    class_names = [str(i) for i in range(num_classes)]
    for i, class_name in enumerate(class_names):
        proba_all = proba_all.withColumn(
            class_name,
            col("prob_label_s[{}]".format(i)) + col("prob_label_p[{}]".format(i)))
    proba_all_labels = proba_all.select(*class_names)

    schema = StructType([StructField('maxval', FloatType()), StructField('label', StringType())])
    # The UDF receives one (summed probability, class name) pair per class and
    # returns the pair with the largest probability; max() keeps the first on ties.
    maxcol = F.udf(lambda row: max(row, key=itemgetter(0)), schema)
    candidates = F.struct(
        [F.struct(proba_all_labels[name], F.lit(name)) for name in class_names])

    return proba_all_labels.withColumn('maxfield', maxcol(candidates)).select('maxfield.label')

In [None]:
# One combined label per test row, derived from both models' class probabilities.
labels = predict_label(probab_stock=probab_stock, probab_posts=probab_posts)

In [None]:
# Collect the combined labels to the driver and store them without the pandas index.
labels_csv_path = "gs://dataproc-staging-europe-west4-375495060785-ncrgfyir/notebooks/jupyter/labels.csv"
l = labels.toPandas()
l.to_csv(labels_csv_path, index=False)

In [None]:
# `proba_posts` / `proba_stock` only ever exist as locals inside predict_label,
# so referencing them here raises NameError on a fresh kernel. Export the
# module-level probability frames instead, expanded to one column per class so
# the CSV holds plain numbers rather than the string form of a vector.
# NOTE(review): the 3 classes and the column names mirror what predict_label
# builds — confirm this is the layout downstream readers expect.
_probability_exports = (
    (probab_posts, "prob_label_p",
     "gs://dataproc-staging-europe-west4-375495060785-ncrgfyir/notebooks/jupyter/proba_posts.csv"),
    (probab_stock, "prob_label_s",
     "gs://dataproc-staging-europe-west4-375495060785-ncrgfyir/notebooks/jupyter/proba_stock.csv"),
)
for _frame, _array_name, _csv_path in _probability_exports:
    _per_class = _frame.withColumn(
        _array_name, vector_to_array("probability")
    ).select([col(_array_name)[i] for i in range(3)])
    _per_class.toPandas().to_csv(_csv_path, index=False)