In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import Row
# Application name and master URL passed to the SparkSession builder.
appName = "hive_pyspark"
master = "local"
from pyspark.sql.functions import col, asc,desc, avg
from tqdm import tqdm
from pyspark.sql import functions as F
from pyspark.sql.types import StructType
import pandas as pd

In [None]:
# Create (or reuse) a Hive-enabled Spark session using the configured
# master URL and application name.
spark = (
    SparkSession.builder
    .master(master)
    .appName(appName)
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Switch to the project database and read the full `stock` table.
spark.sql("USE mmop_tesla_project")
df_stock = spark.sql("select * from stock")

# Collapse rows sharing the same `t` into one row holding the average `p`,
# then sort ascending by `t`.
averaged_p = F.avg("p").alias("p")
df = (
    df_stock
    .groupby("t")
    .agg(averaged_p)
    .orderBy(F.col("t").asc())
)
count = df.count()

In [None]:
# Number of rows in the aggregated frame, i.e. one per distinct `t`.
print(count)

In [None]:
# Materialise the aggregated rows on the driver; every Row unpacks as (t, p).
row_list = df.collect()

window_size = 100  # number of consecutive `p` values stored in one sample row
new_label = 20     # number of leading values of the following window used for the label
epsilon = 0.02     # absolute mean difference below which the trend counts as "no change"


def _nan_mean(values):
    """Return the NaN-skipping mean of `values`, or NaN for an empty sequence."""
    return pd.Series(values, dtype="float64").mean()


def _trend_label(current_mean, upcoming_mean, tolerance):
    """Encode how the upcoming mean relates to the current one.

    Returns 1.0 (no change), 0.0 (will decrease) or 2.0 (will increase).
    NaN is returned when no branch applies, which is the case whenever one of
    the means is NaN (for example when no data follows the window).
    """
    if abs(current_mean - upcoming_mean) < tolerance:
        return 1.0
    if current_mean > upcoming_mean:
        return 0.0
    if current_mean < upcoming_mean:
        return 2.0
    return float("nan")


timestamps = [row[0] for row in row_list]
prices = [row[1] for row in row_list]

# Only complete windows become samples; trailing rows are used solely to
# label the last complete window.
n_windows = len(row_list) // window_size

# Collect plain dict records and build the frame once. DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
records = []
for i in tqdm(range(n_windows)):
    start = i * window_size
    stop = start + window_size
    window_prices = prices[start:stop]

    if i + 1 < n_windows:
        # Python slices exclude their end, so this takes exactly `new_label`
        # values (the previous `0 : new_label-1` slice silently used one fewer).
        upcoming_prices = prices[stop:stop + new_label]
    else:
        # Last complete window: compare against all remaining rows, if any.
        upcoming_prices = prices[stop:]

    # Feature columns are keyed 0..window_size-1, followed by the metadata columns.
    record = dict(enumerate(window_prices))
    record["label"] = _trend_label(
        # The mean covers the whole window (the previous `0 : window_size-1`
        # slice dropped the last value).
        _nan_mean(window_prices),
        _nan_mean(upcoming_prices),
        epsilon,
    )
    record["start_timestamp"] = timestamps[start]
    record["stop_timestamp"] = timestamps[stop - 1]
    records.append(record)

# Passing the columns explicitly keeps the layout stable even when there is
# not a single complete window (empty frame instead of a KeyError).
output_columns = list(range(window_size)) + ["label", "start_timestamp", "stop_timestamp"]
output_df = pd.DataFrame(records, columns=output_columns)

In [None]:
# Sequential row identifier (0 .. n-1); used further down as the join key
# against the train/test index files.
output_df["id"] = range(len(output_df))

## Save dataset

#### Save the dataset used for posts (window timestamps and labels)

In [None]:
# Export the window boundaries and labels.
# output_df has no "index" column (the row identifier is stored as "id"), so
# selecting "index" raised a KeyError. Select "id" and publish it under the
# header "index" that this export originally asked for.
# NOTE(review): "index" is presumed to be the header downstream readers expect — confirm.
twitter_columns = ["start_timestamp", "stop_timestamp", "label", "id"]
tmp = spark.createDataFrame(output_df[twitter_columns].rename(columns={"id": "index"}))
tmp.write.option("header","true").csv("hdfs://cluster-a0d6-m/user/mmop/twitter_input.csv")

### Save stock dataset for further use

In [None]:
# Persist the model input: every column except the window timestamps,
# written as CSV with a header row.
stock_features_pdf = output_df.drop(["start_timestamp", "stop_timestamp"], axis=1)
stock_features_sdf = spark.createDataFrame(stock_features_pdf)
stock_features_sdf.write.option("header","true").csv("hdfs://cluster-a0d6-m/user/mmop/stock_dataset.csv")

### Prepare dataset

In [None]:
# Read the stock dataset back from CSV; the first line is the header and
# column types are inferred from the data.
stock_dataset = spark.read.csv(
    "hdfs://cluster-a0d6-m/user/mmop/stock_dataset.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# The train/test split is supplied as two CSV files whose "index" column
# lists the sample ids belonging to each split.
trainingData_tmp = spark.read.csv(
    "hdfs://cluster-a0d6-m/user/mmop/train_indices.csv",
    header=True,
    inferSchema=True,
)
testData_tmp = spark.read.csv(
    "hdfs://cluster-a0d6-m/user/mmop/test_indices.csv",
    header=True,
    inferSchema=True,
)

# Keep the stock rows whose id appears in the respective index file. The
# joined frames carry both the "id" and the "index" column.
in_training_split = stock_dataset["id"] == trainingData_tmp["index"]
in_test_split = stock_dataset["id"] == testData_tmp["index"]
trainingData = stock_dataset.join(trainingData_tmp, in_training_split)
testData = stock_dataset.join(testData_tmp, in_test_split)

# Train model

In [None]:
from pyspark.ml import Pipeline

from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.classification import OneVsRest, OneVsRestModel
from pyspark.ml.classification import LinearSVC

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer, VectorIndexer, IndexToString

### Parse dataset

In [None]:
def parse_dataset(dataset):
    """Assemble the feature columns of ``dataset`` into one vector column.

    Every column except "index", "label" and "id" is used as a feature. All
    three of these columns have to be present: each one is removed with
    ``remove``, which fails for a name that is missing.

    Returns the transformed frame reduced to the "features" and "label" columns.
    """
    feature_names = dataset.columns
    for excluded in ("index", "label", "id"):
        feature_names.remove(excluded)

    assembler = VectorAssembler(inputCols=feature_names, outputCol='features')
    assembled = assembler.transform(dataset)
    return assembled.select(['features', 'label'])

In [None]:
# Replace both joined splits with their assembled (features, label) form.
# Running this cell a second time fails: the parsed frames no longer contain
# the "index"/"id" columns that parse_dataset removes.
trainingData = parse_dataset(dataset=trainingData)
testData = parse_dataset(dataset=testData)

In [None]:
# Fit a StringIndexer that maps the "label" column to "indexedLabel".
# It is fitted on the training split only.
label_indexer_stage = StringIndexer(inputCol="label", outputCol="indexedLabel")
labelIndexer = label_indexer_stage.fit(trainingData)

In [None]:
# Fit a StringIndexer that maps the "label" column to "indexedLabel"
# (fitted on the training split only).
label_indexer_stage = StringIndexer(inputCol="label", outputCol="indexedLabel")
labelIndexer = label_indexer_stage.fit(trainingData)

# Fit a VectorIndexer that identifies categorical features in "features" and
# writes the indexed vector to "indexedFeatures". maxCategories is not passed
# here, so the library's default threshold applies.
feature_indexer_stage = VectorIndexer(inputCol="features", outputCol="indexedFeatures")
featureIndexer = feature_indexer_stage.fit(trainingData)

### RandomForest

In [None]:
# Storage location where the trained random-forest model is saved and later reloaded.
ml_models_path = "gs://dataproc-staging-europe-west4-375495060785-ncrgfyir/notebooks/jupyter/ml_models/rf_stock_model"

In [None]:
# Train a RandomForest model directly on the assembled "features" / "label"
# columns (no indexer stages take part in this fit).
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100)
model = rf.fit(trainingData)

# Overwrite a previously saved model: a plain save() raises when the target
# path already exists, which stopped this cell on every run after the first.
model.write().overwrite().save(ml_models_path)

# Make predictions.
predictions = model.transform(testData)

# Score the predictions against the true labels using the F1 metric.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1")
accuracy = evaluator.evaluate(predictions)  # holds the F1 score despite its name
print("f1 = %g" % (accuracy))

In [None]:
# Reload the persisted random-forest model from ml_models_path.
rf2 = RandomForestClassificationModel.load(ml_models_path)

In [None]:
# Score the test split with the reloaded model.
predictions = rf2.transform(testData)

# Evaluate the F1 metric of the predictions against the true labels.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    predictionCol="prediction",
    labelCol="label",
)
accuracy = evaluator.evaluate(predictions)  # F1 score
print("f1 = %g" % accuracy)

# One vs all

### RandomForest

In [None]:
# One-vs-Rest reduction around the random-forest estimator `rf` created in
# the RandomForest training cell.
ovr = OneVsRest(classifier=rf)

# Fit the multiclass model on the training split.
ovrModel = ovr.fit(trainingData)

# Predict the test split.
predictions = ovrModel.transform(testData)

# F1 evaluator using the default label and prediction column names.
evaluator = MulticlassClassificationEvaluator(metricName="f1")

# Report the F1 score on the test split.
accuracy = evaluator.evaluate(predictions)  # F1 score
print("f1 = %g" % accuracy)

### LinearSVC

In [None]:
# Linear SVM base classifier (10 iterations, regularisation 0.1) wrapped in
# a One-vs-Rest reduction to handle the three label classes.
lsvc = LinearSVC(regParam=0.1, maxIter=10)
ovr = OneVsRest(classifier=lsvc)

# Fit the multiclass model on the training split.
ovrModel = ovr.fit(trainingData)

# Predict the test split.
predictions = ovrModel.transform(testData)

# F1 evaluator using the default label and prediction column names.
evaluator = MulticlassClassificationEvaluator(metricName="f1")

# Report the F1 score on the test split.
accuracy = evaluator.evaluate(predictions)  # F1 score
print("f1 = %g" % accuracy)

### GradientBoosting

In [None]:
# Gradient-boosted trees base classifier wrapped in a One-vs-Rest reduction.
# NOTE(review): the GBT names "indexedLabel" / "indexedFeatures", but
# trainingData (as returned by parse_dataset) only has "features" and "label"
# and the fitted indexers are never applied. OneVsRest appears to substitute
# its own label/feature columns when fitting the base classifier — confirm
# that these two settings are effectively ignored.
gbt = GBTClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    maxIter=10,
)
ovr = OneVsRest(classifier=gbt)

# Fit the multiclass model on the training split.
ovrModel = ovr.fit(trainingData)

# Predict the test split.
predictions = ovrModel.transform(testData)

# F1 evaluator using the default label and prediction column names.
evaluator = MulticlassClassificationEvaluator(metricName="f1")

# Report the F1 score on the test split.
accuracy = evaluator.evaluate(predictions)  # F1 score
print("f1 = %g" % accuracy)

## Sklearn test (not used in solution)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# In-memory copy of the dataset without the window timestamp columns.
sklearn_data = output_df.drop(columns=["start_timestamp", "stop_timestamp"])

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; random_state is fixed for the split.
# NOTE(review): the "id" column is still part of the feature matrix here.
sklearn_features = sklearn_data.drop("label", axis=1)
sklearn_labels = sklearn_data["label"]
X_train, X_test, y_train, y_test = train_test_split(
    sklearn_features,
    sklearn_labels,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Fit a gradient-boosting classifier with default settings, then display its
# score on the held-out rows.
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

# NOTE: the import in this cell rebinds RandomForestClassifier to the
# scikit-learn class, shadowing the PySpark classifier imported earlier in
# the notebook; re-running the Spark training cells afterwards picks up the
# wrong class.
clf = RandomForestClassifier(random_state=0, max_depth=2)
clf.fit(X_train, y_train)

# Accuracy of the shallow forest on the held-out rows.
test_predictions = clf.predict(X_test)
print(accuracy_score(y_test, test_predictions))