In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import Row
# appName= "hive_pyspark"
# master= "local"
import matplotlib.pyplot as plt
from textblob import TextBlob

import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()
import numpy as np
from pyspark.sql.functions import udf,col, lower
from pyspark.sql.types import FloatType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.mllib.tree import RandomForestModel, RandomForest
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import OneVsRest, OneVsRestModel
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.classification import OneVsRest, OneVsRestModel
from pyspark.ml.classification import LinearSVC

In [None]:
# Reuse the kernel's Spark session when one exists, otherwise start one, so
# this cell also runs on a fresh kernel that does not pre-define `spark`.
spark = SparkSession.builder.getOrCreate()

# Location of the parquet post dumps; "basePath" is passed to the reader as an
# option alongside the glob of files to load.
basePath = 'hdfs://cluster-a0d6-m/user/mmop/posts/'
paths = ['hdfs://cluster-a0d6-m/user/mmop/posts/*']
parquetFile = spark.read.option("basePath", basePath).parquet(*paths)

# Materialise every post as a pandas frame (sentiment scoring below is done
# with pandas/TextBlob, not with Spark).
df = parquetFile.toPandas()

# Stock windows (Spark DataFrame) read from CSV with header and inferred types.
stocks = spark.read.csv(
    "hdfs://cluster-a0d6-m/user/mmop/twitter_input.csv",
    header=True,
    inferSchema=True,
)

In [None]:
def calculate_sentiment(sentence):
    """Return the TextBlob polarity score of ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to score.

    Returns
    -------
    float
        ``TextBlob(sentence).sentiment.polarity``. When ``sentence`` is not a
        string (for example ``None`` or ``NaN`` for a missing text) ``nan`` is
        returned instead of letting TextBlob fail on the non-text value.
    """
    # Missing texts reach this function as None/NaN; give them a missing score
    # so that one bad row does not abort the scoring of the whole frame.
    if not isinstance(sentence, str):
        return float("nan")
    return TextBlob(sentence).sentiment.polarity

 
def apply_sentiment(df):
    """Score every distinct post and return the columns used downstream.

    Parameters
    ----------
    df : pandas.DataFrame
        Posts with at least the columns ``id``, ``created_at`` and ``text``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``id``, ``created_at`` and ``sentiment``
        (the result of ``calculate_sentiment`` on ``text``), one row per
        distinct input row. ``df`` itself is left unmodified.
    """
    # Drop exact duplicates before scoring: the score depends only on the
    # row's text, so the surviving rows are the same and no text is scored twice.
    unique_posts = df.drop_duplicates()

    # progress_apply is the tqdm-instrumented variant of Series.apply.
    scores = unique_posts["text"].progress_apply(calculate_sentiment)

    # assign() builds a new frame instead of adding a column to the caller's.
    scored = unique_posts.assign(sentiment=scores)
    return scored[["id", "created_at", "sentiment"]]

# Score all loaded posts, then persist the (id, created_at, sentiment) table
# locally so later cells can reload it without re-scoring.
X = apply_sentiment(df)
X.to_csv(path_or_buf="df_sentiment.csv", index=False)

In [None]:
# Reload the scored posts written by the previous cell. `df_sentiment` is the
# input of create_post_dataset further down, so it has to be defined here for
# the notebook to run top to bottom.
df_sentiment = pd.read_csv("df_sentiment.csv")

In [None]:
def features_extraction(df):
    """Summarise the ``sentiment`` column of ``df`` into six features.

    Rows are bucketed by score: negative (< -0.2), positive (> 0.2) and
    neutral (between -0.2 and 0.2, both inclusive).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``sentiment`` column.

    Returns
    -------
    dict
        Row counts ``nr_neg``, ``nr_pos``, ``nr_neutral`` and mean scores
        ``mean_neg``, ``mean_pos``, ``mean_neutal`` of the three buckets.
        The mean of an empty bucket is ``nan``. (The ``mean_neutal`` spelling
        is the key used by the rest of the notebook.)
    """
    scores = df["sentiment"]

    # One boolean mask per bucket, reused for both the count and the mean.
    is_negative = scores < -0.2
    is_positive = scores > 0.2
    is_neutral = (scores >= -0.2) & (scores <= 0.2)

    return {
        "nr_neg": int(is_negative.sum()),
        "nr_pos": int(is_positive.sum()),
        "nr_neutral": int(is_neutral.sum()),
        "mean_neg": np.mean(scores[is_negative]),
        "mean_pos": np.mean(scores[is_positive]),
        "mean_neutal": np.mean(scores[is_neutral]),
    }

In [None]:
def create_post_dataset(stock, X, drop_labels=(1301566, 1298190)):
    """Build one row of sentiment features per stock time window.

    Parameters
    ----------
    stock : object exposing ``toPandas()`` (e.g. a Spark DataFrame)
        Must convert to a pandas frame with the columns ``start_timestamp``,
        ``stop_timestamp``, ``label`` and ``index``.
    X : pandas.DataFrame
        Scored posts with the columns ``id``, ``created_at`` and ``sentiment``.
    drop_labels : iterable, optional
        Index labels of ``X`` to discard before any processing. Defaults to
        the two labels that were previously hard-coded here; pass ``()`` to
        keep every row.
        NOTE(review): the reason these two rows are excluded is not visible
        in this notebook - confirm.

    Returns
    -------
    pandas.DataFrame
        Columns ``nr_neg``, ``nr_pos``, ``nr_neutral``, ``mean_neg``,
        ``mean_pos``, ``mean_neutal``, ``start``, ``stop``, ``label``,
        ``index``; one row per row of ``stock``, in the same order.

    Raises
    ------
    KeyError
        If a label in ``drop_labels`` is not present in the index of ``X``.
    """
    posts = X.drop_duplicates()[["id", "created_at", "sentiment"]]
    posts = posts.drop(list(drop_labels))

    stock = stock.toPandas()
    stock["start_timestamp"] = pd.to_datetime(stock["start_timestamp"])
    stock["stop_timestamp"] = pd.to_datetime(stock["stop_timestamp"])

    # assign() returns a new frame, so the caller's X is never written to.
    posts = posts.assign(created_at=pd.to_datetime(posts["created_at"]))

    columns = ["nr_neg", "nr_pos", "nr_neutral", "mean_neg", "mean_pos",
               "mean_neutal", "start", "stop", "label", "index"]

    windows = zip(stock["start_timestamp"], stock["stop_timestamp"],
                  stock["label"], stock["index"])

    # Collect plain dicts and build the frame once at the end:
    # DataFrame.append no longer exists in pandas >= 2.0 and copied the whole
    # frame on every iteration.
    records = []
    for start, stop, label, index in tqdm(windows, total=len(stock)):
        # Posts created inside the window, both bounds inclusive.
        in_window = (posts["created_at"] <= stop) & (posts["created_at"] >= start)
        fe = features_extraction(posts[in_window])
        fe["start"] = start
        fe["stop"] = stop
        fe["label"] = label
        fe["index"] = index
        records.append(fe)

    return pd.DataFrame(records, columns=columns)


In [None]:
# Per-window sentiment features; windows with any missing value are discarded
# (for instance a window whose bucket mean is NaN).
df_post_1 = create_post_dataset(stocks, df_sentiment)
df_post_1 = df_post_1.dropna()
# spark.createDataFrame(df_post_1[["index"]]).write.option("header","true").csv("hdfs://cluster-a0d6-m/user/mmop/indices_to_use.csv")

# Keep only the model inputs, the label and the row identifier.
df_posts_to_use = df_post_1.loc[
    :,
    ["nr_neg", "nr_pos", "nr_neutral", "mean_neg", "mean_pos", "mean_neutal", "label", "index"],
]
# spark.createDataFrame(df_posts_to_use).write.option("header","true").csv("hdfs://cluster-a0d6-m/user/mmop/df_posts_to_use.csv")

# The modelling cells below work on the copy stored on HDFS, read with Spark.
df_post_1_spark = spark.read.csv(
    "hdfs://cluster-a0d6-m/user/mmop/df_posts_to_use.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Assemble the six sentiment features into a single vector column.
# "label" (the prediction target) and "index" (a row identifier) are kept out
# of the inputs on purpose: feeding the target into the feature vector would
# leak it to any model trained on `post_final`.
vectorAssembler = VectorAssembler(
    inputCols=["nr_neg", "nr_pos", "nr_neutral", "mean_neg", "mean_pos", "mean_neutal"],
    outputCol='features',
)
post_final = vectorAssembler.transform(df_post_1_spark)
post_final = post_final.select(['features', 'label'])


In [None]:
# 70/30 random split of the post features with a fixed seed.
trainingData, testData = df_post_1_spark.randomSplit(weights=[0.7, 0.3], seed=42)

In [None]:
# trainingData.select('index').write.option("header","true").csv("hdfs://cluster-a0d6-m/user/mmop/train_indices.csv")
# testData.select('index').write.option("header","true").csv("hdfs://cluster-a0d6-m/user/mmop/test_indices.csv")


In [None]:
# Row identifiers of the train / test partitions, stored on HDFS.
trainingIdx = spark.read.csv(
    "hdfs://cluster-a0d6-m/user/mmop/train_indices.csv", header=True, inferSchema=True
)
testingIdx = spark.read.csv(
    "hdfs://cluster-a0d6-m/user/mmop/test_indices.csv", header=True, inferSchema=True
)

# Join on the column *name* rather than on an equality expression: the result
# then carries a single "index" column instead of two identically named ones,
# which would make any later reference to "index" ambiguous.
trainingData = df_post_1_spark.join(trainingIdx, on="index", how="inner")
testingData = df_post_1_spark.join(testingIdx, on="index", how="inner")

In [None]:
# One assembler serves both partitions: it packs the six sentiment features
# into a "features" vector; only that vector and the label are kept.
vectorAssembler = VectorAssembler(
    inputCols=["nr_neg", "nr_pos", "nr_neutral", "mean_neg", "mean_pos", "mean_neutal"],
    outputCol='features',
)

post_final_train = vectorAssembler.transform(trainingData).select(['features', 'label'])
post_final_test = vectorAssembler.transform(testingData).select(['features', 'label'])


### One vs. All (One-vs-Rest)

In [None]:
# One-vs-Rest wrapper around a 100-tree random forest.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
)
ovr = OneVsRest(classifier=rf)

# Accuracy evaluator used to score the predictions below.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# Fit on the training partition.
ovrModel = ovr.fit(post_final_train)

# ovrModel.save(ml_models_path)

# Predict on the held-out partition and report accuracy
# (the fraction of correct predictions, not the error).
predictions = ovrModel.transform(post_final_test)
accuracy = evaluator.evaluate(predictions)
print("acc = %g" % accuracy)

In [None]:
# One-vs-Rest wrapper around a linear SVM (10 iterations, regParam 0.1).
lsvc = LinearSVC(
    maxIter=10,
    regParam=0.1,
)
ovr = OneVsRest(classifier=lsvc)

# Accuracy evaluator used to score the predictions below.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# Fit on the training partition.
ovrModel = ovr.fit(post_final_train)

# Predict on the held-out partition and report accuracy
# (the fraction of correct predictions, not the error).
predictions = ovrModel.transform(post_final_test)
accuracy = evaluator.evaluate(predictions)
print("acc = %g" % accuracy)

In [None]:
# One-vs-Rest wrapper around a gradient-boosted-trees classifier.
# The column names match what the assembled train/test frames contain
# ("label" and "features"); those frames have no "indexedLabel" or
# "indexedFeatures" columns.
# NOTE(review): OneVsRest is expected to hand its own label/feature columns to
# the base classifier when fitting, so this mainly keeps the configuration
# truthful and usable outside the wrapper - confirm.
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)

ovr = OneVsRest(classifier=gbt)

# train the multiclass model.
ovrModel = ovr.fit(post_final_train)

# score the model on test data.
predictions = ovrModel.transform(post_final_test)

# obtain evaluator.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# compute the accuracy on test data.
accuracy = evaluator.evaluate(predictions)
print("acc = %g" % (accuracy))

### Random Forest

In [None]:
# (trainingData, testData) = post_final.randomSplit([0.7, 0.3], seed = 42)

# Plain multiclass random forest with 100 trees, without the One-vs-Rest wrapper.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
)
model = rf.fit(post_final_train)

# Accuracy of the predicted class against the true label.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

predictions = model.transform(post_final_test)
accuracy = evaluator.evaluate(predictions)

print(accuracy)
# predictions.select(col('probability')).show()

In [None]:
# Destination (a gs:// URI) where the fitted random-forest model is saved by the next cell.
ml_models_path = "gs://dataproc-staging-europe-west4-375495060785-ncrgfyir/notebooks/jupyter/ml_models/rf__posts_model"

In [None]:
# Persist `model` (the random forest fitted in the Random Forest cell) to ml_models_path.
# NOTE(review): save() is expected to fail when the target path already exists - confirm before re-running.
model.save(ml_models_path)