In [0]:
# Read data from MongoDB

In [0]:
# Install the third-party packages this notebook imports into the kernel's environment.
# NOTE(review): no versions are pinned, so a later re-run may resolve different releases — consider pinning.
# NOTE(review): the PyPI project named `pysparkling` looks unrelated to H2O Sparkling Water
# (installed below as h2o-pysparkling-3.2), yet both appear to ship a `pysparkling` module — confirm this line is wanted.
%pip install vaderSentiment
%pip install pysparkling
%pip install h2o
%pip install h2o-pysparkling-3.2

In [0]:
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.ml.feature import Tokenizer, StopWordsRemover
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pysparkling import *
import h2o

In [0]:

# A config key holds a single value, so calling .config("spark.jars.packages", ...)
# twice keeps only the last coordinate. List every Maven package in one
# comma-separated value so both the Mongo connector and Sparkling Water are requested.
spark_packages = ",".join([
    "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1",
    "ai.h2o:sparkling-water-package_2.12:3.36.0.3-1-3.2",
])

spark = SparkSession.builder.config("spark.jars.packages", spark_packages)\
                            .config("spark.network.timeout", "36000s")\
                            .config("spark.executor.heartbeatInterval", "3600s")\
                            .config("spark.ext.h2o.fail.on.unsupported.spark.param", "false")\
                            .config("spark.dynamicAllocation.enabled", "false")\
                            .getOrCreate()

In [0]:
import os
from urllib.parse import quote_plus

database = 'MSDS697'
collection = 'Tweets'
# Credentials are read from the environment so they are never stored in the notebook.
# Set MONGO_USER and MONGO_PASSWORD before starting the kernel; a missing variable raises KeyError.
user_name = os.environ['MONGO_USER']
password = os.environ['MONGO_PASSWORD']
address = 'msds697.us6ly.mongodb.net'  # grab address from MongoDB
# Percent-encode the credentials so characters such as '@', ':' or '/' cannot break the URI
connection_string = f"mongodb+srv://{quote_plus(user_name)}:{quote_plus(password)}@{address}/{database}.{collection}"
# Load the collection and drop every row that has a null in any column
df = spark.read.format("mongo").option("uri",connection_string).load().dropna(how='any')

## Create Unique ID

In [0]:
# Replace Mongo's _id with a numeric surrogate key.
# monotonically_increasing_id() is non-deterministic (its values depend on partitioning),
# and the sentiment scores are joined back onto this frame by `id` later on, so the frame is
# cached to avoid regenerating the ids (and re-reading MongoDB) on every action.
df = df.withColumn("id", monotonically_increasing_id()).drop('_id').cache()
df.show(3)

## Clean tweet text

In [0]:
# Keep only the key and the raw text, de-duplicated, as input to the cleaning steps
tweet = df.select('id', 'text').distinct()

### 1. Remove return handles, handles, url, special characters, numbers and punctuations

In [0]:
def cleanTweet(txt):
    """Strip Twitter noise from a tweet and keep only ASCII letters.

    Removes, in order: retweet prefixes ("RT @user:"), @handles and http(s)
    URLs, then collapses every run of non-letter characters (digits,
    punctuation, whitespace, ...) into a single space.

    Args:
        txt: Raw tweet text. None is treated as an empty tweet.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    if txt is None:
        return ''

    # The retweet prefix goes first so "RT" and the trailing ":" disappear
    # together with the handle instead of being left behind as a word
    txt = re.sub(r'RT @[\w]*:', '', txt)
    txt = re.sub(r'@[\w]*', '', txt)
    # Consume the URL up to the next whitespace; a narrower character class
    # stops at '-', '_', '?' or '=' and leaves URL fragments behind as words
    txt = re.sub(r'https?://\S*', '', txt)
    txt = re.sub(r'[^A-Za-z]+', ' ', txt)
    return txt.strip()
# Expose cleanTweet as a Spark UDF that returns a string column
udf_tweet_clean = udf(f=cleanTweet, returnType=StringType())

In [0]:
# Apply the cleaning UDF to the raw text and store the result as clean_text
clean_text_col = udf_tweet_clean('text')
tweet_clean = tweet.withColumn('clean_text', clean_text_col)
tweet_clean.show(n=3)

### 2. Tokenize

In [0]:
# Split clean_text into tokens, stored in the words_token column
tokenizer = Tokenizer(outputCol='words_token', inputCol='clean_text')
words_token = tokenizer.transform(tweet_clean)
words_token.show(n=3)

### 3. Remove stop words

In [0]:
# Filter stop words out of words_token, writing the result to words_no_stop
remover = StopWordsRemover(outputCol='words_no_stop', inputCol='words_token')
words_no_stopw = remover.transform(words_token)
words_no_stopw.show(n=3)

### 4. Convert words list back to string

In [0]:
def words_to_text(words):
    """Join a sequence of tokens back into one space-separated string."""
    separator = ' '
    return separator.join(words)
# Expose words_to_text as a Spark UDF that returns a string column
udf_words_to_text = udf(f=words_to_text, returnType=StringType())

In [0]:
# Rebuild a plain string from the stop-word-free tokens
joined_text_col = udf_words_to_text('words_no_stop')
new_tweet = words_no_stopw.withColumn('new_text', joined_text_col)
new_tweet.show(n=3)

## Compute Sentiment Scores of the string

In [0]:
# sentiment score
_sentiment_analyzer = None  # created on first use, then reused for every row

def ss(text):
    """Return the VADER compound sentiment score of ``text``.

    Args:
        text: The string to score. None yields None (a null in Spark).

    Returns:
        The 'compound' entry of SentimentIntensityAnalyzer.polarity_scores.
    """
    global _sentiment_analyzer
    if text is None:
        return None
    if _sentiment_analyzer is None:
        # Constructing a new analyzer for every row is avoidable work; build it once
        _sentiment_analyzer = SentimentIntensityAnalyzer()
    return _sentiment_analyzer.polarity_scores(text)['compound']
udf_ss = udf(ss, FloatType())

In [0]:
# Score every cleaned tweet and keep only the key plus its score
sentiment = (
    new_tweet
    .withColumn('sentiment_score', udf_ss('new_text'))
    .select('id', 'sentiment_score')
)
sentiment.show(n=5)

## Join all the features

In [0]:
# `sentiment` is derived from `df`, so a condition like sentiment.id == df.id compares
# two references that originate from the same column. Joining on the column name lets
# Spark resolve `id` on each side separately and keeps a single `id` column.
feature_cols = ['Adj Close','like_count','quote_count','reply_count','retweet_count','sentiment_score']
df_new = df.join(sentiment, on='id', how='inner').select(feature_cols)
df_new.show(5)

In [0]:
# Add the adjusted close multiplied by 1000 as a new column
adj_close_scaled = df_new['Adj Close'] * 1000
df_new = df_new.withColumn('Adj Close(1000)', adj_close_scaled)

In [0]:
# Preview the joined frame including the rescaled price column
df_new.show(n=5)

In [0]:
# Print the column names and types of the joined frame
df_new.printSchema()

# Random Forest Regressor - Summer

### Step 1: Sentiment score computation - Continued from the data cleaning process above

In [0]:
# Drop the unscaled price; only 'Adj Close(1000)' is kept as the target.
# NOTE: this rebinds `df`, which until here referred to the frame loaded from MongoDB.
df = df_new.drop('Adj Close')

In [0]:
# Preview the feature frame
df.show(n=5)

In [0]:
# smooth and then take log to compress the data, making them dense
from pyspark.sql.functions import log
log_df = df
# Every column except the last one (the label) is replaced by log(value + 1).
# The loop variable is deliberately not named `col`: that would overwrite the
# `col` function brought in by `from pyspark.sql.functions import *`.
# NOTE(review): if sentiment_score can be exactly -1, value + 1 is 0 and the log is undefined — confirm.
for col_name in df.columns[:-1]:
    log_df = log_df.withColumn("log_"+col_name, log(df[col_name]+1)).drop(col_name)
log_df.show(5)

In [0]:
# Drop fully duplicated rows, then report how many rows remain
log_df = log_df.dropDuplicates()
log_df.count()

### Step 2: Create Feature Vectors

In [0]:
from pyspark.ml.feature import VectorAssembler

# Columns that are packed into the single "features" vector column
input_cols = [
    'log_like_count',
    'log_quote_count',
    'log_reply_count',
    'log_retweet_count',
    'log_sentiment_score',
]
va = VectorAssembler(inputCols=input_cols, outputCol="features")
lpoints = va.transform(log_df)

In [0]:
# Preview the assembled feature vectors
lpoints.show(n=3)

In [0]:
# Keep only the feature vector and the target, renaming the target to "label"
lpoints = (
    lpoints
    .select("features", "Adj Close(1000)")
    .withColumnRenamed("Adj Close(1000)", "label")
)
lpoints.show(n=3, truncate=False)

### Step 3: Train Test Split

In [0]:
# Divide the dataset into training (80%) and validation (20%) sets using a fixed seed.
splits = lpoints.randomSplit(weights=[0.8, 0.2], seed=1)

sentiment_train, sentiment_valid = splits

### Step 4: Normalize data

In [0]:
from pyspark.ml.feature import StandardScaler
standardscaler = StandardScaler(withStd=True, withMean=True, inputCol='features', outputCol='scaled_features')
# Fit once, on the training split only, and reuse the fitted model for both splits
# (fitting a second time on the same data is a redundant pass over it).
scaler_model = standardscaler.fit(sentiment_train)

train_scaled = scaler_model.transform(sentiment_train)\
               .select('scaled_features', 'label')\
               .withColumnRenamed('scaled_features', 'features')

test_scaled = scaler_model.transform(sentiment_valid)\
              .select('scaled_features', 'label')\
              .withColumnRenamed('scaled_features', 'features')

train_scaled.show(3, truncate=False)

In [0]:
# train_scaled.write.saveAsTable("train_scaled")
# test_scaled.write.saveAsTable("test_scaled")

### Step 5: Train the Model

In [0]:
%%time
from pyspark.ml.regression import RandomForestRegressor
# A fixed seed (same value as the train/validation split) makes the trained
# forest, and therefore the reported metrics, reproducible across runs.
rf = RandomForestRegressor(maxDepth=20, seed=1)
rfmodel = rf.fit(train_scaled)
# print(rfmodel.toDebugString)

### Step 6: Interpret the model

In [0]:
%%time
# Run the trained forest over the scaled validation split and preview the output
validpredicts = rfmodel.transform(test_scaled)
validpredicts.show()

### Step 7: Evaluate the model

In [0]:
# Evaluate the model. The metric printed first is RegressionEvaluator's default (rmse).
from pyspark.ml.evaluation import RegressionEvaluator
reg_evaluator = RegressionEvaluator()
print(f"{reg_evaluator.getMetricName()}:{reg_evaluator.evaluate(validpredicts)}")

In [0]:
# Switch the evaluator to R^2 and report it for the validation predictions
reg_evaluator.setMetricName('r2')
print(f"{reg_evaluator.getMetricName()}:{reg_evaluator.evaluate(validpredicts)}")

In [0]:
# Switch the evaluator to mean squared error and report it for the validation predictions
reg_evaluator.setMetricName('mse')
print(f"{reg_evaluator.getMetricName()}:{reg_evaluator.evaluate(validpredicts)}")

In [0]:
# Shut down the Spark session. The session object is `spark`; `ss` is the
# sentiment-scoring function and has no stop() method.
spark.stop()