#### Import Packages

In [2]:
#Basics 
import pandas as pd
import numpy as np
import os
import re
import json

# PySpark Streaming
from pyspark.streaming import StreamingContext
from threading import Thread
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql.functions import rand, udf 
from pyspark.sql.types import StringType, ArrayType

#NLTK
from nltk.stem.snowball import SnowballStemmer

# PySPark MLLib
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import  Tokenizer
from pyspark.ml.feature import  IDF, IDFModel
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.feature import IndexToString
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import StopWordsRemover

class StreamingThread(Thread):
    def __init__(self, ssc):
        Thread.__init__(self)
        self.ssc = ssc
    def run(self):
        ssc.start()
        ssc.awaitTermination()
    def stop(self):
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

#### Read streamed files

In [6]:
# Tryout reading invidivual files into RDD format 

rdd2 = spark.sparkContext.textFile("spark/tweets_streamed/tweet-1618824030000")
test = rdd2.collect()
test

['{"tweet_id": 1384072187976450054, "tweet_text": "@theage Can\\u2019t threaten a #\\u2588\\u2588\\u2588\\u2588\\u2588\\u2588\\u2588 program that doesn\\u2019t exist. #\\u2588\\u2588\\u2588\\u2588\\u2588\\u2588\\u2588", "label": "#covid"}',
 '{"tweet_id": 1384072149002952706, "tweet_text": "#\\u2588\\u2588\\u2588\\u2588\\u2588\\u2588\\u2588 or #\\u2588\\u2588\\u2588\\u2588\\u2588\\u2588\\u2588 Time People thought Corona Vanished started Holding Gatherings ! \\nInstead of utilising time to prepare for all the Panic Today we are facing !! #\\u2588\\u2588\\u2588\\u2588\\u2588\\u2588\\u2588 of Everything\\n#\\u2588\\u2588\\u2588\\u2588\\u2588\\u2588\\u2588 #\\u2588\\u2588\\u2588\\u2588\\u2588\\u2588\\u2588 is not Enough !\\n#\\u2588\\u2588\\u2588\\u2588\\u2588\\u2588\\u2588 \\n@PMOIndia \\ud83c\\uddee\\ud83c\\uddf3\\n@ArvindKejriwal \\n@rashtrapatibhvn", "label": "#covid"}']

In [None]:
# Read streamed tweets into RDD then Python List

counter = 0
tweets = []
saved_texts_list = os.listdir("Desktop/spark/tweets_streamed/")
for text_partition in saved_texts_list:
    rdd = spark.sparkContext.textFile("Desktop/spark/tweets_streamed/" + text_partition)
    for tweet in rdd.collect():
        tweets.append(tweet)
    if (counter % 100 == 0):
        print(str(counter).ljust(5), 'files processed!')
    counter += 1

In [4]:
# Check - Total streamed tweets and value counts

tweet_ids = []
tweet_texts = []
tweet_labels = []

for tweet in tweets:
    tweet_dict = eval(tweet)
    tweet_ids.append(tweet_dict['tweet_id'])
    tweet_texts.append(tweet_dict['tweet_text'])
    tweet_labels.append(tweet_dict['label'])

tweet_df = pd.DataFrame({'id': tweet_ids, 'text': tweet_texts, 'label': tweet_labels})
tweet_df_filtered = tweet_df.drop_duplicates(['id'])
print(tweet_df_filtered.label.value_counts())
print(len(tweet_df_filtered), len(tweet_df))

#vaccine          432
#covid            392
#china            326
#biden            117
#stopasianhate     95
#inflation         52
Name: label, dtype: int64
1414 1456


#### Read from CSV

In [None]:
#To save as a CSV file

tweet_df_filtered["text"] = tweet_df_filtered["text"].str.encode('utf-8', 'ignore').str.decode('utf-8')
tweet_df_filtered.to_csv("Desktop/spark/tweets.csv", index = False, sep = ";")

In [17]:
#To read from the CSV file

tweet_df_filtered = pd.read_csv("Desktop/spark/tweets.csv", sep=';')
tweet_df_filtered

Unnamed: 0,id,text,label
0,1384071564216340485,#███████: People gather in large numbers outsi...,#vaccine
1,1384072059060326401,"But for sure, #███████ is not a solution, we n...",#covid
2,1384072056233361408,We have webinars from @VitaSafety with lots of...,#covid
3,1384072031465984005,For anyone who has #███████ and is isolating a...,#covid
4,1384071995726262276,@ndtv These animals should be put behind bars ...,#covid
...,...,...,...
1409,1384206019362922510,Human Rights Watch Calls Out #███████’s ‘Crime...,#china
1410,1384206642586157060,#███████ talk say #███████ leak from their lab...,#china
1411,1384206577410797568,#███████ loudly tout liberal values &amp; mino...,#china
1412,1384207020694269961,FRESH PORTSMOUTH CAREER MODE!! ZOMBIES FEAT.MU...,#stopasianhate


In [18]:
print(tweet_df_filtered.label.value_counts())
print(len(tweet_df_filtered))

#vaccine          432
#covid            392
#china            326
#biden            117
#stopasianhate     95
#inflation         52
Name: label, dtype: int64
1414


In [19]:
#To read from the CSV file - Nina (Merge with groupmates' stream)

tweet_df_filtered_nina = pd.read_csv("Desktop/spark/tweets_nina.csv", sep=';', lineterminator='\n')
tweet_df_filtered_nina

Unnamed: 0,id,text,label
0,1384845659136831490,Chauvin's murder conviction makes history in A...,#biden
1,1384903605132607492,"#███████ Apology Lacks ‘Sincerity’, #███████’...",#china
2,1384904382580469761,"If Biden can ban words like ""Illegal"" ""Illegal...",#biden
3,1384903264630714372,Maharashtra CM likely to impose lockdown today...,#covid
4,1384873158373691392,"In the #███████, #███████ has more than twice ...",#vaccine
...,...,...,...
1185,1384903106123739136,There is no relevance of #███████ with #██████...,#inflation
1186,1384874658030366724,EMBARRASSING Ontario is reporting 4212 cases o...,#vaccine
1187,1384874577801752578,The Covid-19 pandemic has had a devastating im...,#vaccine
1188,1384852038597681153,#███████ invested $350M in Turkish #███████ pl...,#china


In [20]:
#Merge frames

tweet_df_filtered_merged = pd.concat([tweet_df_filtered, tweet_df_filtered_nina])
tweet_df_filtered_merged = tweet_df_filtered_merged.drop_duplicates(['id'])
tweet_df_filtered_merged

Unnamed: 0,id,text,label
0,1384071564216340485,#███████: People gather in large numbers outsi...,#vaccine
1,1384072059060326401,"But for sure, #███████ is not a solution, we n...",#covid
2,1384072056233361408,We have webinars from @VitaSafety with lots of...,#covid
3,1384072031465984005,For anyone who has #███████ and is isolating a...,#covid
4,1384071995726262276,@ndtv These animals should be put behind bars ...,#covid
...,...,...,...
1185,1384903106123739136,There is no relevance of #███████ with #██████...,#inflation
1186,1384874658030366724,EMBARRASSING Ontario is reporting 4212 cases o...,#vaccine
1187,1384874577801752578,The Covid-19 pandemic has had a devastating im...,#vaccine
1188,1384852038597681153,#███████ invested $350M in Turkish #███████ pl...,#china


In [21]:
# Save to CSV file

tweet_df_filtered_merged["text"] = tweet_df_filtered_merged["text"].str.encode('utf-8', 'ignore').str.decode('utf-8')
tweet_df_filtered_merged.to_csv("Desktop/spark/tweets_merged.csv", index = False, sep = ";")

In [9]:
# Load merged CSV tweets

tweet_df_filtered_merged = pd.read_csv("spark/notebooks/tweets_merged.csv", sep = ";")

In [10]:
# Final hashtag (label) value counts - Total 2604 tweets streamed

tweet_df_filtered = tweet_df_filtered_merged
print(tweet_df_filtered.label.value_counts())

#vaccine          744
#covid            672
#china            649
#biden            255
#stopasianhate    184
#inflation        100
Name: label, dtype: int64


#### Text pre-processing (remove punctuation, blocks and trim spaces)

In [14]:
# Clean the text by substitutions

def remove_punct(text):
    #clean  new line
    text = re.sub('\n', ' ', text)
    #clean full blocks
    text = re.sub('\u2588', '', text)
    #number of #hashtags
    text = text + ' ' + str(text.count('#'))
    #clean #
    text = re.sub('#', '', text)
    #Clean links
    text = re.sub('http.* ', ' urllink ', text)
    #multispace to singlespace
    text = re.sub(' +', ' ', text)
    #exp ozge's to ozges
    text = re.sub('\'', '', text)
    text = re.sub('\’', '', text)
    #create a space for punctuation
    text = re.sub('[!"$%&\()*+,-./:;<=>?@[\\]^_`{|}~“”]', ' ', text)
    #multispace to singlespace
    text = re.sub(' +', ' ', text)
    return text.lower()

tweet_df_filtered = tweet_df_filtered.reset_index(drop=True)
tweet_df_filtered['cleaned_text'] = ''
for i in range(len(tweet_df_filtered)):
    tweet_df_filtered.loc[i, 'cleaned_text']= remove_punct(tweet_df_filtered['text'][i])

In [17]:
# Example of a raw tweet 

tweet_df_filtered.loc[1500]['text']

'“WE WILL NOT BE INTIMIDATED.” DESPITE CHINA’S THREATS, LITHUANIA MOVES TO RECOGNISE UIGHUR GENOCIDENEW\n\nhttps://t.co/hLjIGhBFST\n#███████ #███████ #███████ #███████ #███████ #███████ #███████\n#███████ https://t.co/gDocjvTSof'

In [20]:
# Example of a the cleaned vrsion of the above tweet
# We additionally include hasgtag count as an integer so that document vector transformations can notice
# Additionally, we swich each URL with a unique keyword 'urllink' for text # Example of a raw tweet 

tweet_df_filtered.loc[1500]['cleaned_text']

' we will not be intimidated despite chinas threats lithuania moves to recognise uighur genocidenew urllink 8'

#### Model training with MLib

In [21]:
#Prepare data to create a PySpark dataframe

spark_tuples = []
for i in range(len(tweet_df_filtered)):
    spark_tuples.append((tweet_df_filtered['label'][i], tweet_df_filtered['cleaned_text'][i]))  
    
#Create PySpark Dataframe

tweet_data = spark.createDataFrame(spark_tuples, ["label", "tweet"])

In [22]:
tweet_data.show(10)

+--------+--------------------+
|   label|               tweet|
+--------+--------------------+
|#vaccine| people gather in...|
|  #covid|but for sure is n...|
|  #covid|we have webinars ...|
|  #covid|for anyone who ha...|
|  #covid| ndtv these anima...|
|  #covid|starting the week...|
|  #covid| theage cant thre...|
|  #covid| or time people t...|
|  #covid|addressing the si...|
|  #china|the chinese offic...|
+--------+--------------------+
only showing top 10 rows



In [23]:
# Oversampling

highest_label_count = tweet_data.where(tweet_data['label']=='#vaccine').count()
covid_oversampled = tweet_data.where(tweet_data['label']=='#covid').sample(True, (highest_label_count/(tweet_data.where(tweet_data['label']=='#covid').count())-1), seed = 123)
china_oversampled = tweet_data.where(tweet_data['label']=='#china').sample(True, (highest_label_count/(tweet_data.where(tweet_data['label']=='#china').count())-1), seed = 123)
biden_oversampled = tweet_data.where(tweet_data['label']=='#biden').sample(True, (highest_label_count/(tweet_data.where(tweet_data['label']=='#biden').count())-1), seed = 123)
stopasianhate_oversampled = tweet_data.where(tweet_data['label']=='#stopasianhate').sample(True, (highest_label_count/(tweet_data.where(tweet_data['label']=='#stopasianhate').count())-1), seed = 123)
inflation_oversampled = tweet_data.where(tweet_data['label']=='#inflation').sample(True, (highest_label_count/(tweet_data.where(tweet_data['label']=='#inflation').count())-1), seed = 123)

# Merged oversampled tweets

def unionAll(*dfs):
    return reduce(DataFrame.unionAll, dfs)

oversampled_df = unionAll(covid_oversampled, china_oversampled, biden_oversampled, stopasianhate_oversampled, inflation_oversampled, tweet_data)
oversampled_df.groupBy('label').count().show()
oversampled_df = oversampled_df.orderBy(rand())
oversampled_df.show(10)

+--------------+-----+
|         label|count|
+--------------+-----+
|        #biden|  734|
|      #vaccine|  744|
|        #china|  735|
|        #covid|  734|
|#stopasianhate|  725|
|    #inflation|  710|
+--------------+-----+

+--------------+--------------------+
|         label|               tweet|
+--------------+--------------------+
|        #covid|district overview...|
|        #biden|           urllink 5|
|        #china|we can see why th...|
|    #inflation| official central...|
|        #china| arabia firmly su...|
|        #china|a shenzhen compan...|
|        #china| loudly tout libe...|
|      #vaccine|thank you drnigha...|
|#stopasianhate| hcphtx readyharr...|
|      #vaccine|as of today every...|
+--------------+--------------------+
only showing top 10 rows



In [46]:
##################################################
#Count vectorizer + idf approach - Transformation#
##################################################

tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
wordsData = tokenizer.transform(tweet_data)

#Removing stop words
remover = StopWordsRemover(inputCol = "words", outputCol = "cleaned_words")
df_cleaned_text = remover.transform(wordsData)

#Stemming
stemmer = SnowballStemmer(language = "english")
stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
df_stemmed = df_cleaned_text.withColumn("cleaned_words", stemmer_udf("cleaned_words"))
df_stemmed.show(5)

#Term frequency-TF
count = CountVectorizer(inputCol="cleaned_words", outputCol="rawFeatures", vocabSize=200)
count_vectorizer_model = count.fit(df_stemmed)
featurizedData = count_vectorizer_model.transform(df_stemmed)

#IDF 
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

#Showing
rescaledData.select("label", "features").show()

+--------+--------------------+--------------------+--------------------+
|   label|               tweet|               words|       cleaned_words|
+--------+--------------------+--------------------+--------------------+
|#vaccine| people gather in...|[, people, gather...|[, peopl, gather,...|
|  #covid|but for sure is n...|[but, for, sure, ...|[sure, solut, nee...|
|  #covid|we have webinars ...|[we, have, webina...|[webinar, vitasaf...|
|  #covid|for anyone who ha...|[for, anyone, who...|[anyon, isol, hom...|
|  #covid| ndtv these anima...|[, ndtv, these, a...|[, ndtv, anim, pu...|
+--------+--------------------+--------------------+--------------------+
only showing top 5 rows

+--------------+--------------------+
|         label|            features|
+--------------+--------------------+
|      #vaccine|(200,[0,1,12,62,1...|
|        #covid|(200,[4,25,70],[1...|
|        #covid|(200,[0,2,123,164...|
|        #covid|(200,[0,3,7,101,1...|
|        #covid|(200,[1,2],[1.144...|
|    

In [169]:
# DO NOT USE - JUST EXPERIMENTAL
############################################
#Hashing TF + idf approach - Transformation#
############################################

tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
wordsData = tokenizer.transform(oversampled_df)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=50)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("label", "features").show()

+--------------+--------------------+
|         label|            features|
+--------------+--------------------+
|    #inflation|(50,[1,3,4,6,12,1...|
|#stopasianhate|(50,[2,6,7,13,15,...|
|        #covid|(50,[15,17,18,42,...|
|    #inflation|(50,[4,6,13,18,23...|
|        #china|(50,[6,16,21,22,2...|
|#stopasianhate|(50,[1,5,6,7,9,10...|
|        #china|(50,[5,10,11,14,1...|
|    #inflation|(50,[18,20,22,24]...|
|      #vaccine|(50,[1,3,6,10,15,...|
|        #covid|(50,[1,3,5,6,10,1...|
|        #biden|(50,[0,7,30,41,42...|
|      #vaccine|(50,[6,9,10,11,13...|
|        #biden|(50,[0,2,3,4,6,7,...|
|      #vaccine|(50,[3,4,5,7,11,1...|
|        #biden|(50,[1,2,5,9,11,1...|
|        #covid|(50,[0,4,9,12,17,...|
|#stopasianhate|(50,[0,1,3,6,7,8,...|
|    #inflation|(50,[6,10,11,14,1...|
|      #vaccine|(50,[3,4,5,6,7,17...|
|        #covid|(50,[0,2,5,6,11,1...|
+--------------+--------------------+
only showing top 20 rows



In [47]:
# Training and Test Split 
# Fianlly, we train with all the data but used for model search during grid search

(trainingData, testData) = rescaledData.randomSplit([1.0, 0.0])

##### Logistic Regression

In [35]:
#Create a label indexer (string labels to double is required)

labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
lr = LogisticRegression(maxIter=50, labelCol="indexedLabel")
pipeline = Pipeline(stages=[labelIndexer, lr])

#Fit the pipeline to training documents

model = pipeline.fit(trainingData)

#Print the coefficients and intercept for multinomial logistic regression

print("Coefficients: \n" + str(model.stages[-1].coefficientMatrix))
print("Intercept: " + str(model.stages[-1].interceptVector))

Coefficients: 
DenseMatrix([[ 0.7051162 , -0.17784361,  0.2248641 , ...,  0.43509169,
               0.43435063, -0.20761375],
             [ 0.11836464, -0.29468858,  0.26707827, ...,  0.27631729,
               0.53671436, -0.29302385],
             [ 1.90021664,  0.36287638, -0.04132572, ...,  0.3123655 ,
               0.50760221, -0.13386779],
             [ 0.92805262,  0.19079111, -0.70005112, ...,  0.15253045,
              -1.29589792, -0.51441475],
             [-2.3166265 , -0.32894427, -0.02704546, ..., -1.96633118,
               0.7465178 ,  0.68618354],
             [-1.33512361,  0.24780896,  0.27647994, ...,  0.79002626,
              -0.92928709,  0.4627366 ]])
Intercept: [0.4799018948267931,0.5119622494672291,0.3831667021129153,-0.010408835666568904,0.10695604917990899,-1.4715780599202777]


In [36]:
#Save the models for deployment

#CountVectorizer
count_vectorizer_model.save('count_vectorizer_model_prototype_5')

#IDF
idfModel.save('idf_model_prototype_5')

#Indexed Labels
indexed_df = labelIndexer.transform(trainingData)
meta = [f.metadata for f in indexed_df.schema.fields if f.name == "indexedLabel"]
index_dict = dict(enumerate(meta[0]["ml_attr"]["vals"]))

index_dict['0.0'] = index_dict.pop(0)
index_dict['1.0'] = index_dict.pop(1)
index_dict['2.0'] = index_dict.pop(2)
index_dict['3.0'] = index_dict.pop(3)
index_dict['4.0'] = index_dict.pop(4)
index_dict['5.0'] = index_dict.pop(5)

with open('index_hashtag_dict.json', 'w') as fp:
    json.dump(index_dict, fp)
    
#ML model

model.stages[-1].save("logistic_regression_model_prototype_5")

In [None]:
predictions = model.transform(testData)
predictions.select("prediction", "indexedLabel", "features").show(50)

##### RandomForest 

In [48]:
#Create a label indexer (string labels to double is required)

labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
rf = RandomForestClassifier(labelCol = "indexedLabel", numTrees = 100)
pipeline = Pipeline(stages=[labelIndexer, rf])

#Fit the pipeline to training documents

model = pipeline.fit(trainingData)

In [49]:
#Save the models for deployment

#CountVectorizer
count_vectorizer_model.save('count_vectorizer_model_prototype_7')

#IDF
idfModel.save('idf_model_prototype_7')

#Indexed Labels
indexed_df = labelIndexer.transform(trainingData)
meta = [f.metadata for f in indexed_df.schema.fields if f.name == "indexedLabel"]
index_dict = dict(enumerate(meta[0]["ml_attr"]["vals"]))

index_dict['0.0'] = index_dict.pop(0)
index_dict['1.0'] = index_dict.pop(1)
index_dict['2.0'] = index_dict.pop(2)
index_dict['3.0'] = index_dict.pop(3)
index_dict['4.0'] = index_dict.pop(4)
index_dict['5.0'] = index_dict.pop(5)

import json
with open('index_hashtag_dict.json', 'w') as fp:
    json.dump(index_dict, fp)
    
#ML model

model.stages[-1].save("random_forest_model_prototype_2")

#### Cross-validation and Parameter Search

In [41]:
# For cross-validation

stringIndexer = StringIndexer(inputCol="label", outputCol="newlabel")
model = stringIndexer.fit(rescaledData)
df = model.transform(rescaledData)
cv_set = df.selectExpr("features as features", "newlabel as label")
cv_set.printSchema()

root
 |-- features: vector (nullable = true)
 |-- label: double (nullable = false)



In [42]:
lr = LogisticRegression(maxIter = 20)
paramGrid_lr = ParamGridBuilder().addGrid(lr.regParam, np.linspace(0.3, 0.01, 3)).addGrid(lr.elasticNetParam, np.linspace(0.3, 0.8, 6)).build()
crossval_lr = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid_lr,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds= 5)  
                
cvModel_lr = crossval_lr.fit(cv_set)
best_model_lr = cvModel_lr.bestModel.summary

In [55]:
best_model_lr = cvModel_lr.bestModel
print('Best Param (regParam): ', best_model_lr._java_obj.getRegParam())
print('Best Param (elasticNetParam): ', best_model_lr._java_obj.getElasticNetParam())

Best Param (regParam):  0.01
Best Param (elasticNetParam):  0.5


#### Fit the best parameters

In [56]:
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
lr = LogisticRegression(maxIter=20, regParam=0.01, elasticNetParam=0.5, labelCol="indexedLabel")
pipeline = Pipeline(stages=[labelIndexer, lr])

#Fit the pipeline to training documents

model = pipeline.fit(trainingData)
predictions = model.transform(testData)
predictions.select("prediction", "indexedLabel", "features").show(50)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         3.0|(50,[1,2,3,7,21,2...|
|       2.0|         3.0|(50,[1,2,13,16,17...|
|       0.0|         3.0|(50,[1,3,4,6,7,9,...|
|       0.0|         3.0|(50,[1,2,4,6,8,10...|
|       1.0|         3.0|(50,[0,1,2,5,9,13...|
|       2.0|         3.0|(50,[0,1,6,13,21,...|
|       1.0|         2.0|(50,[0,1,5,7,8,9,...|
|       1.0|         2.0|(50,[1,8,12,13,15...|
|       1.0|         2.0|(50,[1,4,6,11,13,...|
|       1.0|         2.0|(50,[0,1,3,12,16,...|
|       0.0|         2.0|(50,[0,1,3,5,6,7,...|
|       0.0|         2.0|(50,[1,2,3,6,9,10...|
|       0.0|         2.0|(50,[1,4,5,8,13,1...|
|       2.0|         2.0|(50,[1,4,6,7,8,9,...|
|       0.0|         2.0|(50,[0,1,2,17,20,...|
|       0.0|         2.0|(50,[0,1,3,5,6,8,...|
|       1.0|         2.0|(50,[0,1,2,6,9,10...|
|       2.0|         2.0|(50,[1,3,4,5,14,2...|
|       1.0| 

#### Test for deployment - Random Streamed Tweets

In [150]:
idf_model = IDFModel.load('idf_model_prototype_1')
with open('index_hashtag_dict.json', 'r') as fp:
    index_dict = json.load(fp)
    
def process(rdd):
    df = spark.read.json(rdd)
    tokenizer = Tokenizer(inputCol="tweet_text", outputCol="words")
    wordsData = tokenizer.transform(df)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=50)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idf_model.transform(featurizedData)
    predictions = model.transform(rescaledData)
    predictions = predictions.withColumn("prediction", predictions["prediction"].cast(StringType()))
    predictions = predictions.replace(to_replace=index_dict, subset='prediction')
    predictions.select("label", "tweet_id", "tweet_text", "prediction").show(50)

process('C:/Users/nusret/Desktop/spark/myoutput-1619177010000')

+------+-------------------+--------------------+----------+
| label|           tweet_id|          tweet_text|prediction|
+------+-------------------+--------------------+----------+
|#china|1384110015968014336|Now @MercedesBenz...|  #vaccine|
|#china|1384109980769394690|@woonomic I thoug...|  #vaccine|
+------+-------------------+--------------------+----------+



In [174]:
def remove_punct(text):
    text = re.sub('\n', ' ', text)
    text = re.sub('\u2588', '', text)
    text = text + ' ' + str(text.count('#'))
    text = re.sub('#', '', text)
    text = re.sub('http.* ', ' ', text)
    text = re.sub(' +', ' ', text)
    text = re.sub('\'', '', text)
    text = re.sub('\’', '', text)
    text = re.sub('[!"$%&\()*+,-./:;<=>?@[\\]^_`{|}~“”]', ' ', text)
    text = re.sub(' +', ' ', text)
    return text

def process(rdd):
    df = spark.read.json(rdd)
    udf_myFunction = udf(remove_punct, StringType())
    df = df.withColumn("tweet_text", udf_myFunction("tweet_text"))
    df.show()
process('Desktop/spark/tweets_streamed/tweet-1618832570000')

+------+-------------------+--------------------+
| label|           tweet_id|          tweet_text|
+------+-------------------+--------------------+
|#china|1384110015968014336|Now MercedesBenz ...|
|#china|1384109980769394690| woonomic I thoug...|
+------+-------------------+--------------------+



In [50]:
index_dict

{'0.0': '#vaccine',
 '1.0': '#covid',
 '2.0': '#china',
 '3.0': '#biden',
 '4.0': '#stopasianhate',
 '5.0': '#inflation'}