In [1]:
# `display` is imported from the public IPython.display module: importing it
# from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that widens the notebook's `.container` element to 95%.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
from threading import Thread

class StreamingThread(Thread):
    """Drive a streaming context from a background thread.

    The thread starts the context it was constructed with and then blocks in
    ``awaitTermination()`` until the context ends; ``stop()`` is meant to be
    called from another thread (e.g. a later notebook cell) to end it.

    Parameters
    ----------
    ssc : object
        The streaming context to run. It must provide ``start()``,
        ``awaitTermination()`` and ``stop(stopSparkContext=..., stopGraceFully=...)``.
    """

    def __init__(self, ssc):
        Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Thread body: start the wrapped context and wait for it to terminate."""
        # Use the context passed to the constructor. Relying on a module-level
        # `ssc` ignores the argument and raises NameError when no such global
        # exists.
        self.ssc.start()
        self.ssc.awaitTermination()

    def stop(self):
        """Stop the wrapped streaming context while keeping the SparkContext alive."""
        print('----- Stopping... this may take a few seconds -----')
        # stopSparkContext=False leaves the underlying SparkContext usable;
        # `stopGraceFully` (sic) is the keyword name used by the PySpark API.
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [3]:
# Show the `sc` object. It is not created anywhere in this notebook, so it is
# presumably the SparkContext injected by the kernel/launcher — confirm.
sc

In [4]:
# Show the `spark` object. It is not created anywhere in this notebook, so it
# is presumably the SparkSession injected by the kernel/launcher — confirm.
spark

In [5]:
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType
 

In [6]:
from pyspark.sql.types import StructType,StructField, StringType

# Tweet record layout: three string columns, each declared nullable.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ('label', 'tweet_id', 'tweet_text')
])

# Build an empty DataFrame that carries this schema and print its structure.
df1 = spark.createDataFrame(data=spark.sparkContext.emptyRDD(), schema=schema)
df1.printSchema()

root
 |-- label: string (nullable = true)
 |-- tweet_id: string (nullable = true)
 |-- tweet_text: string (nullable = true)



In [7]:
# Module-level cache state for lazy model loading: the flag starts out False
# and the model slot starts out empty.
globals().update(models_loaded=False, my_model=None)

# Very simple predict function. Normally you'd use your loaded globals()['my_model'] here
def predict(df):
    """Guess a topic for one tweet by keyword lookup.

    Parameters
    ----------
    df : object with a ``tweet_text`` attribute
        A single record (despite the name, not a DataFrame); only its
        ``tweet_text`` attribute is read.

    Returns
    -------
    str
        The first keyword, in the priority order listed below, that occurs as
        a case-insensitive substring of the tweet text. Falls back to
        ``'covid'`` when nothing matches or when the text is missing/empty.
    """
    # NOTE(review): the sample output's `label` column shows values such as
    # '#covid' (leading '#'), whereas the keywords returned here have none —
    # confirm whether predictions are expected to match the label format.
    keywords = ("vaccine", "stopasianhate", "covid", "china", "inflation", "biden")

    # A null tweet_text shows up as None; treat it as empty text instead of
    # failing on None.lower(). Lower-casing once also avoids redoing it per keyword.
    text = (df.tweet_text or "").lower()

    for keyword in keywords:
        if keyword in text:
            return keyword

    # No keyword found: default class.
    return 'covid'

# Expose predict() as a Spark UDF whose result column is a string.
predict_udf = udf(f=predict, returnType=StringType())

def process(time, rdd,
            output_dir='file:///C:/Users/lenne/Desktop/spark/coding_and_data/data/lots_of_data.txt'):
    """Handle one streaming micro-batch: archive it, show it, and add predictions.

    Parameters
    ----------
    time : object
        Batch timestamp supplied by ``foreachRDD``; only its ``str()`` form is used.
    rdd : RDD
        The batch contents; parsed with ``spark.read.json``.
    output_dir : str, optional
        Base location for the raw-text archive. Each batch is written to its
        own ``batch_<timestamp>`` sub-directory beneath it. Defaults to the
        location this notebook originally wrote to.

    Returns
    -------
    None
        Empty batches are skipped without any output.
    """
    import re  # local import keeps this function self-contained within its cell

    if rdd.isEmpty():
        return

    # saveAsTextFile refuses to write into an existing output directory, so a
    # single fixed target makes every batch after the first fail. Derive a
    # per-batch directory from the batch time instead; non-alphanumeric
    # characters (spaces, ':' ...) are replaced so the name is valid on Windows.
    batch_tag = re.sub(r'[^0-9A-Za-z]+', '_', str(time)).strip('_')
    rdd.coalesce(1).saveAsTextFile('%s/batch_%s' % (output_dir, batch_tag))

    print("========= %s =========" % str(time))

    # Convert to data frame
    df = spark.read.json(rdd)
    df.show()

    # Apply the predict UDF to a struct holding all of the row's columns.
    all_columns = struct([df[x] for x in df.columns])
    df_withpreds = df.withColumn("pred", predict_udf(all_columns))
    df_withpreds.show()

    # Normally you would not predict with a Python UDF as done here, but with
    # an MLlib model built and saved with Spark. To avoid reloading that model
    # on every call, it is cached in module globals and loaded only once.
    if not globals()['models_loaded']:
        # load in your models here
        globals()['my_model'] = '***' # Replace '***' with:    [...].load('my_logistic_regression')
        globals()['models_loaded'] = True

    # And then predict using the loaded model:
    # df_result = globals()['my_model'].transform(df)
    # df_result.show()

In [8]:
# Create the streaming context on top of the existing `sc`.
# NOTE(review): batchDuration is expressed in seconds in the PySpark API, so
# 12345 means one micro-batch roughly every 3.4 hours — confirm this is intended.
ssc = StreamingContext(sparkContext=sc, batchDuration=12345)


In [9]:
# Open a text stream from the TCP source at seppe.net:7778.
lines = ssc.socketTextStream(hostname="seppe.net", port=7778)

# Alternative kept for reference: persist every batch straight from the stream.
#lines.saveAsTextFiles("file:///C:/Users/lenne/Desktop/spark/coding_and_data/data")

# Hand each batch RDD of the stream to process().
lines.foreachRDD(process)

In [10]:
# Wrap the streaming context in a StreamingThread and launch it; Thread.start()
# invokes the thread's run() in the background so the notebook stays responsive.
ssc_t = StreamingThread(ssc)
ssc_t.start()

+--------------+-------------------+--------------------+
|         label|           tweet_id|          tweet_text|
+--------------+-------------------+--------------------+
|        #covid|1382954933805195271|@AndhraPradeshCM ...|
|        #covid|1382954848639848453|Our frontline sta...|
|#stopasianhate|1382955297413681154|@lordofthesleepy ...|
|        #covid|1382955669364514818|On 16 Apr 2020 th...|
|        #covid|1382955573092573184|If I remember cor...|
|#stopasianhate|1382955861727920131|What freedom? #██...|
|      #vaccine|1382956181119856649|@PMOIndia should ...|
|      #vaccine|1382956072210554882|If vaccine does n...|
|        #covid|1382956585236844552|@swati_gs Each an...|
|        #covid|1382956583290859520|It's also frustra...|
|    #inflation|1382956786106437632|The NPS will no l...|
|      #vaccine|1382957070735921154|A new interim ceo...|
|      #vaccine|1382956813461557261|BRAIN CLOTS "MORE...|
|        #china|1382957409136635907|#███████ is the s...|
|        #chin

In [None]:
# Ask the StreamingThread to stop its streaming context; StreamingThread.stop()
# prints a notice and then calls stop() on the wrapped context.
ssc_t.stop()

----- Stopping... this may take a few seconds -----
