In [0]:
# todo remove non relevant locations, locations not associated with a city or country
# remove polarities above 1 and below -1, maybe round polarities to -1 or 1?

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# import natural language tool kit to help clean text
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import Tokenizer

test_file_path = "/FileStore/tables/07_2020.csv"

# Define Schema
schema = StructType([StructField('created_at', TimestampType(), True),
                     StructField('file_name', StringType(), True),
                     StructField('followers', IntegerType(), True),
                     StructField('friends', IntegerType(), True),
                     StructField('group_name', StringType(), True),
                     StructField('location', StringType(), True),
                     StructField('retweet_count', IntegerType(), True),
                     StructField('screenname', StringType(), True),
                     StructField('search_query', StringType(), True),
                     StructField('text', StringType(), True),
                     StructField('twitter_id', StringType(), True),
                     StructField('username', StringType(), True),
                     StructField('polarity', StringType(), True),                  
                     StructField('partition_0', StringType(), True),      
                     StructField('partition_1', StringType(), True)])

# must read polarity as a string then cast to float later after removing non numeric rows
df = spark.read.csv(test_file_path, header=True, schema=schema)
df = df.select("created_at", "search_query", "text", "polarity")

print(df.count())

# remove non float type polarities
df_filtered = df.filter(col("polarity").cast(FloatType()).isNotNull()).select("created_at", "search_query", "text", col("polarity").cast(FloatType()))

# remove polarities above 1 and below -1
df_filtered = df_filtered.filter("polarity < 1 and polarity > -1")

# convert polarities to 3 classes (0, 1, 3, negative, neutral, positive)
udf_polarity_threshold = udf(lambda x: 0 if (x < -0.1) else (1 if (x <= 0.1) else 2), IntegerType())
df_filtered = df_filtered.withColumn("polarity_class", udf_polarity_threshold("polarity"))

# remove links
df_filtered = df_filtered.withColumn('text_cleaned', regexp_replace(df_filtered.text, r'http\S+', ''))

# remove all characters except alphabetic ones
# replace ' with nothing to make sure contractions are not split
df_filtered = df_filtered.withColumn('text_cleaned', regexp_replace(df_filtered.text_cleaned, "\'", ''))
df_filtered = df_filtered.withColumn('text_cleaned', regexp_replace(df_filtered.text_cleaned, '[^a-zA-Z\s]', ' '))

# group whitespace
df_filtered = df_filtered.withColumn('text_cleaned', regexp_replace(df_filtered.text_cleaned, '\s+', ' '))

# tokenize the text into words
df_filtered = Tokenizer(inputCol='text_cleaned', outputCol='words').transform(df_filtered)

# remove stopwords and 'rt' (rt is twitter lingo for retweet, has no imapct on text sentiment)
nltk.download('stopwords')
stop_words = stopwords.words("english")
stop_words.append("rt")
udf_remove_stop = udf(lambda x: [i for i in x if not i.lower() in stop_words], ArrayType(StringType()))
df_filtered = df_filtered.withColumn("words_cleaned", udf_remove_stop("words"))

# convert words to stems
stemmer = PorterStemmer()
udf_stem = udf(lambda x: [stemmer.stem(i) for i in x], ArrayType(StringType()))
df_filtered = df_filtered.withColumn("words_stem", udf_stem("words_cleaned"))


display(df_filtered)

df_filtered = df_filtered.select("created_at", "search_query", "text", "polarity_class", "words_stem")

display(df_filtered)
df_filtered.printSchema()


1242243
root
 |-- created_at: timestamp (nullable = true)
 |-- search_query: string (nullable = true)
 |-- text: string (nullable = true)
 |-- polarity_class: integer (nullable = true)
 |-- words_stem: array (nullable = true)
 |    |-- element: string (containsNull = true)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


created_at,search_query,text,polarity,polarity_class,text_cleaned,words,words_cleaned,words_stem
2020-07-12T03:45:47.000+0000,#Japan,RT @Streetcar_honda: Cr. Owner : @L2PJapanTAKERU #Honda #Civic #FD2 #MugenRR #l2pjapan #Japan https://t.co/1WNFsNvNPf,0.0,1,RT Streetcar honda Cr Owner L PJapanTAKERU Honda Civic FD MugenRR l pjapan Japan,"List(rt, streetcar, honda, cr, owner, l, pjapantakeru, honda, civic, fd, mugenrr, l, pjapan, japan)","List(streetcar, honda, cr, owner, l, pjapantakeru, honda, civic, fd, mugenrr, l, pjapan, japan)","List(streetcar, honda, cr, owner, l, pjapantakeru, honda, civic, fd, mugenrr, l, pjapan, japan)"
2020-07-12T03:44:41.000+0000,#Japan,RT @KennethWHarmon: Available to pre-order on Amazon. #HistoricalFiction #Japan #WorldWar2 #MagicalRealism #Romance https://t.co/yyU9ECG…,0.1779,2,RT KennethWHarmon Available to pre order on Amazon HistoricalFiction Japan WorldWar MagicalRealism Romance,"List(rt, kennethwharmon, available, to, pre, order, on, amazon, historicalfiction, japan, worldwar, magicalrealism, romance)","List(kennethwharmon, available, pre, order, amazon, historicalfiction, japan, worldwar, magicalrealism, romance)","List(kennethwharmon, avail, pre, order, amazon, historicalfict, japan, worldwar, magicalr, romanc)"
2020-07-12T03:44:13.000+0000,#Japan,RT @KennethWHarmon: Available to pre-order on Amazon. #HistoricalFiction #Japan #WorldWar2 #MagicalRealism #Romance https://t.co/yyU9ECG…,0.1779,2,RT KennethWHarmon Available to pre order on Amazon HistoricalFiction Japan WorldWar MagicalRealism Romance,"List(rt, kennethwharmon, available, to, pre, order, on, amazon, historicalfiction, japan, worldwar, magicalrealism, romance)","List(kennethwharmon, available, pre, order, amazon, historicalfiction, japan, worldwar, magicalrealism, romance)","List(kennethwharmon, avail, pre, order, amazon, historicalfict, japan, worldwar, magicalr, romanc)"
2020-07-12T03:43:32.000+0000,#Japan,#since2008 #tobebeautifu #tatioactivedx #tatio #tatio #shape #shapeslimming #softgel #sofrgelcapsules #fda… https://t.co/iZz0ERHA7l,0.0,1,since tobebeautifu tatioactivedx tatio tatio shape shapeslimming softgel sofrgelcapsules fda,"List(, since, tobebeautifu, tatioactivedx, tatio, tatio, shape, shapeslimming, softgel, sofrgelcapsules, fda)","List(, since, tobebeautifu, tatioactivedx, tatio, tatio, shape, shapeslimming, softgel, sofrgelcapsules, fda)","List(, sinc, tobebeautifu, tatioactivedx, tatio, tatio, shape, shapeslim, softgel, sofrgelcapsul, fda)"
2020-07-12T03:43:16.000+0000,#Japan,#HIROSHIMA : THE NEXT DAY https://t.co/1dsmMEILXm #atomic #bomb #nuclear #japan #history,0.0,1,HIROSHIMA THE NEXT DAY atomic bomb nuclear japan history,"List(, hiroshima, the, next, day, atomic, bomb, nuclear, japan, history)","List(, hiroshima, next, day, atomic, bomb, nuclear, japan, history)","List(, hiroshima, next, day, atom, bomb, nuclear, japan, histori)"
2020-07-12T03:42:12.000+0000,#Japan,RT @Nuke_Info: Regulator demands #TEPCO clarify responsibilities | NHK WORLD-#JAPAN News https://t.co/PQTg4SbQ8k,0.0,1,RT Nuke Info Regulator demands TEPCO clarify responsibilities NHK WORLD JAPAN News,"List(rt, nuke, info, regulator, demands, tepco, clarify, responsibilities, nhk, world, japan, news)","List(nuke, info, regulator, demands, tepco, clarify, responsibilities, nhk, world, japan, news)","List(nuke, info, regul, demand, tepco, clarifi, respons, nhk, world, japan, news)"
2020-07-12T03:41:55.000+0000,#Japan,RT @AlArabiya_Eng: Watch: The former #Nissan boss Carlos #Ghosn shares new details on his daring escape from #Japan while under close surve…,0.6597,2,RT AlArabiya Eng Watch The former Nissan boss Carlos Ghosn shares new details on his daring escape from Japan while under close surve,"List(rt, alarabiya, eng, watch, the, former, nissan, boss, carlos, ghosn, shares, new, details, on, his, daring, escape, from, japan, while, under, close, surve)","List(alarabiya, eng, watch, former, nissan, boss, carlos, ghosn, shares, new, details, daring, escape, japan, close, surve)","List(alarabiya, eng, watch, former, nissan, boss, carlo, ghosn, share, new, detail, dare, escap, japan, close, surv)"
2020-07-12T03:41:49.000+0000,#Japan,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",0.0,1,RT nirvana core Almost of YouTube s site traffic comes from the U S According to Alexa s estimates YouTube viewers are most likely,"List(rt, nirvana, core, almost, of, youtube, s, site, traffic, comes, from, the, u, s, according, to, alexa, s, estimates, youtube, viewers, are, most, likely)","List(nirvana, core, almost, youtube, site, traffic, comes, u, according, alexa, estimates, youtube, viewers, likely)","List(nirvana, core, almost, youtub, site, traffic, come, u, accord, alexa, estim, youtub, viewer, like)"
2020-07-12T03:41:47.000+0000,#Japan,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",0.0,1,RT nirvana core Almost of YouTube s site traffic comes from the U S According to Alexa s estimates YouTube viewers are most likely,"List(rt, nirvana, core, almost, of, youtube, s, site, traffic, comes, from, the, u, s, according, to, alexa, s, estimates, youtube, viewers, are, most, likely)","List(nirvana, core, almost, youtube, site, traffic, comes, u, according, alexa, estimates, youtube, viewers, likely)","List(nirvana, core, almost, youtub, site, traffic, come, u, accord, alexa, estim, youtub, viewer, like)"
2020-07-12T03:41:40.000+0000,#Japan,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",0.0,1,RT nirvana core Almost of YouTube s site traffic comes from the U S According to Alexa s estimates YouTube viewers are most likely,"List(rt, nirvana, core, almost, of, youtube, s, site, traffic, comes, from, the, u, s, according, to, alexa, s, estimates, youtube, viewers, are, most, likely)","List(nirvana, core, almost, youtube, site, traffic, comes, u, according, alexa, estimates, youtube, viewers, likely)","List(nirvana, core, almost, youtub, site, traffic, come, u, accord, alexa, estim, youtub, viewer, like)"


created_at,search_query,text,polarity_class,words_stem
2020-07-12T03:45:47.000+0000,#Japan,RT @Streetcar_honda: Cr. Owner : @L2PJapanTAKERU #Honda #Civic #FD2 #MugenRR #l2pjapan #Japan https://t.co/1WNFsNvNPf,1,"List(streetcar, honda, cr, owner, l, pjapantakeru, honda, civic, fd, mugenrr, l, pjapan, japan)"
2020-07-12T03:44:41.000+0000,#Japan,RT @KennethWHarmon: Available to pre-order on Amazon. #HistoricalFiction #Japan #WorldWar2 #MagicalRealism #Romance https://t.co/yyU9ECG…,2,"List(kennethwharmon, avail, pre, order, amazon, historicalfict, japan, worldwar, magicalr, romanc)"
2020-07-12T03:44:13.000+0000,#Japan,RT @KennethWHarmon: Available to pre-order on Amazon. #HistoricalFiction #Japan #WorldWar2 #MagicalRealism #Romance https://t.co/yyU9ECG…,2,"List(kennethwharmon, avail, pre, order, amazon, historicalfict, japan, worldwar, magicalr, romanc)"
2020-07-12T03:43:32.000+0000,#Japan,#since2008 #tobebeautifu #tatioactivedx #tatio #tatio #shape #shapeslimming #softgel #sofrgelcapsules #fda… https://t.co/iZz0ERHA7l,1,"List(, sinc, tobebeautifu, tatioactivedx, tatio, tatio, shape, shapeslim, softgel, sofrgelcapsul, fda)"
2020-07-12T03:43:16.000+0000,#Japan,#HIROSHIMA : THE NEXT DAY https://t.co/1dsmMEILXm #atomic #bomb #nuclear #japan #history,1,"List(, hiroshima, next, day, atom, bomb, nuclear, japan, histori)"
2020-07-12T03:42:12.000+0000,#Japan,RT @Nuke_Info: Regulator demands #TEPCO clarify responsibilities | NHK WORLD-#JAPAN News https://t.co/PQTg4SbQ8k,1,"List(nuke, info, regul, demand, tepco, clarifi, respons, nhk, world, japan, news)"
2020-07-12T03:41:55.000+0000,#Japan,RT @AlArabiya_Eng: Watch: The former #Nissan boss Carlos #Ghosn shares new details on his daring escape from #Japan while under close surve…,2,"List(alarabiya, eng, watch, former, nissan, boss, carlo, ghosn, share, new, detail, dare, escap, japan, close, surv)"
2020-07-12T03:41:49.000+0000,#Japan,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",1,"List(nirvana, core, almost, youtub, site, traffic, come, u, accord, alexa, estim, youtub, viewer, like)"
2020-07-12T03:41:47.000+0000,#Japan,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",1,"List(nirvana, core, almost, youtub, site, traffic, come, u, accord, alexa, estim, youtub, viewer, like)"
2020-07-12T03:41:40.000+0000,#Japan,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",1,"List(nirvana, core, almost, youtub, site, traffic, come, u, accord, alexa, estim, youtub, viewer, like)"


In [0]:
# Split into test and train
df_filtered.printSchema()

trainDF, testDF = df_filtered.randomSplit([.8, .2], seed=42)

root
 |-- created_at: timestamp (nullable = true)
 |-- search_query: string (nullable = true)
 |-- text: string (nullable = true)
 |-- polarity_class: integer (nullable = true)
 |-- words_stem: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [0]:
# Set up pipeline components

from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer()
cv.setInputCol("words_stem")
cv.setOutputCol("vectorized_text")

from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(modelType="multinomial", featuresCol="vectorized_text", labelCol="polarity_class")

In [0]:

from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[cv, nb])
pipelineModel = pipeline.fit(trainDF)
predDF = pipelineModel.transform(testDF)
display(predDF.select("text", "polarity_class", "prediction"))
predDF = predDF.select("text", "polarity_class", col("prediction").cast(DoubleType()))
predDF.printSchema()

root
 |-- text: string (nullable = true)
 |-- polarity_class: integer (nullable = true)
 |-- prediction: double (nullable = false)



text,polarity_class,prediction
"RT @Senator_Patrick: Given #China's assault on democracy in #HongKong, Australia should give notice to terminate our extradition agreement…",0,0.0
"RT @simjhenderson: 1. With the passage of the #NationalSecurityLaw in #HongKong, #Australia should immediately terminate the Australia-Hong…",1,2.0
RT @PeterKhalilMP: 1/2 Given that Hong Kongers & others are now subject to Beijing’s new national security laws that override the independe…,2,2.0
"""RT @AMFChina: """"The German Chancellor is under growing pressure to cut Germany's ties with Beijing as the Hong Kong crisis triggers a backla…""",0,0.0
"""RT @AMFChina: """"The German Chancellor is under growing pressure to cut Germany's ties with Beijing as the Hong Kong crisis triggers a backla…""",0,0.0
"RT @Senator_Patrick: Given #China's assault on democracy in #HongKong, Australia should give notice to terminate our extradition agreement…",0,0.0
"""RT @AMFChina: """"The German Chancellor is under growing pressure to cut Germany's ties with Beijing as the Hong Kong crisis triggers a backla…""",0,0.0
"""RT @AMFChina: """"The German Chancellor is under growing pressure to cut Germany's ties with Beijing as the Hong Kong crisis triggers a backla…""",0,0.0
"RT @alphacentauriii: Pro-#HongKong democracy students in Perth allege death threats, intimidation from Chinese nationals. The Australian Fe…",0,0.0
TikTok to leave Hong Kong market ‘within days’. It was stated by #TikTok that it will quit the Hong Kong market aft… https://t.co/YLd022lYxw,1,0.0


In [0]:
from sklearn.metrics import accuracy_score
import numpy as np
print("accuracy: ", accuracy_score(np.array(testDF.select("polarity_class").collect()), predDF.select("prediction").collect()))

accuracy:  0.8320457620863575


In [0]:
df2 = spark.read.option("wholeFile", True)\
                .option("multiline", True)\
                .option("header", True)\
                .option("escape","\"")\
                .option("timestampFormat", "yyyy-MM-dd HH:mm:ss")\
                .csv("/FileStore/tables/US_Covid.csv")\

df2 = df2.dropna()
display(df2)


text,created_at,search_query
#Zoonosis Scientists have evidence #SARSCoV2 spreads explosively in white-tailed #deer + that the virus is widespread in this deer population across #US. Scientists say this could have vast implications for long-term course of the #pandemic. #BeVegan https://t.co/1d27EW63wb,2021-12-06T17:56:09.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets
New COVID-19 Variant Omicron: Latest US Travel Restrictions 🇺🇸 #omicron #COVID19 #UnitedStates #travel #TravelRestrictions https://t.co/eQl7mZciYw,2021-12-06T17:49:51.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets
CDC: 99% of Cases in the US are from Delta variant. #DeltaVariant #CDC #US #COVID19 https://t.co/gDOqSr0BC0,2021-12-06T17:39:50.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets
"The #US, hit hard by the #pandemic, is engineering a switch to #telehealth which, historically, has only been used by self-paying patients. US private health #insurers & even the various Medicaid programs - allow more reimbursement for telehealth. #ACFView https://t.co/PXqkhKubig https://t.co/VLIZrcWlVx",2021-12-06T17:07:00.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets
#Manufacturing in the #US grew in the month of #November by a margin of .3 points over the previous month and of 11 points overall according to a #report published last week. The industry is one of a few to see steady growth through the #covid19 pandemic. https://t.co/uWM96rVUQi https://t.co/8O41hZrVX0,2021-12-06T17:00:31.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets
@medrxivpreprint Inequalities of the US COVID-19 vaccination drive https://t.co/3iLtkXV4OP @medrxivpreprint @UCLA @Cornell #COVID19 #coronavirus #covid #SARSCoV2 #vaccine #vaccination #US #USA,2021-12-06T16:49:51.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets
Inequalities of the US COVID-19 vaccination drive https://t.co/3iLtkXV4OP @medrxivpreprint @UCLA @Cornell #COVID19 #coronavirus #covid #SARSCoV2 #vaccine #vaccination #US #USA https://t.co/kR9g0omEvs,2021-12-06T16:49:41.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets
New York's workers must all have vaccine by 27 December via @BBCNews https://t.co/myDGlI9nTb #US #NY #states #COVID19 #Omicron #DeltaVariant #vaccines #economy #economics #workers #wages #families #Politics #politicalparties #RegionalSecurity #recovery,2021-12-06T16:41:08.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets
"Update on the 7-day Avg. of new #covid19 Deaths reported by #UnitedStates states. https://t.co/eKdLIypHFk As at yesterday, top states with high 7-day average of deaths are: #Missouri #Michigan #California #Pennsylvania #Texas #omicron #coronavirus #Illinois #NewJersey https://t.co/luOYgczOx2",2021-12-06T16:27:40.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets
"Update on the 7-day Avg. of new #covid19 cases reported by #UnitedStates states. https://t.co/eKdLIypHFk As at yesterday, top states with high 7-day average of cases: #Michigan #NewYork #Ohio #Pennsylvania #Illinois #omicron #coronavirus #covid_19 #Georgia #Arizona https://t.co/qgY5aq5JRB",2021-12-06T16:21:56.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets


In [0]:
# remove links
df2 = df2.withColumn('text_cleaned', regexp_replace(df2.text, r'http\S+', ''))

# remove all characters except alphabetic ones
# replace ' with nothing to make sure contractions are not split
df2 = df2.withColumn('text_cleaned', regexp_replace(df2.text_cleaned, "\'", ''))
df2 = df2.withColumn('text_cleaned', regexp_replace(df2.text_cleaned, '[^a-zA-Z\s]', ' '))

# group whitespace
df2 = df2.withColumn('text_cleaned', regexp_replace(df2.text_cleaned, '\s+', ' '))

# tokenize the text into words
df2 = Tokenizer(inputCol='text_cleaned', outputCol='words').transform(df2)

# remove stopwords and 'rt' (rt is twitter lingo for retweet, has no imapct on text sentiment)
nltk.download('stopwords')
stop_words = stopwords.words("english")
stop_words.append("rt")
udf_remove_stop = udf(lambda x: [i for i in x if not i.lower() in stop_words], ArrayType(StringType()))
df2 = df2.withColumn("words_cleaned", udf_remove_stop("words"))

# convert words to stems
stemmer = PorterStemmer()
udf_stem = udf(lambda x: [stemmer.stem(i) for i in x], ArrayType(StringType()))
df2 = df2.withColumn("words_stem", udf_stem("words_cleaned"))


display(df2)

df2 = df2.select("created_at", "search_query", "text", "words_stem")

display(df2)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


text,created_at,search_query,text_cleaned,words,words_cleaned,words_stem
#Zoonosis Scientists have evidence #SARSCoV2 spreads explosively in white-tailed #deer + that the virus is widespread in this deer population across #US. Scientists say this could have vast implications for long-term course of the #pandemic. #BeVegan https://t.co/1d27EW63wb,2021-12-06T17:56:09.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,Zoonosis Scientists have evidence SARSCoV spreads explosively in white tailed deer that the virus is widespread in this deer population across US Scientists say this could have vast implications for long term course of the pandemic BeVegan,"List(, zoonosis, scientists, have, evidence, sarscov, spreads, explosively, in, white, tailed, deer, that, the, virus, is, widespread, in, this, deer, population, across, us, scientists, say, this, could, have, vast, implications, for, long, term, course, of, the, pandemic, bevegan)","List(, zoonosis, scientists, evidence, sarscov, spreads, explosively, white, tailed, deer, virus, widespread, deer, population, across, us, scientists, say, could, vast, implications, long, term, course, pandemic, bevegan)","List(, zoonosi, scientist, evid, sarscov, spread, explos, white, tail, deer, viru, widespread, deer, popul, across, us, scientist, say, could, vast, implic, long, term, cours, pandem, bevegan)"
New COVID-19 Variant Omicron: Latest US Travel Restrictions 🇺🇸 #omicron #COVID19 #UnitedStates #travel #TravelRestrictions https://t.co/eQl7mZciYw,2021-12-06T17:49:51.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,New COVID Variant Omicron Latest US Travel Restrictions omicron COVID UnitedStates travel TravelRestrictions,"List(new, covid, variant, omicron, latest, us, travel, restrictions, omicron, covid, unitedstates, travel, travelrestrictions)","List(new, covid, variant, omicron, latest, us, travel, restrictions, omicron, covid, unitedstates, travel, travelrestrictions)","List(new, covid, variant, omicron, latest, us, travel, restrict, omicron, covid, unitedst, travel, travelrestrict)"
CDC: 99% of Cases in the US are from Delta variant. #DeltaVariant #CDC #US #COVID19 https://t.co/gDOqSr0BC0,2021-12-06T17:39:50.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,CDC of Cases in the US are from Delta variant DeltaVariant CDC US COVID,"List(cdc, of, cases, in, the, us, are, from, delta, variant, deltavariant, cdc, us, covid)","List(cdc, cases, us, delta, variant, deltavariant, cdc, us, covid)","List(cdc, case, us, delta, variant, deltavari, cdc, us, covid)"
"The #US, hit hard by the #pandemic, is engineering a switch to #telehealth which, historically, has only been used by self-paying patients. US private health #insurers & even the various Medicaid programs - allow more reimbursement for telehealth. #ACFView https://t.co/PXqkhKubig https://t.co/VLIZrcWlVx",2021-12-06T17:07:00.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,The US hit hard by the pandemic is engineering a switch to telehealth which historically has only been used by self paying patients US private health insurers amp even the various Medicaid programs allow more reimbursement for telehealth ACFView,"List(the, us, hit, hard, by, the, pandemic, is, engineering, a, switch, to, telehealth, which, historically, has, only, been, used, by, self, paying, patients, us, private, health, insurers, amp, even, the, various, medicaid, programs, allow, more, reimbursement, for, telehealth, acfview)","List(us, hit, hard, pandemic, engineering, switch, telehealth, historically, used, self, paying, patients, us, private, health, insurers, amp, even, various, medicaid, programs, allow, reimbursement, telehealth, acfview)","List(us, hit, hard, pandem, engin, switch, telehealth, histor, use, self, pay, patient, us, privat, health, insur, amp, even, variou, medicaid, program, allow, reimburs, telehealth, acfview)"
#Manufacturing in the #US grew in the month of #November by a margin of .3 points over the previous month and of 11 points overall according to a #report published last week. The industry is one of a few to see steady growth through the #covid19 pandemic. https://t.co/uWM96rVUQi https://t.co/8O41hZrVX0,2021-12-06T17:00:31.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,Manufacturing in the US grew in the month of November by a margin of points over the previous month and of points overall according to a report published last week The industry is one of a few to see steady growth through the covid pandemic,"List(, manufacturing, in, the, us, grew, in, the, month, of, november, by, a, margin, of, points, over, the, previous, month, and, of, points, overall, according, to, a, report, published, last, week, the, industry, is, one, of, a, few, to, see, steady, growth, through, the, covid, pandemic)","List(, manufacturing, us, grew, month, november, margin, points, previous, month, points, overall, according, report, published, last, week, industry, one, see, steady, growth, covid, pandemic)","List(, manufactur, us, grew, month, novemb, margin, point, previou, month, point, overal, accord, report, publish, last, week, industri, one, see, steadi, growth, covid, pandem)"
@medrxivpreprint Inequalities of the US COVID-19 vaccination drive https://t.co/3iLtkXV4OP @medrxivpreprint @UCLA @Cornell #COVID19 #coronavirus #covid #SARSCoV2 #vaccine #vaccination #US #USA,2021-12-06T16:49:51.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,medrxivpreprint Inequalities of the US COVID vaccination drive medrxivpreprint UCLA Cornell COVID coronavirus covid SARSCoV vaccine vaccination US USA,"List(, medrxivpreprint, inequalities, of, the, us, covid, vaccination, drive, medrxivpreprint, ucla, cornell, covid, coronavirus, covid, sarscov, vaccine, vaccination, us, usa)","List(, medrxivpreprint, inequalities, us, covid, vaccination, drive, medrxivpreprint, ucla, cornell, covid, coronavirus, covid, sarscov, vaccine, vaccination, us, usa)","List(, medrxivpreprint, inequ, us, covid, vaccin, drive, medrxivpreprint, ucla, cornel, covid, coronaviru, covid, sarscov, vaccin, vaccin, us, usa)"
Inequalities of the US COVID-19 vaccination drive https://t.co/3iLtkXV4OP @medrxivpreprint @UCLA @Cornell #COVID19 #coronavirus #covid #SARSCoV2 #vaccine #vaccination #US #USA https://t.co/kR9g0omEvs,2021-12-06T16:49:41.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,Inequalities of the US COVID vaccination drive medrxivpreprint UCLA Cornell COVID coronavirus covid SARSCoV vaccine vaccination US USA,"List(inequalities, of, the, us, covid, vaccination, drive, medrxivpreprint, ucla, cornell, covid, coronavirus, covid, sarscov, vaccine, vaccination, us, usa)","List(inequalities, us, covid, vaccination, drive, medrxivpreprint, ucla, cornell, covid, coronavirus, covid, sarscov, vaccine, vaccination, us, usa)","List(inequ, us, covid, vaccin, drive, medrxivpreprint, ucla, cornel, covid, coronaviru, covid, sarscov, vaccin, vaccin, us, usa)"
New York's workers must all have vaccine by 27 December via @BBCNews https://t.co/myDGlI9nTb #US #NY #states #COVID19 #Omicron #DeltaVariant #vaccines #economy #economics #workers #wages #families #Politics #politicalparties #RegionalSecurity #recovery,2021-12-06T16:41:08.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,New Yorks workers must all have vaccine by December via BBCNews US NY states COVID Omicron DeltaVariant vaccines economy economics workers wages families Politics politicalparties RegionalSecurity recovery,"List(new, yorks, workers, must, all, have, vaccine, by, december, via, bbcnews, us, ny, states, covid, omicron, deltavariant, vaccines, economy, economics, workers, wages, families, politics, politicalparties, regionalsecurity, recovery)","List(new, yorks, workers, must, vaccine, december, via, bbcnews, us, ny, states, covid, omicron, deltavariant, vaccines, economy, economics, workers, wages, families, politics, politicalparties, regionalsecurity, recovery)","List(new, york, worker, must, vaccin, decemb, via, bbcnew, us, ny, state, covid, omicron, deltavari, vaccin, economi, econom, worker, wage, famili, polit, politicalparti, regionalsecur, recoveri)"
"Update on the 7-day Avg. of new #covid19 Deaths reported by #UnitedStates states. https://t.co/eKdLIypHFk As at yesterday, top states with high 7-day average of deaths are: #Missouri #Michigan #California #Pennsylvania #Texas #omicron #coronavirus #Illinois #NewJersey https://t.co/luOYgczOx2",2021-12-06T16:27:40.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,Update on the day Avg of new covid Deaths reported by UnitedStates states As at yesterday top states with high day average of deaths are Missouri Michigan California Pennsylvania Texas omicron coronavirus Illinois NewJersey,"List(update, on, the, day, avg, of, new, covid, deaths, reported, by, unitedstates, states, as, at, yesterday, top, states, with, high, day, average, of, deaths, are, missouri, michigan, california, pennsylvania, texas, omicron, coronavirus, illinois, newjersey)","List(update, day, avg, new, covid, deaths, reported, unitedstates, states, yesterday, top, states, high, day, average, deaths, missouri, michigan, california, pennsylvania, texas, omicron, coronavirus, illinois, newjersey)","List(updat, day, avg, new, covid, death, report, unitedst, state, yesterday, top, state, high, day, averag, death, missouri, michigan, california, pennsylvania, texa, omicron, coronaviru, illinoi, newjersey)"
"Update on the 7-day Avg. of new #covid19 cases reported by #UnitedStates states. https://t.co/eKdLIypHFk As at yesterday, top states with high 7-day average of cases: #Michigan #NewYork #Ohio #Pennsylvania #Illinois #omicron #coronavirus #covid_19 #Georgia #Arizona https://t.co/qgY5aq5JRB",2021-12-06T16:21:56.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,Update on the day Avg of new covid cases reported by UnitedStates states As at yesterday top states with high day average of cases Michigan NewYork Ohio Pennsylvania Illinois omicron coronavirus covid Georgia Arizona,"List(update, on, the, day, avg, of, new, covid, cases, reported, by, unitedstates, states, as, at, yesterday, top, states, with, high, day, average, of, cases, michigan, newyork, ohio, pennsylvania, illinois, omicron, coronavirus, covid, georgia, arizona)","List(update, day, avg, new, covid, cases, reported, unitedstates, states, yesterday, top, states, high, day, average, cases, michigan, newyork, ohio, pennsylvania, illinois, omicron, coronavirus, covid, georgia, arizona)","List(updat, day, avg, new, covid, case, report, unitedst, state, yesterday, top, state, high, day, averag, case, michigan, newyork, ohio, pennsylvania, illinoi, omicron, coronaviru, covid, georgia, arizona)"


created_at,search_query,text,words_stem
2021-12-06T17:56:09.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,#Zoonosis Scientists have evidence #SARSCoV2 spreads explosively in white-tailed #deer + that the virus is widespread in this deer population across #US. Scientists say this could have vast implications for long-term course of the #pandemic. #BeVegan https://t.co/1d27EW63wb,"List(, zoonosi, scientist, evid, sarscov, spread, explos, white, tail, deer, viru, widespread, deer, popul, across, us, scientist, say, could, vast, implic, long, term, cours, pandem, bevegan)"
2021-12-06T17:49:51.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,New COVID-19 Variant Omicron: Latest US Travel Restrictions 🇺🇸 #omicron #COVID19 #UnitedStates #travel #TravelRestrictions https://t.co/eQl7mZciYw,"List(new, covid, variant, omicron, latest, us, travel, restrict, omicron, covid, unitedst, travel, travelrestrict)"
2021-12-06T17:39:50.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,CDC: 99% of Cases in the US are from Delta variant. #DeltaVariant #CDC #US #COVID19 https://t.co/gDOqSr0BC0,"List(cdc, case, us, delta, variant, deltavari, cdc, us, covid)"
2021-12-06T17:07:00.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,"The #US, hit hard by the #pandemic, is engineering a switch to #telehealth which, historically, has only been used by self-paying patients. US private health #insurers & even the various Medicaid programs - allow more reimbursement for telehealth. #ACFView https://t.co/PXqkhKubig https://t.co/VLIZrcWlVx","List(us, hit, hard, pandem, engin, switch, telehealth, histor, use, self, pay, patient, us, privat, health, insur, amp, even, variou, medicaid, program, allow, reimburs, telehealth, acfview)"
2021-12-06T17:00:31.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,#Manufacturing in the #US grew in the month of #November by a margin of .3 points over the previous month and of 11 points overall according to a #report published last week. The industry is one of a few to see steady growth through the #covid19 pandemic. https://t.co/uWM96rVUQi https://t.co/8O41hZrVX0,"List(, manufactur, us, grew, month, novemb, margin, point, previou, month, point, overal, accord, report, publish, last, week, industri, one, see, steadi, growth, covid, pandem)"
2021-12-06T16:49:51.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,@medrxivpreprint Inequalities of the US COVID-19 vaccination drive https://t.co/3iLtkXV4OP @medrxivpreprint @UCLA @Cornell #COVID19 #coronavirus #covid #SARSCoV2 #vaccine #vaccination #US #USA,"List(, medrxivpreprint, inequ, us, covid, vaccin, drive, medrxivpreprint, ucla, cornel, covid, coronaviru, covid, sarscov, vaccin, vaccin, us, usa)"
2021-12-06T16:49:41.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,Inequalities of the US COVID-19 vaccination drive https://t.co/3iLtkXV4OP @medrxivpreprint @UCLA @Cornell #COVID19 #coronavirus #covid #SARSCoV2 #vaccine #vaccination #US #USA https://t.co/kR9g0omEvs,"List(inequ, us, covid, vaccin, drive, medrxivpreprint, ucla, cornel, covid, coronaviru, covid, sarscov, vaccin, vaccin, us, usa)"
2021-12-06T16:41:08.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,New York's workers must all have vaccine by 27 December via @BBCNews https://t.co/myDGlI9nTb #US #NY #states #COVID19 #Omicron #DeltaVariant #vaccines #economy #economics #workers #wages #families #Politics #politicalparties #RegionalSecurity #recovery,"List(new, york, worker, must, vaccin, decemb, via, bbcnew, us, ny, state, covid, omicron, deltavari, vaccin, economi, econom, worker, wage, famili, polit, politicalparti, regionalsecur, recoveri)"
2021-12-06T16:27:40.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,"Update on the 7-day Avg. of new #covid19 Deaths reported by #UnitedStates states. https://t.co/eKdLIypHFk As at yesterday, top states with high 7-day average of deaths are: #Missouri #Michigan #California #Pennsylvania #Texas #omicron #coronavirus #Illinois #NewJersey https://t.co/luOYgczOx2","List(updat, day, avg, new, covid, death, report, unitedst, state, yesterday, top, state, high, day, averag, death, missouri, michigan, california, pennsylvania, texa, omicron, coronaviru, illinoi, newjersey)"
2021-12-06T16:21:56.000+0000,(#US OR #UnitedStates) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,"Update on the 7-day Avg. of new #covid19 cases reported by #UnitedStates states. https://t.co/eKdLIypHFk As at yesterday, top states with high 7-day average of cases: #Michigan #NewYork #Ohio #Pennsylvania #Illinois #omicron #coronavirus #covid_19 #Georgia #Arizona https://t.co/qgY5aq5JRB","List(updat, day, avg, new, covid, case, report, unitedst, state, yesterday, top, state, high, day, averag, case, michigan, newyork, ohio, pennsylvania, illinoi, omicron, coronaviru, covid, georgia, arizona)"


In [0]:
df2_pred = pipelineModel.transform(df2)
display(df2_pred.select("text", "prediction"))
df2_pred = df2_pred.select("text", col("prediction").cast(DoubleType()))

display(df2_pred.groupBy("prediction").count())

text,prediction
#Zoonosis Scientists have evidence #SARSCoV2 spreads explosively in white-tailed #deer + that the virus is widespread in this deer population across #US. Scientists say this could have vast implications for long-term course of the #pandemic. #BeVegan https://t.co/1d27EW63wb,0.0
New COVID-19 Variant Omicron: Latest US Travel Restrictions 🇺🇸 #omicron #COVID19 #UnitedStates #travel #TravelRestrictions https://t.co/eQl7mZciYw,1.0
CDC: 99% of Cases in the US are from Delta variant. #DeltaVariant #CDC #US #COVID19 https://t.co/gDOqSr0BC0,1.0
"The #US, hit hard by the #pandemic, is engineering a switch to #telehealth which, historically, has only been used by self-paying patients. US private health #insurers & even the various Medicaid programs - allow more reimbursement for telehealth. #ACFView https://t.co/PXqkhKubig https://t.co/VLIZrcWlVx",2.0
#Manufacturing in the #US grew in the month of #November by a margin of .3 points over the previous month and of 11 points overall according to a #report published last week. The industry is one of a few to see steady growth through the #covid19 pandemic. https://t.co/uWM96rVUQi https://t.co/8O41hZrVX0,2.0
@medrxivpreprint Inequalities of the US COVID-19 vaccination drive https://t.co/3iLtkXV4OP @medrxivpreprint @UCLA @Cornell #COVID19 #coronavirus #covid #SARSCoV2 #vaccine #vaccination #US #USA,2.0
Inequalities of the US COVID-19 vaccination drive https://t.co/3iLtkXV4OP @medrxivpreprint @UCLA @Cornell #COVID19 #coronavirus #covid #SARSCoV2 #vaccine #vaccination #US #USA https://t.co/kR9g0omEvs,1.0
New York's workers must all have vaccine by 27 December via @BBCNews https://t.co/myDGlI9nTb #US #NY #states #COVID19 #Omicron #DeltaVariant #vaccines #economy #economics #workers #wages #families #Politics #politicalparties #RegionalSecurity #recovery,0.0
"Update on the 7-day Avg. of new #covid19 Deaths reported by #UnitedStates states. https://t.co/eKdLIypHFk As at yesterday, top states with high 7-day average of deaths are: #Missouri #Michigan #California #Pennsylvania #Texas #omicron #coronavirus #Illinois #NewJersey https://t.co/luOYgczOx2",1.0
"Update on the 7-day Avg. of new #covid19 cases reported by #UnitedStates states. https://t.co/eKdLIypHFk As at yesterday, top states with high 7-day average of cases: #Michigan #NewYork #Ohio #Pennsylvania #Illinois #omicron #coronavirus #covid_19 #Georgia #Arizona https://t.co/qgY5aq5JRB",1.0


prediction,count
0.0,187
1.0,230
2.0,126


In [0]:
df3 = spark.read.option("wholeFile", True)\
                .option("multiline", True)\
                .option("header", True)\
                .option("escape","\"")\
                .option("timestampFormat", "yyyy-MM-dd HH:mm:ss")\
                .csv("/FileStore/tables/India_Covid.csv")\


df3 = df3.dropna()
display(df3)

text,created_at,search_query
#India achieves new milestone as over 50% of adult population are now fully vaccinated against #covid19 WATCH to know more! https://t.co/sUHLBrDleX,2021-12-06T17:56:25.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets
Union Health Minister #MansukhMandaviya on Monday said that 85 per cent of #India's eligible adult population has received the first dose of #Covid19vaccine. #Covid19 https://t.co/XT0ARyZKWA,2021-12-06T17:53:00.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets
#covidvariant #COVID19 #Covid #India wake up https://t.co/h6SSVdHqoX,2021-12-06T17:40:31.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets
Health Minister Mansukh Mandaviya: 4.6 lakh people died to COVID-19 in India. #MansukhMandaviya #COVID19 #India https://t.co/KySp7QmPW2,2021-12-06T17:30:25.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets
"While the #Covid19 pandemic led to disruptions in prevention campaigns against malaria, #India -- one of the highest burden countries globally -- reported a reduction in the malaria burden between 2019 and 2020, #malaria https://t.co/csDjuRdUag",2021-12-06T17:19:00.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets
Agenda of the Russian Ministers in India https://t.co/H72BOjrDpc via @DiplomaticIns #Russia #India #COVID19 #RussiaIndia #DruzhbaDosti @rajnathsingh @KremlinRussia_E @DrSJaishankar @mfa_russia @narendramodi @KremlinRussia_E @MEAIndia @PMOIndia @RusEmbIndia @IndEmbMoscow,2021-12-06T17:14:53.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets
"This is the only way #India has remained safe, because of #goodsamaritans be it pothole, be it #COVID19 Let's keep the spirit high, come together, forget differences, think wise and take India ahead Hats off to this guy and the kids https://t.co/5CNhfnYsYU",2021-12-06T16:51:37.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets
"Continuous #foreignfund outflows as well as concerns over impact of the new variant of #Covid19 on growth, dragged #India's key #equity market #indices into the red on Monday. #OmicronVirus #Omicronindia https://t.co/Zkb9UziKEQ",2021-12-06T16:37:00.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets
SOVEREIGN TOURS Tour and Travelling Check Our Exclusive Packages https://t.co/8XvZ9xSyjI . . . . . #tour #trip #travel #staysafe #leisure #vacations #holiday #photography #india #travelstory #travelagain #sovereigntours #travelpackages #tourism #staysafe #coronavirus #covid19 https://t.co/4N3EHVnfb2,2021-12-06T16:33:40.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets
COVID-19 cases are expected to increase by 15 January 2022: G. Srinivas Rao #coronavirus #India #OmicronVirus #COVID19 #Omicronindia #covidvariant #OmicronVarient #Hyderabad #Telangana #COVID @HiHyderabad @HydWatch @swachhhyd @Hyderabadiiiiii @viralvideovlogs @GuineeTags https://t.co/a2DcjHMPSI,2021-12-06T16:30:01.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets


In [0]:
# remove links
df3 = df3.withColumn('text_cleaned', regexp_replace(df3.text, r'http\S+', ''))

# remove all characters except alphabetic ones
# replace ' with nothing to make sure contractions are not split
df3 = df3.withColumn('text_cleaned', regexp_replace(df3.text_cleaned, "\'", ''))
df3 = df3.withColumn('text_cleaned', regexp_replace(df3.text_cleaned, '[^a-zA-Z\s]', ' '))

# group whitespace
df3 = df3.withColumn('text_cleaned', regexp_replace(df3.text_cleaned, '\s+', ' '))

# tokenize the text into words
df3 = Tokenizer(inputCol='text_cleaned', outputCol='words').transform(df3)

# remove stopwords and 'rt' (rt is twitter lingo for retweet, has no imapct on text sentiment)
nltk.download('stopwords')
stop_words = stopwords.words("english")
stop_words.append("rt")
udf_remove_stop = udf(lambda x: [i for i in x if not i.lower() in stop_words], ArrayType(StringType()))
df3 = df3.withColumn("words_cleaned", udf_remove_stop("words"))

# convert words to stems
stemmer = PorterStemmer()
udf_stem = udf(lambda x: [stemmer.stem(i) for i in x], ArrayType(StringType()))
df3 = df3.withColumn("words_stem", udf_stem("words_cleaned"))


display(df3)

df3 = df3.select("created_at", "search_query", "text", "words_stem")

display(df3)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


text,created_at,search_query,text_cleaned,words,words_cleaned,words_stem
#India achieves new milestone as over 50% of adult population are now fully vaccinated against #covid19 WATCH to know more! https://t.co/sUHLBrDleX,2021-12-06T17:56:25.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,India achieves new milestone as over of adult population are now fully vaccinated against covid WATCH to know more,"List(, india, achieves, new, milestone, as, over, of, adult, population, are, now, fully, vaccinated, against, covid, watch, to, know, more)","List(, india, achieves, new, milestone, adult, population, fully, vaccinated, covid, watch, know)","List(, india, achiev, new, mileston, adult, popul, fulli, vaccin, covid, watch, know)"
Union Health Minister #MansukhMandaviya on Monday said that 85 per cent of #India's eligible adult population has received the first dose of #Covid19vaccine. #Covid19 https://t.co/XT0ARyZKWA,2021-12-06T17:53:00.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,Union Health Minister MansukhMandaviya on Monday said that per cent of Indias eligible adult population has received the first dose of Covid vaccine Covid,"List(union, health, minister, mansukhmandaviya, on, monday, said, that, per, cent, of, indias, eligible, adult, population, has, received, the, first, dose, of, covid, vaccine, covid)","List(union, health, minister, mansukhmandaviya, monday, said, per, cent, indias, eligible, adult, population, received, first, dose, covid, vaccine, covid)","List(union, health, minist, mansukhmandaviya, monday, said, per, cent, india, elig, adult, popul, receiv, first, dose, covid, vaccin, covid)"
#covidvariant #COVID19 #Covid #India wake up https://t.co/h6SSVdHqoX,2021-12-06T17:40:31.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,covidvariant COVID Covid India wake up,"List(, covidvariant, covid, covid, india, wake, up)","List(, covidvariant, covid, covid, india, wake)","List(, covidvari, covid, covid, india, wake)"
Health Minister Mansukh Mandaviya: 4.6 lakh people died to COVID-19 in India. #MansukhMandaviya #COVID19 #India https://t.co/KySp7QmPW2,2021-12-06T17:30:25.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,Health Minister Mansukh Mandaviya lakh people died to COVID in India MansukhMandaviya COVID India,"List(health, minister, mansukh, mandaviya, lakh, people, died, to, covid, in, india, mansukhmandaviya, covid, india)","List(health, minister, mansukh, mandaviya, lakh, people, died, covid, india, mansukhmandaviya, covid, india)","List(health, minist, mansukh, mandaviya, lakh, peopl, die, covid, india, mansukhmandaviya, covid, india)"
"While the #Covid19 pandemic led to disruptions in prevention campaigns against malaria, #India -- one of the highest burden countries globally -- reported a reduction in the malaria burden between 2019 and 2020, #malaria https://t.co/csDjuRdUag",2021-12-06T17:19:00.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,While the Covid pandemic led to disruptions in prevention campaigns against malaria India one of the highest burden countries globally reported a reduction in the malaria burden between and malaria,"List(while, the, covid, pandemic, led, to, disruptions, in, prevention, campaigns, against, malaria, india, one, of, the, highest, burden, countries, globally, reported, a, reduction, in, the, malaria, burden, between, and, malaria)","List(covid, pandemic, led, disruptions, prevention, campaigns, malaria, india, one, highest, burden, countries, globally, reported, reduction, malaria, burden, malaria)","List(covid, pandem, led, disrupt, prevent, campaign, malaria, india, one, highest, burden, countri, global, report, reduct, malaria, burden, malaria)"
Agenda of the Russian Ministers in India https://t.co/H72BOjrDpc via @DiplomaticIns #Russia #India #COVID19 #RussiaIndia #DruzhbaDosti @rajnathsingh @KremlinRussia_E @DrSJaishankar @mfa_russia @narendramodi @KremlinRussia_E @MEAIndia @PMOIndia @RusEmbIndia @IndEmbMoscow,2021-12-06T17:14:53.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,Agenda of the Russian Ministers in India via DiplomaticIns Russia India COVID RussiaIndia DruzhbaDosti rajnathsingh KremlinRussia E DrSJaishankar mfa russia narendramodi KremlinRussia E MEAIndia PMOIndia RusEmbIndia IndEmbMoscow,"List(agenda, of, the, russian, ministers, in, india, via, diplomaticins, russia, india, covid, russiaindia, druzhbadosti, rajnathsingh, kremlinrussia, e, drsjaishankar, mfa, russia, narendramodi, kremlinrussia, e, meaindia, pmoindia, rusembindia, indembmoscow)","List(agenda, russian, ministers, india, via, diplomaticins, russia, india, covid, russiaindia, druzhbadosti, rajnathsingh, kremlinrussia, e, drsjaishankar, mfa, russia, narendramodi, kremlinrussia, e, meaindia, pmoindia, rusembindia, indembmoscow)","List(agenda, russian, minist, india, via, diplomaticin, russia, india, covid, russiaindia, druzhbadosti, rajnathsingh, kremlinrussia, e, drsjaishankar, mfa, russia, narendramodi, kremlinrussia, e, meaindia, pmoindia, rusembindia, indembmoscow)"
"This is the only way #India has remained safe, because of #goodsamaritans be it pothole, be it #COVID19 Let's keep the spirit high, come together, forget differences, think wise and take India ahead Hats off to this guy and the kids https://t.co/5CNhfnYsYU",2021-12-06T16:51:37.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,This is the only way India has remained safe because of goodsamaritans be it pothole be it COVID Lets keep the spirit high come together forget differences think wise and take India ahead Hats off to this guy and the kids,"List(this, is, the, only, way, india, has, remained, safe, because, of, goodsamaritans, be, it, pothole, be, it, covid, lets, keep, the, spirit, high, come, together, forget, differences, think, wise, and, take, india, ahead, hats, off, to, this, guy, and, the, kids)","List(way, india, remained, safe, goodsamaritans, pothole, covid, lets, keep, spirit, high, come, together, forget, differences, think, wise, take, india, ahead, hats, guy, kids)","List(way, india, remain, safe, goodsamaritan, pothol, covid, let, keep, spirit, high, come, togeth, forget, differ, think, wise, take, india, ahead, hat, guy, kid)"
"Continuous #foreignfund outflows as well as concerns over impact of the new variant of #Covid19 on growth, dragged #India's key #equity market #indices into the red on Monday. #OmicronVirus #Omicronindia https://t.co/Zkb9UziKEQ",2021-12-06T16:37:00.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,Continuous foreignfund outflows as well as concerns over impact of the new variant of Covid on growth dragged Indias key equity market indices into the red on Monday OmicronVirus Omicronindia,"List(continuous, foreignfund, outflows, as, well, as, concerns, over, impact, of, the, new, variant, of, covid, on, growth, dragged, indias, key, equity, market, indices, into, the, red, on, monday, omicronvirus, omicronindia)","List(continuous, foreignfund, outflows, well, concerns, impact, new, variant, covid, growth, dragged, indias, key, equity, market, indices, red, monday, omicronvirus, omicronindia)","List(continu, foreignfund, outflow, well, concern, impact, new, variant, covid, growth, drag, india, key, equiti, market, indic, red, monday, omicronviru, omicronindia)"
SOVEREIGN TOURS Tour and Travelling Check Our Exclusive Packages https://t.co/8XvZ9xSyjI . . . . . #tour #trip #travel #staysafe #leisure #vacations #holiday #photography #india #travelstory #travelagain #sovereigntours #travelpackages #tourism #staysafe #coronavirus #covid19 https://t.co/4N3EHVnfb2,2021-12-06T16:33:40.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,SOVEREIGN TOURS Tour and Travelling Check Our Exclusive Packages tour trip travel staysafe leisure vacations holiday photography india travelstory travelagain sovereigntours travelpackages tourism staysafe coronavirus covid,"List(sovereign, tours, tour, and, travelling, check, our, exclusive, packages, tour, trip, travel, staysafe, leisure, vacations, holiday, photography, india, travelstory, travelagain, sovereigntours, travelpackages, tourism, staysafe, coronavirus, covid)","List(sovereign, tours, tour, travelling, check, exclusive, packages, tour, trip, travel, staysafe, leisure, vacations, holiday, photography, india, travelstory, travelagain, sovereigntours, travelpackages, tourism, staysafe, coronavirus, covid)","List(sovereign, tour, tour, travel, check, exclus, packag, tour, trip, travel, staysaf, leisur, vacat, holiday, photographi, india, travelstori, travelagain, sovereigntour, travelpackag, tourism, staysaf, coronaviru, covid)"
COVID-19 cases are expected to increase by 15 January 2022: G. Srinivas Rao #coronavirus #India #OmicronVirus #COVID19 #Omicronindia #covidvariant #OmicronVarient #Hyderabad #Telangana #COVID @HiHyderabad @HydWatch @swachhhyd @Hyderabadiiiiii @viralvideovlogs @GuineeTags https://t.co/a2DcjHMPSI,2021-12-06T16:30:01.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,COVID cases are expected to increase by January G Srinivas Rao coronavirus India OmicronVirus COVID Omicronindia covidvariant OmicronVarient Hyderabad Telangana COVID HiHyderabad HydWatch swachhhyd Hyderabadiiiiii viralvideovlogs GuineeTags,"List(covid, cases, are, expected, to, increase, by, january, g, srinivas, rao, coronavirus, india, omicronvirus, covid, omicronindia, covidvariant, omicronvarient, hyderabad, telangana, covid, hihyderabad, hydwatch, swachhhyd, hyderabadiiiiii, viralvideovlogs, guineetags)","List(covid, cases, expected, increase, january, g, srinivas, rao, coronavirus, india, omicronvirus, covid, omicronindia, covidvariant, omicronvarient, hyderabad, telangana, covid, hihyderabad, hydwatch, swachhhyd, hyderabadiiiiii, viralvideovlogs, guineetags)","List(covid, case, expect, increas, januari, g, sriniva, rao, coronaviru, india, omicronviru, covid, omicronindia, covidvari, omicronvari, hyderabad, telangana, covid, hihyderabad, hydwatch, swachhhyd, hyderabadiiiiii, viralvideovlog, guineetag)"


created_at,search_query,text,words_stem
2021-12-06T17:56:25.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,#India achieves new milestone as over 50% of adult population are now fully vaccinated against #covid19 WATCH to know more! https://t.co/sUHLBrDleX,"List(, india, achiev, new, mileston, adult, popul, fulli, vaccin, covid, watch, know)"
2021-12-06T17:53:00.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,Union Health Minister #MansukhMandaviya on Monday said that 85 per cent of #India's eligible adult population has received the first dose of #Covid19vaccine. #Covid19 https://t.co/XT0ARyZKWA,"List(union, health, minist, mansukhmandaviya, monday, said, per, cent, india, elig, adult, popul, receiv, first, dose, covid, vaccin, covid)"
2021-12-06T17:40:31.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,#covidvariant #COVID19 #Covid #India wake up https://t.co/h6SSVdHqoX,"List(, covidvari, covid, covid, india, wake)"
2021-12-06T17:30:25.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,Health Minister Mansukh Mandaviya: 4.6 lakh people died to COVID-19 in India. #MansukhMandaviya #COVID19 #India https://t.co/KySp7QmPW2,"List(health, minist, mansukh, mandaviya, lakh, peopl, die, covid, india, mansukhmandaviya, covid, india)"
2021-12-06T17:19:00.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,"While the #Covid19 pandemic led to disruptions in prevention campaigns against malaria, #India -- one of the highest burden countries globally -- reported a reduction in the malaria burden between 2019 and 2020, #malaria https://t.co/csDjuRdUag","List(covid, pandem, led, disrupt, prevent, campaign, malaria, india, one, highest, burden, countri, global, report, reduct, malaria, burden, malaria)"
2021-12-06T17:14:53.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,Agenda of the Russian Ministers in India https://t.co/H72BOjrDpc via @DiplomaticIns #Russia #India #COVID19 #RussiaIndia #DruzhbaDosti @rajnathsingh @KremlinRussia_E @DrSJaishankar @mfa_russia @narendramodi @KremlinRussia_E @MEAIndia @PMOIndia @RusEmbIndia @IndEmbMoscow,"List(agenda, russian, minist, india, via, diplomaticin, russia, india, covid, russiaindia, druzhbadosti, rajnathsingh, kremlinrussia, e, drsjaishankar, mfa, russia, narendramodi, kremlinrussia, e, meaindia, pmoindia, rusembindia, indembmoscow)"
2021-12-06T16:51:37.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,"This is the only way #India has remained safe, because of #goodsamaritans be it pothole, be it #COVID19 Let's keep the spirit high, come together, forget differences, think wise and take India ahead Hats off to this guy and the kids https://t.co/5CNhfnYsYU","List(way, india, remain, safe, goodsamaritan, pothol, covid, let, keep, spirit, high, come, togeth, forget, differ, think, wise, take, india, ahead, hat, guy, kid)"
2021-12-06T16:37:00.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,"Continuous #foreignfund outflows as well as concerns over impact of the new variant of #Covid19 on growth, dragged #India's key #equity market #indices into the red on Monday. #OmicronVirus #Omicronindia https://t.co/Zkb9UziKEQ","List(continu, foreignfund, outflow, well, concern, impact, new, variant, covid, growth, drag, india, key, equiti, market, indic, red, monday, omicronviru, omicronindia)"
2021-12-06T16:33:40.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,SOVEREIGN TOURS Tour and Travelling Check Our Exclusive Packages https://t.co/8XvZ9xSyjI . . . . . #tour #trip #travel #staysafe #leisure #vacations #holiday #photography #india #travelstory #travelagain #sovereigntours #travelpackages #tourism #staysafe #coronavirus #covid19 https://t.co/4N3EHVnfb2,"List(sovereign, tour, tour, travel, check, exclus, packag, tour, trip, travel, staysaf, leisur, vacat, holiday, photographi, india, travelstori, travelagain, sovereigntour, travelpackag, tourism, staysaf, coronaviru, covid)"
2021-12-06T16:30:01.000+0000,(#India) AND (#Covid19 OR #Virus OR #Pandemic) -filter:retweets,COVID-19 cases are expected to increase by 15 January 2022: G. Srinivas Rao #coronavirus #India #OmicronVirus #COVID19 #Omicronindia #covidvariant #OmicronVarient #Hyderabad #Telangana #COVID @HiHyderabad @HydWatch @swachhhyd @Hyderabadiiiiii @viralvideovlogs @GuineeTags https://t.co/a2DcjHMPSI,"List(covid, case, expect, increas, januari, g, sriniva, rao, coronaviru, india, omicronviru, covid, omicronindia, covidvari, omicronvari, hyderabad, telangana, covid, hihyderabad, hydwatch, swachhhyd, hyderabadiiiiii, viralvideovlog, guineetag)"


In [0]:
df3_pred = pipelineModel.transform(df3)
display(df3_pred.select("text", "prediction"))
df3_pred = df3_pred.select("text", col("prediction").cast(DoubleType()))

display(df3_pred.groupBy("prediction").count())

text,prediction
#India achieves new milestone as over 50% of adult population are now fully vaccinated against #covid19 WATCH to know more! https://t.co/sUHLBrDleX,2.0
Union Health Minister #MansukhMandaviya on Monday said that 85 per cent of #India's eligible adult population has received the first dose of #Covid19vaccine. #Covid19 https://t.co/XT0ARyZKWA,1.0
#covidvariant #COVID19 #Covid #India wake up https://t.co/h6SSVdHqoX,1.0
Health Minister Mansukh Mandaviya: 4.6 lakh people died to COVID-19 in India. #MansukhMandaviya #COVID19 #India https://t.co/KySp7QmPW2,0.0
"While the #Covid19 pandemic led to disruptions in prevention campaigns against malaria, #India -- one of the highest burden countries globally -- reported a reduction in the malaria burden between 2019 and 2020, #malaria https://t.co/csDjuRdUag",0.0
Agenda of the Russian Ministers in India https://t.co/H72BOjrDpc via @DiplomaticIns #Russia #India #COVID19 #RussiaIndia #DruzhbaDosti @rajnathsingh @KremlinRussia_E @DrSJaishankar @mfa_russia @narendramodi @KremlinRussia_E @MEAIndia @PMOIndia @RusEmbIndia @IndEmbMoscow,2.0
"This is the only way #India has remained safe, because of #goodsamaritans be it pothole, be it #COVID19 Let's keep the spirit high, come together, forget differences, think wise and take India ahead Hats off to this guy and the kids https://t.co/5CNhfnYsYU",2.0
"Continuous #foreignfund outflows as well as concerns over impact of the new variant of #Covid19 on growth, dragged #India's key #equity market #indices into the red on Monday. #OmicronVirus #Omicronindia https://t.co/Zkb9UziKEQ",2.0
SOVEREIGN TOURS Tour and Travelling Check Our Exclusive Packages https://t.co/8XvZ9xSyjI . . . . . #tour #trip #travel #staysafe #leisure #vacations #holiday #photography #india #travelstory #travelagain #sovereigntours #travelpackages #tourism #staysafe #coronavirus #covid19 https://t.co/4N3EHVnfb2,1.0
COVID-19 cases are expected to increase by 15 January 2022: G. Srinivas Rao #coronavirus #India #OmicronVirus #COVID19 #Omicronindia #covidvariant #OmicronVarient #Hyderabad #Telangana #COVID @HiHyderabad @HydWatch @swachhhyd @Hyderabadiiiiii @viralvideovlogs @GuineeTags https://t.co/a2DcjHMPSI,1.0


prediction,count
0.0,353
1.0,441
2.0,206
