In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Location of the CSV file loaded by this notebook.
test_file_path = "/FileStore/tables/07_2020.csv"

# Define Schema
# An explicit schema is passed to the reader instead of relying on inference.
# `polarity` is deliberately read as a string: it is cast to float below, and
# rows whose value cannot be cast are discarded.
schema = StructType([StructField('created_at', TimestampType(), True),
                     StructField('file_name', StringType(), True),
                     StructField('followers', IntegerType(), True),
                     StructField('friends', IntegerType(), True),
                     StructField('group_name', StringType(), True),
                     StructField('location', StringType(), True),
                     StructField('retweet_count', IntegerType(), True),
                     StructField('screenname', StringType(), True),
                     StructField('search_query', StringType(), True),
                     StructField('text', StringType(), True),
                     StructField('twitter_id', StringType(), True),
                     StructField('username', StringType(), True),
                     StructField('polarity', StringType(), True),
                     StructField('partition_0', StringType(), True),
                     StructField('partition_1', StringType(), True)])

df = spark.read.csv(test_file_path, header=True, schema=schema)

# convert polarity to float
# The cast yields NULL for strings that are not valid numbers, so filtering on
# isNotNull() drops those rows before the column is replaced by its float form.
polarity_as_float = col("polarity").cast(FloatType())
df = (
    df.filter(polarity_as_float.isNotNull())
      .withColumn("polarity", polarity_as_float)
)

# remove polarities above 1 and below -1
# The bounds are inclusive: values of exactly -1 or 1 are neither above 1 nor
# below -1, so they are kept. (The previous strict comparison discarded them.)
df = df.filter("polarity <= 1 and polarity >= -1")

display(df)


created_at,file_name,followers,friends,group_name,location,retweet_count,screenname,search_query,text,twitter_id,username,polarity,partition_0,partition_1
2020-07-12T03:45:47.000+0000,Japan,417,508,Japan,,1,shoma8400,#Japan,RT @Streetcar_honda: Cr. Owner : @L2PJapanTAKERU #Honda #Civic #FD2 #MugenRR #l2pjapan #Japan https://t.co/1WNFsNvNPf,1282159193835794432,蔵/(shoma),0.0,Politics,Japan
2020-07-12T03:44:41.000+0000,Japan,10615,9793,Japan,"Florida, USA",167,vicky_whedbee,#Japan,RT @KennethWHarmon: Available to pre-order on Amazon. #HistoricalFiction #Japan #WorldWar2 #MagicalRealism #Romance https://t.co/yyU9ECG…,1282158916785315840,Vicky Whedbee,0.1779,Politics,Japan
2020-07-12T03:44:13.000+0000,Japan,27979,24395,Japan,,167,AaronGritsch,#Japan,RT @KennethWHarmon: Available to pre-order on Amazon. #HistoricalFiction #Japan #WorldWar2 #MagicalRealism #Romance https://t.co/yyU9ECG…,1282158797507702784,Aaron Gritsch,0.1779,Politics,Japan
2020-07-12T03:43:32.000+0000,Japan,1582,4136,Japan,,0,realkquisstuff,#Japan,#since2008 #tobebeautifu #tatioactivedx #tatio #tatio #shape #shapeslimming #softgel #sofrgelcapsules #fda… https://t.co/iZz0ERHA7l,1282158627541798912,Wildglow By: Tatio Active Dx,0.0,Politics,Japan
2020-07-12T03:43:16.000+0000,Japan,225,553,Japan,for now here,0,Ojarumalu_Princ,#Japan,#HIROSHIMA : THE NEXT DAY https://t.co/1dsmMEILXm #atomic #bomb #nuclear #japan #history,1282158557396262912,Ojalu_Tak,0.0,Politics,Japan
2020-07-12T03:42:12.000+0000,Japan,193,602,Japan,,1,SUCREMOMES,#Japan,RT @Nuke_Info: Regulator demands #TEPCO clarify responsibilities | NHK WORLD-#JAPAN News https://t.co/PQTg4SbQ8k,1282158291955613703,Luna2592012,0.0,Politics,Japan
2020-07-12T03:41:55.000+0000,Japan,475,2313,Japan,Hell,2,aecyberdubai,#Japan,RT @AlArabiya_Eng: Watch: The former #Nissan boss Carlos #Ghosn shares new details on his daring escape from #Japan while under close surve…,1282158221122101250,APT,0.6597,Politics,Japan
2020-07-12T03:41:49.000+0000,Japan,914,2,Japan,,6,nlognbot,#Japan,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",1282158193808990209,#100DaysOfCode,0.0,Politics,Japan
2020-07-12T03:41:47.000+0000,Japan,531,1,Japan,,6,theInfernobot,#Japan,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",1282158186800263169,Inferno,0.0,Politics,Japan
2020-07-12T03:41:40.000+0000,Japan,2497,67,Japan,Earth,6,xaelbot,#Japan,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",1282158155905077250,xael bot,0.0,Politics,Japan


In [0]:
# Keep only the rows whose search_query matches the pattern ^#.*$
# (a leading '#' followed by the rest of the line).
df = df.where(df.search_query.rlike("^#.*$"))

display(df)

created_at,file_name,followers,friends,group_name,location,retweet_count,screenname,search_query,text,twitter_id,username,polarity,partition_0,partition_1
2020-07-12T03:45:47.000+0000,Japan,417,508,Japan,,1,shoma8400,#Japan,RT @Streetcar_honda: Cr. Owner : @L2PJapanTAKERU #Honda #Civic #FD2 #MugenRR #l2pjapan #Japan https://t.co/1WNFsNvNPf,1282159193835794432,蔵/(shoma),0.0,Politics,Japan
2020-07-12T03:44:41.000+0000,Japan,10615,9793,Japan,"Florida, USA",167,vicky_whedbee,#Japan,RT @KennethWHarmon: Available to pre-order on Amazon. #HistoricalFiction #Japan #WorldWar2 #MagicalRealism #Romance https://t.co/yyU9ECG…,1282158916785315840,Vicky Whedbee,0.1779,Politics,Japan
2020-07-12T03:44:13.000+0000,Japan,27979,24395,Japan,,167,AaronGritsch,#Japan,RT @KennethWHarmon: Available to pre-order on Amazon. #HistoricalFiction #Japan #WorldWar2 #MagicalRealism #Romance https://t.co/yyU9ECG…,1282158797507702784,Aaron Gritsch,0.1779,Politics,Japan
2020-07-12T03:43:32.000+0000,Japan,1582,4136,Japan,,0,realkquisstuff,#Japan,#since2008 #tobebeautifu #tatioactivedx #tatio #tatio #shape #shapeslimming #softgel #sofrgelcapsules #fda… https://t.co/iZz0ERHA7l,1282158627541798912,Wildglow By: Tatio Active Dx,0.0,Politics,Japan
2020-07-12T03:43:16.000+0000,Japan,225,553,Japan,for now here,0,Ojarumalu_Princ,#Japan,#HIROSHIMA : THE NEXT DAY https://t.co/1dsmMEILXm #atomic #bomb #nuclear #japan #history,1282158557396262912,Ojalu_Tak,0.0,Politics,Japan
2020-07-12T03:42:12.000+0000,Japan,193,602,Japan,,1,SUCREMOMES,#Japan,RT @Nuke_Info: Regulator demands #TEPCO clarify responsibilities | NHK WORLD-#JAPAN News https://t.co/PQTg4SbQ8k,1282158291955613703,Luna2592012,0.0,Politics,Japan
2020-07-12T03:41:55.000+0000,Japan,475,2313,Japan,Hell,2,aecyberdubai,#Japan,RT @AlArabiya_Eng: Watch: The former #Nissan boss Carlos #Ghosn shares new details on his daring escape from #Japan while under close surve…,1282158221122101250,APT,0.6597,Politics,Japan
2020-07-12T03:41:49.000+0000,Japan,914,2,Japan,,6,nlognbot,#Japan,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",1282158193808990209,#100DaysOfCode,0.0,Politics,Japan
2020-07-12T03:41:47.000+0000,Japan,531,1,Japan,,6,theInfernobot,#Japan,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",1282158186800263169,Inferno,0.0,Politics,Japan
2020-07-12T03:41:40.000+0000,Japan,2497,67,Japan,Earth,6,xaelbot,#Japan,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",1282158155905077250,xael bot,0.0,Politics,Japan


In [0]:
# Show each search_query value once.
unique_search_queries = df.select("search_query").dropDuplicates()
display(unique_search_queries)

search_query
#Uruguay
#Anguilla
#Taiwan
#Djibouti
#Malta
#Ukraine
#Spain
#NewZealand OR #NZ
#Jordan
#Japan


In [0]:
# Number of rows for every search_query value.
rows_per_query = df.groupby("search_query").count()
display(rows_per_query)

search_query,count
#Uruguay,7382
#Anguilla,201
#Taiwan,20779
#Djibouti,949
#Malta,1452
#Ukraine,18157
#Spain,20179
#NewZealand OR #NZ,18615
#Jordan,2220
#Japan,22829


In [0]:
# Total polarity for every search_query value; the dict form of agg() names
# the result column "sum(polarity)".
polarity_total_per_query = df.groupBy("search_query").agg({"polarity": "sum"})
display(polarity_total_per_query)

search_query,sum(polarity)
#Uruguay,1807.609319768846
#Anguilla,55.45279974490404
#Taiwan,3308.7053268870804
#Djibouti,74.48410012573004
#Malta,34.98589913453907
#Ukraine,-1597.318795230938
#Spain,1062.6908320002258
#NewZealand OR #NZ,2528.2244996877853
#Jordan,548.229112803936
#Japan,2827.465595792164
