In [297]:
'''

@Author: Vighnesh Harish Bilgi
@Date: 2022-11-22
@Last Modified by: Vighnesh Harish Bilgi
@Last Modified time: 2022-11-22
@Title : Fetch and Process Tweet items from DynamoDB table using PySpark

'''

'\n\n@Author: Vighnesh Harish Bilgi\n@Date: 2022-11-22\n@Last Modified by: Vighnesh Harish Bilgi\n@Last Modified time: 2022-11-22\n@Title : Fetch and Process Tweet items from DynamoDB table using PySpark\n\n'

In [298]:
import findspark
findspark.init() 

In [299]:
import json
import boto3
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [300]:
import os

# --- AWS / pipeline configuration -------------------------------------------
# boto3 reads the standard AWS_* variables, so the project-specific credential
# variables are copied onto the names boto3 expects.
os.environ['AWS_DEFAULT_REGION'] = 'us-east-1'

for _target_var, _source_var in (
    ('AWS_ACCESS_KEY_ID', 'test1_access_key'),
    ('AWS_SECRET_ACCESS_KEY', 'test1_secret_access_key'),
):
    _value = os.environ.get(_source_var)
    if _value is None:
        # Assigning None into os.environ fails with an unhelpful
        # "TypeError: str expected, not NoneType"; fail early with a message
        # that names the variable that has to be set.
        raise EnvironmentError(
            "Environment variable '" + _source_var + "' is not set; "
            "it is required to populate " + _target_var
        )
    os.environ[_target_var] = _value

# Name of the Kinesis data stream the processed tweets are written to.
KINESIS_DATA_STREAM = 'kds-twitter-sda'
# Name of the DynamoDB table the tweets are read from.
TABLE_NAME = 'WorldCup2022Tweets'

In [301]:
# Start (or reuse) a local Spark session for this notebook and echo its name.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Reading DynamoDB items into RDD ")
    .getOrCreate()
)
print(spark.sparkContext.appName)

Reading DynamoDB items into RDD 


### 1. Reading items from DynamoDB Table into a list of dictionaries

In [302]:
dyDB = boto3.resource('dynamodb')

table = dyDB.Table(TABLE_NAME)

# A single Scan call returns only one page of results (DynamoDB caps a page at
# 1 MB). Keep scanning from LastEvaluatedKey until the service stops returning
# one, otherwise larger tables are silently truncated.
list_of_dict = []
scan_kwargs = {}
while True:
    table_details = table.scan(**scan_kwargs)
    # every item is already a plain dict, so the page can be appended as-is
    list_of_dict.extend(table_details.get('Items', []))

    last_evaluated_key = table_details.get('LastEvaluatedKey')
    if last_evaluated_key is None:
        break
    scan_kwargs['ExclusiveStartKey'] = last_evaluated_key

# Kept for backward compatibility with cells that refer to `table_items`;
# it now holds the items of every page, not just the first one.
table_items = list_of_dict

In [303]:
# Build a Spark DataFrame from the scanned items; no schema is passed, so the
# column types are inferred by Spark from the dictionaries.
spark_df = spark.createDataFrame(list_of_dict)
# Preview the first 10 rows (long values are truncated in the display).
spark_df.show(10)



+--------------------+--------------------+-------------------+--------------------+-------------------+---------------+--------------------+-----------------------+
|    ArrivalTimestamp|        display_name|           tweet_id|          tweet_text|            user_id|      user_name|  word_count_on_text|word_occurences_on_text|
+--------------------+--------------------+-------------------+--------------------+-------------------+---------------+--------------------+-----------------------+
|2022-11-22 12:51:...|                Rags|1594954296491409409|@INSIGHTUK2 Even ...|1456973849082224646|      rhalharvi|19.00000000000000...|   19.00000000000000...|
|2022-11-22 12:54:...|         Andy Newton|1594954908859596800|World Cup Betting...|           68403698| NewtsDailyLays|16.00000000000000...|   16.00000000000000...|
|2022-11-22 12:50:...|        MODI FOREVER|1594953972326227968|RT @gnuseibeh: Qa...|          163162895| royprateek2010|24.00000000000000...|   24.00000000000000...|
|202

In [304]:
# Per-column summary statistics (count, mean, stddev, ...) of the raw items.
spark_df.describe().show()

+-------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-----------------------+
|summary|    ArrivalTimestamp|        display_name|            tweet_id|          tweet_text|             user_id|    user_name|  word_count_on_text|word_occurences_on_text|
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-----------------------+
|  count|                1004|                1004|                1004|                1004|                1004|         1004|                1004|                   1004|
|   mean|                null|                null|1.594953288752600...|                null|1.082297965066966...|         null|21.49302788844621...|   21.49302788844621...|
| stddev|                null|                null|1.034527046427790...|                null|5.975100107108737...|         null|  

In [305]:
# Same kind of per-column overview, produced with DataFrame.summary().
spark_df.summary().show()

+-------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-----------------------+
|summary|    ArrivalTimestamp|        display_name|            tweet_id|          tweet_text|             user_id|    user_name|  word_count_on_text|word_occurences_on_text|
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-----------------------+
|  count|                1004|                1004|                1004|                1004|                1004|         1004|                1004|                   1004|
|   mean|                null|                null|1.594953288752600...|                null|1.082297965066966...|         null|21.49302788844621...|   21.49302788844621...|
| stddev|                null|                null|1.034527046427790...|                null|5.975100107108737...|         null|  

### 2. Data Processing

#### 2a. Dropping last column 'word_occurences_on_text'

In [306]:
# Drop the last column, 'word_occurences_on_text'. In the rows previewed
# earlier it appears to carry the same values as 'word_count_on_text'
# (NOTE(review): confirm it is redundant for the full table).
spark_df = spark_df.drop("word_occurences_on_text")
# Confirm the column is gone.
spark_df.printSchema()

root
 |-- ArrivalTimestamp: string (nullable = true)
 |-- display_name: string (nullable = true)
 |-- tweet_id: string (nullable = true)
 |-- tweet_text: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_name: string (nullable = true)
 |-- word_count_on_text: decimal(38,18) (nullable = true)



#### 2b. Changing data type of column 'word_count_on_text' from decimal to int

In [307]:
# Replace the decimal 'word_count_on_text' column with its integer cast,
# then print the schema to verify the new type.
spark_df = spark_df.withColumn(
    "word_count_on_text",
    col("word_count_on_text").cast('int'),
)
spark_df.printSchema()

root
 |-- ArrivalTimestamp: string (nullable = true)
 |-- display_name: string (nullable = true)
 |-- tweet_id: string (nullable = true)
 |-- tweet_text: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_name: string (nullable = true)
 |-- word_count_on_text: integer (nullable = true)



#### 2c.  Removing the milliseconds part of values under column 'ArrivalTimestamp'

In [308]:
# Keep only the first 19 characters ("YYYY-MM-DD HH:MM:SS") of ArrivalTimestamp
# in a new 'Timestamp' column; everything after the seconds is cut off.
spark_df = spark_df.withColumn(
    'Timestamp',
    col('ArrivalTimestamp').substr(1, 19),
)
spark_df.printSchema()
# Show original and shortened values side by side, without truncating cells.
spark_df.select('ArrivalTimestamp', 'Timestamp').show(n=10, truncate=False)

root
 |-- ArrivalTimestamp: string (nullable = true)
 |-- display_name: string (nullable = true)
 |-- tweet_id: string (nullable = true)
 |-- tweet_text: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_name: string (nullable = true)
 |-- word_count_on_text: integer (nullable = true)
 |-- Timestamp: string (nullable = true)

+--------------------------------+-------------------+
|ArrivalTimestamp                |Timestamp          |
+--------------------------------+-------------------+
|2022-11-22 12:51:58.145000+05:30|2022-11-22 12:51:58|
|2022-11-22 12:54:24.462000+05:30|2022-11-22 12:54:24|
|2022-11-22 12:50:40.345000+05:30|2022-11-22 12:50:40|
|2022-11-22 12:46:47.176000+05:30|2022-11-22 12:46:47|
|2022-11-22 12:41:56.082000+05:30|2022-11-22 12:41:56|
|2022-11-22 12:52:24.573000+05:30|2022-11-22 12:52:24|
|2022-11-22 12:48:40.395000+05:30|2022-11-22 12:48:40|
|2022-11-22 12:50:59.093000+05:30|2022-11-22 12:50:59|
|2022-11-22 12:44:58.143000+05:30|2022-11-2

#### 2d. Dropping ArrivalTimestamp column, 
#### reordering columns and 
#### then order DF by Timestamp and then tweet_id

In [309]:
# Swap the original ArrivalTimestamp for the shortened one, put the columns in
# their final order, and sort by timestamp with tweet_id as tie-breaker.
spark_df = (
    spark_df
    .drop("ArrivalTimestamp")
    .withColumnRenamed("Timestamp", "ArrivalTimestamp")
    .select(
        "tweet_id",
        "ArrivalTimestamp",
        "display_name",
        "user_name",
        "user_id",
        "tweet_text",
        "word_count_on_text",
    )
    .orderBy("ArrivalTimestamp", "tweet_id")
)
spark_df.printSchema()
spark_df.show()

root
 |-- tweet_id: string (nullable = true)
 |-- ArrivalTimestamp: string (nullable = true)
 |-- display_name: string (nullable = true)
 |-- user_name: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- tweet_text: string (nullable = true)
 |-- word_count_on_text: integer (nullable = true)

+-------------------+-------------------+----------------------------------+---------------+-------------------+--------------------+------------------+
|           tweet_id|   ArrivalTimestamp|                      display_name|      user_name|            user_id|          tweet_text|word_count_on_text|
+-------------------+-------------------+----------------------------------+---------------+-------------------+--------------------+------------------+
|1594951526619176961|2022-11-22 12:40:57|                           resa_ds|      leyla_677| 965136351455715328|RT @AmnestyIran: ...|                23|
|1594951523317981184|2022-11-22 12:40:58|                              O.G.| 

#### 2e. Extract all hashtags from a tweet, 
#### count all hashtags from a tweet and
#### Derive sentiment from a tweet

In [310]:
# Custom Function To return all the hashtags from a tweet text in a list

def extract_hashtags(text):
    """
    Description:
        Collect every whitespace-separated token of a tweet that starts
        with '#', echoing the tweet and each hashtag found to stdout.
    Parameter:
        text (str): the tweet text to scan.
    Return:
        list of str: the hashtags (leading '#' included) in the order
        they appear in the text; empty when the text has none.
    """
    # A token qualifies purely on its first character; any trailing
    # punctuation stays part of the hashtag.
    hashtag_records = [token for token in text.split() if token.startswith('#')]

    # Echo the tweet followed by one hashtag per line
    print("The hashtags in \"" + text + "\" are :")
    for tag in hashtag_records:
        print(tag)

    return hashtag_records


In [311]:
# Using comprehend AWS Service to derive sentiment from a tweet text.
comprehend = boto3.client('comprehend')

# creating a new dictionary of 3 keys 'hashtags_from_text', 'hashtags_count' and 'sentiment'
hash_list = []
hash_list_wc = []
sentiment = []
for i in spark_df.select("tweet_text").collect():
    # print(type(list(i)))
    hashtag_records = extract_hashtags(' '.join(list(i)))
    hashtag_string = ' '.join(hashtag_records)
    hash_list.append(hashtag_string)
    hash_list_wc.append(len(hashtag_string.strip().split()))
    sentiment.append(comprehend.detect_sentiment(Text=' '.join(list(i)),LanguageCode='en')['Sentiment'])

new_col = {}
new_col['hashtags_from_text'] = hash_list
new_col['hashtags_count'] = hash_list_wc
new_col['sentiment'] = sentiment

# build pandas DataFrame from spark_df
pandas_df = spark_df.toPandas() 
# build a new pandas df from new_col dict

new_col_pandas_df = pd.DataFrame(new_col)

# Join the new and old pandas df. 
new_and_old_pandas_df = new_col_pandas_df.join(pandas_df)

# convert the joined pandas DataFrame to spark DataFrame
spark_df = spark.createDataFrame(new_and_old_pandas_df)

# Order the columns of the DF accordingly
spark_df = spark_df.select("tweet_id","ArrivalTimestamp","display_name","user_name","user_id","tweet_text","word_count_on_text","hashtags_from_text","hashtags_count","sentiment")

# print schema of updated spark_df
spark_df.printSchema()

The hashtags in "RT @AmnestyIran: As the world's attention is turned to the #ENGIRN game, let's remember the faces of children killed by Iran's security for…" are :
#ENGIRN
The hashtags in "I’m sorry, but #Latto’s version of #BlickBlick sucks! 😩🤦🏽‍♂️ Although, her nor #CoiLeray wrote the #rap. 😒 #Rappers #FemaleRappers #HipHop #HipHopEd #HipHopNews #Queen #QueenRadio #WorldCup2022 #FIFAWorldCup2022 #NickiMinaj #HipHopBeats #DWTSFinale #SENNED #MillieMilkers #USAWAL https://t.co/aGM2TnlwIi" are :
#Latto’s
#BlickBlick
#CoiLeray
#rap.
#Rappers
#FemaleRappers
#HipHop
#HipHopEd
#HipHopNews
#Queen
#QueenRadio
#WorldCup2022
#FIFAWorldCup2022
#NickiMinaj
#HipHopBeats
#DWTSFinale
#SENNED
#MillieMilkers
#USAWAL
The hashtags in "Isalamic Republic and IRGC have sent armed troops to #Kurdistan, #Mahabad, #JavanRood, #Kermanshah, #Piranshahr and are using Heavy weapons against unarmed civilians to silence their voice. Numbers have been killed and injured.
#WorldCup
#WorldCup2022 
#FIFAWorldCup2022 h

  for column, series in pdf.iteritems():


root
 |-- tweet_id: string (nullable = true)
 |-- ArrivalTimestamp: string (nullable = true)
 |-- display_name: string (nullable = true)
 |-- user_name: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- tweet_text: string (nullable = true)
 |-- word_count_on_text: long (nullable = true)
 |-- hashtags_from_text: string (nullable = true)
 |-- hashtags_count: long (nullable = true)
 |-- sentiment: string (nullable = true)



### 3. To send Spark Dataframe to KDA for analysis

#### 3a. Convert Spark Dataframe into a list of dictionaries

In [314]:
# Pull every row to the driver and turn each Row into a plain dictionary
# keyed by column name.
list_of_dict = [row.asDict() for row in spark_df.collect()]
count = 0

#### 3b. Connect and send the list of dictionaries to the Kinesis Data Stream

In [328]:
kinesis_client = boto3.client('kinesis')
counter = 0  # number of records confirmed as sent

for r in list_of_dict:
    # Send message to Kinesis DataStream
    response = kinesis_client.put_record(
        StreamName=KINESIS_DATA_STREAM,
        Data=json.dumps(r),
        # Use the tweet id itself as the partition key. Python's built-in
        # hash() of a string is salted per interpreter process, so
        # str(hash(...)) produced a different key for the same tweet on
        # every run of the notebook.
        PartitionKey=str(r['tweet_id']),
    )

    # If the message was not successfully sent print an error message and do
    # not report it as sent.
    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
        print('Error!')
        print(response)
        continue

    counter = counter + 1
    print('Message sent #' + str(counter))

Message sent #1
Message sent #2
Message sent #3
Message sent #4
Message sent #5
Message sent #6
Message sent #7
Message sent #8
Message sent #9
Message sent #10
Message sent #11
Message sent #12
Message sent #13
Message sent #14
Message sent #15
Message sent #16
Message sent #17
Message sent #18
Message sent #19
Message sent #20
Message sent #21
Message sent #22
Message sent #23
Message sent #24
Message sent #25
Message sent #26
Message sent #27
Message sent #28
Message sent #29
Message sent #30
Message sent #31
Message sent #32
Message sent #33
Message sent #34
Message sent #35
Message sent #36
Message sent #37
Message sent #38
Message sent #39
Message sent #40
Message sent #41
Message sent #42
Message sent #43
Message sent #44
Message sent #45
Message sent #46
Message sent #47
Message sent #48
Message sent #49
Message sent #50
Message sent #51
Message sent #52
Message sent #53
Message sent #54
Message sent #55
Message sent #56
Message sent #57
Message sent #58
Message sent #59
Messag