In [1]:
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from awsglue.transforms import Map, DropFields, ApplyMapping
from boto3.dynamodb.types import TypeSerializer, TypeDeserializer
import sqlite3 

# Reuse the already-running SparkContext when this cell is executed again:
# a second plain SparkContext() call in the same kernel raises
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
# Glue wrapper around the Spark context; used below to read DynamicFrames.
context = GlueContext(sc)

In [2]:
# S3 prefix that holds the JSON files to load.
path = "s3://network.cubo.datalake/cubonetwork/raw/prod/2023-03-16-14-30-07/insights-events/data/"

# Read every JSON file under `path` into a Glue DynamicFrame.
# The jsonPath option makes the content of each top-level "Item" object the record.
dataframe = context.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options={"paths": [path], "recurse": True},
    format="json",
    format_options={"jsonPath": "$.Item"},
)

In [3]:
# Inspect the schema inferred for the raw records: every attribute is still
# wrapped in a DynamoDB type tag (S, N, BOOL, M).
dataframe.printSchema()

root
|-- id: struct
|    |-- S: string
|-- action: struct
|    |-- S: string
|-- data: struct
|    |-- M: struct
|    |    |-- terms: struct
|    |    |    |-- S: string
|    |    |-- countOnConnectionsRanking: struct
|    |    |    |-- BOOL: boolean
|    |    |-- sector: struct
|    |    |    |-- S: string
|    |    |-- startup: struct
|    |    |    |-- S: string
|    |    |-- categoryData: struct
|    |    |    |-- S: string
|    |    |-- companyId: struct
|    |    |    |-- S: string
|    |    |-- challengeId: struct
|    |    |    |-- S: string
|    |    |-- corporate: struct
|    |    |    |-- S: string
|    |    |-- challengeTitle: struct
|    |    |    |-- S: string
|    |    |-- startupId: struct
|    |    |    |-- S: string
|    |    |-- startupImageUrl: struct
|    |    |    |-- S: string
|    |    |-- challengeCorporateId: struct
|    |    |    |-- S: string
|    |    |-- challengeData: struct
|    |    |    |-- N: string
|    |    |-- startupName: struct
|    |    |    |--

In [4]:
from pyspark.sql.functions import sum, col, desc
# Single shared deserializer for DynamoDB-typed attribute values.
td = TypeDeserializer()


def recDDB(item):
    """Return a new dict with each attribute value of ``item`` run through ``td.deserialize``.

    Args:
        item: mapping of attribute name -> DynamoDB-typed value
            (e.g. ``{"S": "abc"}``), as produced by the raw frame.

    Returns:
        A fresh dict with the same keys and deserialized values.
    """
    plain_record = {}
    for attr_name, typed_value in item.items():
        plain_record[attr_name] = td.deserialize(typed_value)
    return plain_record


# Apply the deserialization to every record of the raw frame.
newDf2 = dataframe.map(f=recDDB)

In [5]:
# Schema after recDDB: the DynamoDB type tags are gone and N values show up as decimal.
newDf2.printSchema()

root
|-- action: string
|-- data: struct
|    |-- countOnConnectionsRanking: boolean
|    |-- terms: string
|    |-- sector: string
|    |-- startup: string
|    |-- categoryData: string
|    |-- companyId: string
|    |-- challengeId: string
|    |-- challengeTitle: string
|    |-- corporate: string
|    |-- startupId: string
|    |-- startupImageUrl: string
|    |-- startupName: string
|    |-- startupSlug: string
|    |-- challengeData: decimal
|    |-- challengeCorporateId: string
|    |-- subCategoryData: string
|    |-- pipelineId: string
|    |-- partner: string
|    |-- library: string
|    |-- profileUid: string
|    |-- origin: string
|    |-- job: string
|    |-- case: string
|    |-- investor: string
|    |-- founderId: string
|    |-- corporateId: string
|-- user: string
|-- category: string
|-- label: string
|-- createdAt: decimal
|-- id: string



In [None]:
# NOTE(review): `newDf3` is not defined anywhere in the visible notebook and this
# cell has no execution count, so "Restart & Run All" would stop here with a
# NameError. Remove the cell or point it at the intended frame — TODO confirm.
newDf3.printSchema()

In [6]:
def recInsights(item):
    """Normalize the company/profile references of one insights record in place.

    Inside ``item["data"]`` the keys ``startup``, ``startupId``, ``corporateId``,
    ``corporate`` and ``partner`` are folded into a single ``companyId`` key and
    removed. They are processed in that order, so when several are present the
    last one wins, and any of them overrides a pre-existing ``companyId``.
    ``data["profileUid"]`` is moved to the top level of the record, and the
    resulting ``data["companyId"]`` (if any) is copied to ``item["companyIdTo"]``.

    Args:
        item: record dict; mutated in place.

    Returns:
        The same ``item`` object. Processing is best effort: if the record does
        not have the expected shape (e.g. ``data`` is not a dict) it is returned
        as it is at that point instead of failing the whole map.
    """
    # Order matters: a later alias overwrites the companyId set by an earlier one.
    company_aliases = ("startup", "startupId", "corporateId", "corporate", "partner")

    try:
        if "data" in item:
            data = item["data"]

            # When both keys are present neither is used as companyId; both are dropped.
            # NOTE(review): presumably because the target company is ambiguous — confirm.
            if "startup" in data and "corporate" in data:
                del data["startup"]
                del data["corporate"]

            for alias in company_aliases:
                if alias in data:
                    data["companyId"] = data.pop(alias)

            if "profileUid" in data:
                item["profileUid"] = data.pop("profileUid")

            # companyId stays inside data as well; only a copy is promoted.
            if "companyId" in data:
                item["companyIdTo"] = data["companyId"]
    except Exception:
        # Keep the best-effort behaviour for malformed records, but no longer
        # swallow KeyboardInterrupt / SystemExit as the former bare `except:` did.
        pass

    return item

# Run recInsights over every record of the deserialized frame.
newDf4 = newDf2.map(f=recInsights)  

In [7]:
# Schema after recInsights: the company alias keys are gone from data, and
# companyIdTo / profileUid appear at the top level.
newDf4.printSchema()

root
|-- action: string
|-- companyIdTo: string
|-- data: struct
|    |-- companyId: string
|    |-- countOnConnectionsRanking: boolean
|    |-- terms: string
|    |-- sector: string
|    |-- categoryData: string
|    |-- startupImageUrl: string
|    |-- challengeTitle: string
|    |-- startupName: string
|    |-- startupSlug: string
|    |-- challengeData: decimal
|    |-- challengeCorporateId: string
|    |-- challengeId: string
|    |-- pipelineId: string
|    |-- origin: string
|    |-- job: string
|    |-- subCategoryData: string
|    |-- investor: string
|    |-- library: string
|    |-- founderId: string
|    |-- case: string
|-- user: string
|-- category: string
|-- label: string
|-- createdAt: decimal
|-- id: string
|-- profileUid: string



In [9]:
# Rebuild from the raw frame: deserialize, force createdAt to string and
# expose "label" under the name "type".
# NOTE(review): this starts again from `dataframe`, so rebinding `newDf4` here
# replaces the recInsights result stored under the same name — confirm intended.
newDf4 = (
    dataframe.map(f=recDDB)
    .resolveChoice([("createdAt", "cast:string")])
    .rename_field("label", "type")
)

In [10]:
# Schema after the rebuild: createdAt is a string and label is now called type.
# The company alias keys are present under data again.
newDf4.printSchema()

root
|-- action: string
|-- type: string
|-- data: struct
|    |-- startup: string
|    |-- countOnConnectionsRanking: boolean
|    |-- terms: string
|    |-- startupId: string
|    |-- companyId: string
|    |-- sector: string
|    |-- corporate: string
|    |-- categoryData: string
|    |-- startupImageUrl: string
|    |-- challengeTitle: string
|    |-- startupName: string
|    |-- startupSlug: string
|    |-- challengeData: decimal
|    |-- challengeCorporateId: string
|    |-- challengeId: string
|    |-- pipelineId: string
|    |-- origin: string
|    |-- profileUid: string
|    |-- job: string
|    |-- partner: string
|    |-- subCategoryData: string
|    |-- investor: string
|    |-- library: string
|    |-- founderId: string
|    |-- case: string
|    |-- corporateId: string
|-- category: string
|-- user: string
|-- createdAt: string
|-- id: string

