In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,udf,lit,current_timestamp, explode
from pyspark.sql.types import ArrayType,StructType,StructField,StringType,IntegerType,LongType,DoubleType,TimestampType,DecimalType,FloatType
import requests, json
import datetime as datetime
from pyspark.sql import Row
import pytz as pytz

In [9]:
def executeRestApi(verb, url, headers, body, timeout=5):
  """Send one HTTP request and return the decoded JSON response body.

  Args:
    verb: HTTP method name. "get" (any letter case, surrounding whitespace
      ignored) issues a GET; every other value issues a POST.
    url: Target URL.
    headers: Request headers, handed to ``requests`` unchanged.
    body: Request payload, sent via ``data=`` for both GET and POST.
    timeout: Value for the ``requests`` ``timeout=`` argument, in seconds.
      Applied to both verbs so neither call can block forever. Slow
      endpoints may need a larger value than the default.

  Returns:
    The parsed JSON document when the response status is 200, else None.

  Raises:
    SystemExit: wrapping any ``requests.exceptions.RequestException``
      (connection failure, timeout, ...). Kept for backward compatibility.
    ValueError: from ``json.loads`` if a 200 response is not valid JSON.
  """
  # Normalise the verb so "GET" / " get " are not silently sent as POST.
  method = str(verb).strip().lower()
  send = requests.get if method == "get" else requests.post

  try:
    res = send(url, data=body, headers=headers, timeout=timeout)
  except requests.exceptions.RequestException as e:
    raise SystemExit(e) from e

  # Anything other than 200 is reported to the caller as "no data".
  if res.status_code != 200:
    return None
  return json.loads(res.text)

In [13]:
# StringType            ShortType
# ArrayType             IntegerType
# MapType               LongType
# StructType            FloatType
# DateType              DoubleType
# TimestampType         DecimalType
# BooleanType           ByteType
# CalendarIntervalType  HiveStringType
# BinaryType            ObjectType
# NumericType           NullType

# Return type declared for the REST UDF: a struct holding "start", "end" and
# a "list" array of record structs. Every field is nullable (the default).
schema = (
  StructType()
  .add("start", StringType(), True)
  .add("end", StringType(), True)
  .add(
    "list",
    ArrayType(
      StructType()
      .add("country", StringType())
      .add("name", StringType())
      .add("releaseDate", StringType())
      .add("issue", StringType())
      .add("produce", StringType())
      .add("theaterCount", IntegerType())
      .add("tickets", LongType())
      .add("ticketChangeRate", FloatType())
      .add("amounts", LongType())
      .add("totalTickets", LongType())
      .add("totalAmounts", LongType())
    ),
  )
)

# Wrap the plain Python function so it can be applied to DataFrame columns.
udf_executeRestApi = udf(executeRestApi, schema)

# Time zone used when printing wall-clock times.
tw = pytz.timezone('Asia/Taipei')

# Request headers and an empty JSON object as the request body.
headers = {'content-type': "application/json"}

body = json.dumps({})

In [14]:
#sc = spark.sparkContext

# Build (or reuse) a session on a two-thread local master with the driver
# memory setting at 8G.
spark = (
  SparkSession.builder
  .master("local[2]")
  .appName("Movie REST test")
  .config("spark.driver.memory", "8G")
  .getOrCreate()
)

# Keep a handle on the SparkContext and set its log level.
sc = spark.sparkContext
sc.setLogLevel('INFO')

# Session-level SQL settings: field limit for debug strings, and time zone.
spark.conf.set("spark.sql.debug.maxToStringFields", 100000)
spark.conf.set("spark.sql.session.timeZone", "UTC+8")

In [15]:
# Print the current time in the `tw` zone before building the request frame.
print("start time: ", datetime.datetime.now(tw))

# Row factory: one row describes one REST call.
RestApiRequest = Row("verb", "url", "headers", "body")

# Single-request DataFrame; the "execute" column is the UDF applied to the
# four request columns.
request_df = (
  spark.createDataFrame([
    RestApiRequest(
      "get",
      "https://boxoffice.tfi.org.tw/api/export?start=2022/10/02&end=2022/10/09",
      headers,
      body,
    ),
  ])
  .withColumn(
    "execute",
    udf_executeRestApi(*[col(c) for c in ("verb", "url", "headers", "body")]),
  )
)

start time:  2022-10-11 20:48:17.453054+08:00


In [16]:
# Printing the DataFrame object shows its column names and types only
# (see the output below), not the row data.
print(request_df)

DataFrame[verb: string, url: string, headers: map<string,string>, body: string, execute: struct<start:string,end:string,list:array<struct<country:string,name:string,releaseDate:string,issue:string,produce:string,theaterCount:int,tickets:bigint,ticketChangeRate:float,amounts:bigint,totalTickets:bigint,totalAmounts:bigint>>>]


In [19]:
# Turn each element of execute.list into its own row, then display the
# country and name fields. The REST UDF is executed by this cell (the
# traceback in the output below originates in executeRestApi).
(
  request_df
  .select(explode(col("execute.list")).alias("list"))
  .select("list.country", "list.name")
  .show()
)

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/tmp/ipykernel_1629/3105118817.py", line 11, in executeRestApi
SystemExit: HTTPSConnectionPool(host='boxoffice.tfi.org.tw', port=443): Read timed out. (read timeout=5)


In [None]:
# Shut down the Spark session created earlier in the notebook.
spark.stop()