In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,udf,lit,current_timestamp, explode
from pyspark.sql.types import ArrayType,StructType,StructField,StringType,IntegerType,LongType,DoubleType,TimestampType,DecimalType,FloatType
import requests, json
import datetime as datetime
from pyspark.sql import Row
import pytz as pytz

In [8]:
def executeRestApi(verb, url, headers, body, timeout=15):
  """Call a REST endpoint and return its JSON-decoded response body.

  Args:
    verb: HTTP method name. "get" in any letter case issues a GET; every
      other value issues a POST.
    url: Target URL of the request.
    headers: Mapping of HTTP headers sent with the request.
    body: Request payload, handed to requests as ``data``.
    timeout: Seconds to wait for the server before giving up. Applied to
      GET and POST alike so that neither can block forever.

  Returns:
    The decoded JSON document when the server answers with HTTP 200,
    otherwise None.

  Raises:
    SystemExit: If requests reports a RequestException (connection
      failure, timeout, invalid URL, ...).
    ValueError: If a 200 response body is not valid JSON.
  """
  # Normalise the verb so "GET"/"Get" are not silently sent as POST.
  is_get = isinstance(verb, str) and verb.lower() == "get"
  send = requests.get if is_get else requests.post

  try:
    res = send(url, data=body, headers=headers, timeout=timeout)
  except requests.exceptions.RequestException as e:
    # Keep the original failure contract (SystemExit) but preserve the cause.
    raise SystemExit(e) from e

  if res is not None and res.status_code == 200:
    return json.loads(res.text)
  return None

In [9]:
# Quick reference of Spark SQL data type names (handy when editing the schema):
#   StringType            ShortType
#   ArrayType             IntegerType
#   MapType               LongType
#   StructType            FloatType
#   DateType              DoubleType
#   TimestampType         DecimalType
#   BooleanType           ByteType
#   CalendarIntervalType  HiveStringType
#   BinaryType            ObjectType
#   NumericType           NullType

# Return type of the REST UDF: a struct with "start" and "end" strings plus a
# "list" array whose elements are structs of per-title figures.
# All fields are nullable (the StructType.add default).
schema = (
    StructType()
    .add("start", StringType())
    .add("end", StringType())
    .add(
        "list",
        ArrayType(
            StructType()
            .add("country", StringType())
            .add("name", StringType())
            .add("releaseDate", StringType())
            .add("issue", StringType())
            .add("produce", StringType())
            .add("theaterCount", IntegerType())
            .add("tickets", LongType())
            .add("ticketChangeRate", FloatType())
            .add("amounts", LongType())
            .add("totalTickets", LongType())
            .add("totalAmounts", LongType())
        ),
    )
)

# Wrap the plain Python function so it can be applied to DataFrame columns.
udf_executeRestApi = udf(executeRestApi, schema)

# Time zone used for the wall-clock timestamps printed by later cells.
tw = pytz.timezone('Asia/Taipei')

# Request headers and an empty JSON object as the request body.
headers = {'content-type': "application/json"}
body = json.dumps({})

In [10]:
# Create (or reuse) a local Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("Movie REST test")
    .master("local[1]")
    .config("spark.driver.memory", "8G")
    .getOrCreate()
)

# Keep a handle on the SparkContext and set its log level.
sc = spark.sparkContext
sc.setLogLevel('INFO')

# Session-level SQL settings: the debug string field limit and the session time zone.
spark.conf.set("spark.sql.debug.maxToStringFields", 100000)
spark.conf.set("spark.sql.session.timeZone", "UTC+8")

In [11]:
print("start time: ", datetime.datetime.now(tw))

# Row factory describing one REST call; its field names mirror the
# positional parameters of executeRestApi.
RestApiRequest = Row("verb", "url", "headers", "body")

# A one-row DataFrame holding the request, plus an "execute" column that
# carries the UDF result for that row.
request_df = (
    spark.createDataFrame([
        RestApiRequest(
            "get",
            "https://boxoffice.tfi.org.tw/api/export?start=2022/10/21&end=2022/10/28",
            headers,
            body,
        ),
    ])
    .withColumn(
        "execute",
        udf_executeRestApi(
            *[col(field) for field in ("verb", "url", "headers", "body")]
        ),
    )
)

start time:  2022-10-30 23:11:20.211307+08:00


In [12]:
# Print the DataFrame's repr: its column names and types, including the
# nested struct type of the "execute" column produced by the UDF.
print(request_df)

DataFrame[verb: string, url: string, headers: map<string,string>, body: string, execute: struct<start:string,end:string,list:array<struct<country:string,name:string,releaseDate:string,issue:string,produce:string,theaterCount:int,tickets:bigint,ticketChangeRate:float,amounts:bigint,totalTickets:bigint,totalAmounts:bigint>>>]


In [16]:
print("start time: ", datetime.datetime.now(tw))

# Turn each element of the "execute.list" array into its own row, then keep
# only the country and title of every entry and display the first rows.
(
    request_df
    .select(explode(col("execute.list")).alias("list"))
    .select("list.country", "list.name")
    .show()
)

print("end   time: ", datetime.datetime.now(tw))

start time:  2022-10-30 23:13:00.116309+08:00
+--------+-----------------------+
| country|                   name|
+--------+-----------------------+
|    法國|               全面開戰|
|中華民國|                 阮玲玉|
|    美國|             齊瓦哥醫生|
|    美國|             達賴的一生|
|    法國|         仰望星空的少年|
|中華民國|                 願未央|
|中華民國|                 兜兜風|
|    日本|       神隱少女(日文版)|
|  加拿大|               沉默呼聲|
|    美國|     小小兵2:格魯的崛起|
|    香港|               神探大戰|
|    美國|               回歸野性|
|    美國|             最後的冰川|
|    美國|       捍衛戰士: 獨行俠|
|中華民國|               孫行者傳|
|    印尼|         娜娜：逝水年華|
|    日本|名偵探柯南 萬聖節的新娘|
|    美國|       巴斯光年(英文版)|
|    日本|     只是現在不走運而已|
|    印度|         偉大的印度廚房|
+--------+-----------------------+
only showing top 20 rows

end   time:  2022-10-30 23:13:06.374921+08:00


In [None]:
# Shut down the Spark session created earlier in this notebook.
spark.stop()