In [1]:
#import findspark, pyspark
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *
from datetime import datetime
from functools import reduce
from pyspark.sql.functions import col

#sc = pyspark.SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

file_attack = 'gtd_14to17_0718dist'
file_airport = 'airports-extended'
file_largest_airport = 'largest-global-airports-passenger-traffic'
file_passengers = 'avia_par_be'

# Fetch Data

In [2]:
#data_path = 'D:/DataMining/taba/'
data_path = '../data/Datasets/'

attack_data = sc.textFile(data_path + 'gtd-data/' + file_attack + '.csv')
airport_data = sc.textFile(data_path + file_airport + '.csv')
largest_airport_data = sc.textFile(data_path + file_largest_airport + '.xlsx')
passenger_data = sc.textFile(data_path + file_passengers + '.xls')

In [329]:
# ALT WAY DOES NOT WORK YET QUESTION
#df2 = sqlContext.read.format('com.databricks.spark.csv').options(header='true').load(data_path + 'gtd-data/' + file_attack + '.csv')
#df2.printSchema()
#df2.select(df2.country_txt).show(1)
#df2.select(col('country_txt')).show(1)
# https://www.analyticsvidhya.com/blog/2016/10/spark-dataframe-and-operations/
# => fill null values in dataframe -> fillna()

# Prepare Attack Data

In [330]:
# Define columns for dataframe
new_columns = attack_data.first()
new_columns = new_columns.split(";")
attack_data = attack_data.filter(lambda l: l != new_columns)
df_terror_data = attack_data.map(lambda x: x.split(';')).toDF()
old_columns = df_terror_data.schema.names

In [331]:
# apply new column names
df_terror_data = reduce(lambda data, idx: data.withColumnRenamed(old_columns[idx], new_columns[idx]), range(len(old_columns)), df_terror_data)

# Query Attack Data

In [332]:
#query_result = df_terror_data.filter(col("country_txt") == "Belgium").select('iyear', 'imonth', 'iday','country_txt', 'summary')
query_result = df_terror_data.filter(df_terror_data.country_txt == "Belgium").select('iyear', 'imonth', 'iday','country_txt', 'summary')

In [333]:
query_result.show(10)

+-----+------+----+-----------+--------------------+
|iyear|imonth|iday|country_txt|             summary|
+-----+------+----+-----------+--------------------+
| 2014|     5|  24|    Belgium|05/24/2014: Assai...|
| 2014|     9|  16|    Belgium|09/16/2014: Assai...|
| 2016|     3|  22|    Belgium|03/22/2016: Two s...|
| 2016|     3|  22|    Belgium|03/22/2016: A sui...|
| 2016|     8|   6|    Belgium|08/06/2016: An as...|
| 2016|     8|  29|    Belgium|08/29/2016: Assai...|
| 2016|    10|   5|    Belgium|10/05/2016: An as...|
| 2016|    12|  23|    Belgium|12/23/2016: Secur...|
| 2017|     5|   1|    Belgium|"05/01/2017: An a...|
| 2017|     6|  20|    Belgium|"06/20/2017: A su...|
+-----+------+----+-----------+--------------------+
only showing top 10 rows



In [334]:
query_result.select('summary').show(10, False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|summary                                                                                                                                                                                                                                                                      

# Prepare airport_data

In [19]:
fields = []
fields.append(StructField('airport_id', IntegerType(), True)) 
fields.append(StructField('airport_name', StringType(), True))
fields.append(StructField('city', StringType(), True))
fields.append(StructField('country', StringType(), True))
fields.append(StructField('IATA', StringType(), True)) # kind of airport identifier
fields.append(StructField('ICAO', StringType(), True)) # different kind of airport identifier
fields.append(StructField('latitude', FloatType(), True))
fields.append(StructField('longitude', FloatType(), True))
fields.append(StructField('unknown1', IntegerType(), True))
fields.append(StructField('unknown2', StringType(), True))
fields.append(StructField('unknown3', StringType(), True))
fields.append(StructField('continent_city', StringType(), True))
fields.append(StructField('idk_column', StringType(), True))
fields.append(StructField('source', StringType(), True))
schema = StructType(fields)

clean_data = (airport_data
              .map(lambda line: line.split(';'))
              .map(lambda line: [int(line[0]),line[1],line[2],line[3],line[4],line[5],float(line[6]),float(line[7]),int(line[8]),line[9],line[10],line[11],line[12],line[13]])
             )
df_airport_data = sqlContext.createDataFrame(clean_data, schema)
#df_airport_data = sqlContext.createDataFrame(airport_data.map(lambda line: line.split(';')), schema)

[[1, 'Goroka Airport', 'Goroka', 'Papua New Guinea', 'GKA', 'AYGA', -6.08168983459, 145.391998291, 5282, '10', 'U', 'Pacific/Port_Moresby', 'airport', 'OurAirports'], [2, 'Madang Airport', 'Madang', 'Papua New Guinea', 'MAG', 'AYMD', -5.20707988739, 145.789001465, 20, '10', 'U', 'Pacific/Port_Moresby', 'airport', 'OurAirports'], [3, 'Mount Hagen Kagamuga Airport', 'Mount Hagen', 'Papua New Guinea', 'HGU', 'AYMH', -5.82678985595703, 144.296005249023, 5388, '10', 'U', 'Pacific/Port_Moresby', 'airport', 'OurAirports'], [4, 'Nadzab Airport', 'Nadzab', 'Papua New Guinea', 'LAE', 'AYNZ', -6.569803, 146.725977, 239, '10', 'U', 'Pacific/Port_Moresby', 'airport', 'OurAirports']]


# Query airport_data

In [17]:
(df_airport_data
 .select(df_airport_data.airport_name, df_airport_data.city,df_airport_data.continent_city)
 .filter(df_airport_data.continent_city == 'Europe/Brussels')
 .show(df_airport_data.count(),False))

+--------------------------------------+-----------------+---------------+
|airport_name                          |city             |continent_city |
+--------------------------------------+-----------------+---------------+
|Antwerp International Airport (Deurne)|Antwerp          |Europe/Brussels|
|Beauvechain Air Base                  |Beauvechain      |Europe/Brussels|
|Kleine Brogel Air Base                |Kleine Brogel    |Europe/Brussels|
|Brussels Airport                      |Brussels         |Europe/Brussels|
|Jehonville Air Base                   |Bertrix          |Europe/Brussels|
|Brussels South Charleroi Airport      |Charleroi        |Europe/Brussels|
|Chièvres Air Base                     |Chievres         |Europe/Brussels|
|Koksijde Air Base                     |Koksijde         |Europe/Brussels|
|Florennes Air Base                    |Florennes        |Europe/Brussels|
|Wevelgem Airport                      |Kortrijk-vevelgem|Europe/Brussels|
|Liège Airport           

# Prepare passenger data

## Cleaning

In [337]:
international_passengers_file =  'international-passengers-'
international_passengers_2013 = sc.textFile(data_path + international_passengers_file + '2013' + '.csv')
international_passengers_2014 = sc.textFile(data_path + international_passengers_file + '2014' + '.csv')
#international_passengers_2015 = sc.textFile(data_path + international_passengers_file + '2015' + '.csv')

total_passengers_file = 'total-passengers-'
total_passengers_2013 = sc.textFile(data_path + total_passengers_file + '2013' + '.csv')
total_passengers_2014 = sc.textFile(data_path + total_passengers_file + '2014' + '.csv')
#total_passengers_2015 = sc.textFile(data_path + total_passengers_file + '2015' + '.csv')

def clean_passenger_data(rdd):
    
    rdd = rdd.map(lambda l: l.split(";"))
    

    new_columns = rdd.map(lambda l: l[3].split("/"))

    rdd = rdd.map(lambda l: l[0:3] + l[4:])

    rdd = rdd.zip(new_columns)

    rdd = rdd.map(lambda l: l[0] + l[1])
    
    
    print(rdd.take(2))
    print()
    print("+++++++++++++++++++++++++++++++++")
    print()

In [344]:
all_passenger_datasets_except2015 = [international_passengers_2013,international_passengers_2014,total_passengers_2013, total_passengers_2014]

for dataset in all_passenger_datasets_except2015:
    clean_passenger_data(dataset)
print(international_passengers_2013.take(2)) # QUESTION no pointers?

[['Rank', 'Airport', 'Location', 'Total passengers', '% change', 'IATA', 'ICAO'], ['1', 'London Heathrow Airport', 'Hillingdon, Greater London, United Kingdom', '66689466', '4%', 'LHR', 'EGLL']]

+++++++++++++++++++++++++++++++++

[['Rank', 'Airport', 'Location', 'Total passengers', '% change', 'IATA', 'ICAO'], ['1', 'Dubai International Airport', 'Garhoud, Dubai, United Arab Emirates', '69954392', '6.2%', 'DXB', 'OMDB']]

+++++++++++++++++++++++++++++++++

[['Rank', 'Airport', 'Location', ' Total passengers ', '% change', 'IATA', 'ICAO'], ['1', 'Hartsfield–Jackson Atlanta International Airport', 'Atlanta, Georgia, United States', '94430785', '1%', 'ATL', 'KATL']]

+++++++++++++++++++++++++++++++++

[['Rank', 'Airport', 'Location', 'IATA/ICAO', 'Total passengers', '% change', 'Country'], ['1', 'Hartsfield–Jackson Atlanta International Airport', 'Atlanta, Georgia', 'ATL/KATL', '96178899', '1.9%', 'United States']]

+++++++++++++++++++++++++++++++++

['Rank;Airport;Location;IATA/ICAO;Tot

## creating dataframe

### fix the data from 2015 missing ICAO column

In [339]:
fields = []
fields.append(StructField('rank', StringType(), True))
fields.append(StructField('airport_name', StringType(), True))
fields.append(StructField('location', StringType(), True))
fields.append(StructField('country', StringType(), True))
fields.append(StructField('total_passengers', StringType(), True))
fields.append(StructField('change', StringType(), True))
fields.append(StructField('IATA', StringType(), True))
schema_total_passengers_2015 = StructType(fields)

fields = []
fields.append(StructField('rank', StringType(), True))
fields.append(StructField('airport_name', StringType(), True))
fields.append(StructField('location', StringType(), True))
fields.append(StructField('total_passengers', StringType(), True))
fields.append(StructField('change', StringType(), True))
fields.append(StructField('IATA', StringType(), True))
schema_international_passengers_2015 = StructType(fields)

In [340]:
total_passengers_2015 = sc.textFile(data_path + total_passengers_file + '2015' + '.csv')
international_passengers_2015 = sc.textFile(data_path + international_passengers_file + '2015' + '.csv')


# remove header
header = total_passengers_2015.first()
total_passengers_2015 = total_passengers_2015.filter(lambda l: l != header)
# create dataframe
df_total_passengers_2015 = sqlContext.createDataFrame(total_passengers_2015.map(lambda l: l.split(";")),schema_total_passengers_2015)
# add missing column by join using previous data (df_airport_data)
df_total_passengers_2015 = df_total_passengers_2015.join(df_airport_data.select(df_airport_data.IATA, df_airport_data.ICAO),"IATA")

# repeat for second dataset from 2015
international_passengers_2015 = sc.textFile(data_path + international_passengers_file + '2015' + '.csv')
header = international_passengers_2015.first()
international_passengers_2015 = international_passengers_2015.filter(lambda l: l != header)
temp = international_passengers_2015.map(lambda l: l.split(";"))
df_international_passengers_2015 = sqlContext.createDataFrame(international_passengers_2015.map(lambda l: l.split(";")),schema_international_passengers_2015)
df_international_passengers_2015 = df_international_passengers_2015.join(df_airport_data.select(df_airport_data.IATA, df_airport_data.ICAO),"IATA")

# print results
print("total passenger data 2015")
df_total_passengers_2015.show(2)
print()
print("++++++++++++++++++++++++++++++")
print()
print("international passenger data 2015")
df_international_passengers_2015.show(2)

total passenger data 2015
+----+----+--------------------+---------+-------+----------------+------+----+
|IATA|rank|        airport_name| location|country|total_passengers|change|ICAO|
+----+----+--------------------+---------+-------+----------------+------+----+
| FRA|  12|Flughafen Frankfu...|Frankfurt|Germany|        61032022|   2.5|EDDF|
| IST|  11|Atatürk Internati...| Istanbul| Turkey|        61346229|   8.2|LTBA|
+----+----+--------------------+---------+-------+----------------+------+----+
only showing top 2 rows


++++++++++++++++++++++++++++++

international passenger data 2015
+----+----+--------------------+-----------------+----------------+------+----+
|IATA|rank|        airport_name|         location|total_passengers|change|ICAO|
+----+----+--------------------+-----------------+----------------+------+----+
| PMI|  36|Aeropuerto de Pal...|Palma De Mallorca|        18107070|   0.6|LEPA|
| HEL|  48|Helsinki-Vantaa A...|         Helsinki|        13826868|   2.9|EFHK|
+-

### make the dataframes for the other datasets

In [342]:
fields = []
fields.append(StructField('rank', StringType(), True))
fields.append(StructField('airport_name', StringType(), True))
fields.append(StructField('location', StringType(), True))
fields.append(StructField('total_passengers', StringType(), True))
fields.append(StructField('change', StringType(), True))
fields.append(StructField('IATA', StringType(), True))
fields.append(StructField('ICAO', StringType(), True))
schema_passengers_data = StructType(fields)

temp = international_passengers_2013.map(lambda l: l.split(";"))
print(temp.take(2))
df_international_passengers_2013 = sqlContext.createDataFrame(international_passengers_2013.map(lambda l: l.split(";")), schema_passengers_data)
df_international_passengers_2014 = sqlContext.createDataFrame(international_passengers_2014.map(lambda l: l.split(";")), schema_passengers_data)
df_total_passengers_2013 = sqlContext.createDataFrame(total_passengers_2013.map(lambda l: l.split(";")), schema_passengers_data)
df_total_passengers_2014 = sqlContext.createDataFrame(total_passengers_2014.map(lambda l: l.split(";")), schema_passengers_data)


datasets = [df_international_passengers_2013,df_international_passengers_2014,df_total_passengers_2013,df_total_passengers_2014]
for df in datasets:
    print(df.show(2))
    print()
    print("+++++++++++++++++++++++++++++++++")
    print()

[['Rank', 'Airport', 'Location', 'IATA/ICAO', 'Total passengers', '% change'], ['1', 'London Heathrow Airport', 'Hillingdon, Greater London, United Kingdom', 'LHR/EGLL', '66689466', '4%']]


Py4JJavaError: An error occurred while calling o10850.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 650.0 failed 1 times, most recent failure: Lost task 0.0 in stage 650.0 (TID 1155, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 253, in main
    process()
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 248, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 379, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/spark/python/lib/pyspark.zip/pyspark/util.py", line 55, in wrapper
    return f(*args, **kwargs)
  File "/opt/spark/python/pyspark/sql/session.py", line 673, in prepare
    verify_func(obj)
  File "/opt/spark/python/pyspark/sql/types.py", line 1421, in verify
    verify_value(obj)
  File "/opt/spark/python/pyspark/sql/types.py", line 1400, in verify_struct
    "length of fields (%d)" % (len(obj), len(verifiers))))
ValueError: Length of object (6) does not match with length of fields (7)

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:330)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:470)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:453)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:284)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:253)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:836)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:836)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1651)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1639)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1638)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1638)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1872)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1821)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1810)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:363)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3278)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2489)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2489)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3259)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3258)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2489)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2703)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:254)
	at sun.reflect.GeneratedMethodAccessor99.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 253, in main
    process()
  File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 248, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 379, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/spark/python/lib/pyspark.zip/pyspark/util.py", line 55, in wrapper
    return f(*args, **kwargs)
  File "/opt/spark/python/pyspark/sql/session.py", line 673, in prepare
    verify_func(obj)
  File "/opt/spark/python/pyspark/sql/types.py", line 1421, in verify
    verify_value(obj)
  File "/opt/spark/python/pyspark/sql/types.py", line 1400, in verify_struct
    "length of fields (%d)" % (len(obj), len(verifiers))))
ValueError: Length of object (6) does not match with length of fields (7)

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:330)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:470)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:453)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:284)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:253)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:836)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:836)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


# Prepare passenger_data