In [1]:
import findspark
findspark.init()
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('UseCase_2').getOrCreate()

In [4]:
sc = spark.sparkContext

In [5]:
from pyspark.sql.functions import desc,col

### Travel Dataset

#### DataFrame

In [6]:
df = spark.read.format('csv').options(header=False, inferSchema=False, delimiter='\t').load('TravelData.txt')

In [7]:
df.show(2)

+-------+---+---+---+---+---+---+---+---+--------------------+--------------------+----+----+----------------+----+----+----+--------------------+
|    _c0|_c1|_c2|_c3|_c4|_c5|_c6|_c7|_c8|                 _c9|                _c10|_c11|_c12|            _c13|_c14|_c15|_c16|                _c17|
+-------+---+---+---+---+---+---+---+---+--------------------+--------------------+----+----+----------------+----+----+----+--------------------+
|ZIH-ZIH|ZIH|ZIH|  4|  2|  0|  0|  0|  0|2014-10-23 00:00:...|2014-10-25 00:00:...|   0|   0|2003.19995117188|null|null|null| Viceroy Zihuatanejo|
|ZIH-ZIH|ZIH|ZIH|  4|  2|  0|  0|  0|  0|2014-10-23 00:00:...|2014-10-25 00:00:...|   0|   0|         1556.76|null|null|null|Capella Ixtapa Re...|
+-------+---+---+---+---+---+---+---+---+--------------------+--------------------+----+----+----------------+----+----+----+--------------------+
only showing top 2 rows



#### RDD

In [9]:
travel_file = sc.textFile('TravelData.txt')

In [18]:
travel_file.collect()

['ZIH-ZIH\tZIH\tZIH\t4\t2\t0\t0\t0\t0\t2014-10-23 00:00:00.000\t2014-10-25 00:00:00.000\t0\t0\t2003.19995117188\t\t\t\tViceroy Zihuatanejo',
 'ZIH-ZIH\tZIH\tZIH\t4\t2\t0\t0\t0\t0\t2014-10-23 00:00:00.000\t2014-10-25 00:00:00.000\t0\t0\t1556.76\t\t\t\tCapella Ixtapa Resort & Spa',
 'YYZ-YYZ\tYYZ\tYYZ\t4\t1\t2\t0\t0\t0\t2014-11-24 00:00:00.000\t2014-11-26 00:00:00.000\t0\t0\t268.02\t\t\t\tHilton Hotel and Suites Niagara Falls/Fallsview',
 'YYZ-YYZ\tYYZ\tYYZ\t4\t1\t2\t0\t0\t0\t2014-11-24 00:00:00.000\t2014-11-26 00:00:00.000\t0\t0\t674.36\t\t\t\tHilton Hotel and Suites Niagara Falls/Fallsview',
 'YYZ-YYZ\tYYZ\tYYZ\t2\t0\t1\t0\t0\t0\t2014-11-11 08:00:00.000\t2014-11-14 08:00:00.000\t0\t254.19\t0\t\t\tZL\t',
 'YYZ-YYZ\tYYZ\tYYZ\t4\t1\t0\t0\t0\t0\t2015-02-16 00:00:00.000\t2015-02-22 00:00:00.000\t0\t0\t916.46\t\t\t\tHilton Suites Conference Centre and Spa',
 'YYC-YYC\tYYC\tYYC\t4\t1\t0\t0\t0\t0\t2014-02-03 00:00:00.000\t2014-02-08 00:00:00.000\t0\t0\t903.36\t\t\t\tDelta Bow Valley',
 'YYC-YY

#### Problem Statement 1

##### Top 20 destination people travel the most: Based on the given data, we can find the most popular destination that people travel frequently. There are many destinations out of which we will find only first 20, based on trips booked for particular destinations.

In [19]:
df.groupBy('_c2').count().select(col('_c2').alias('destination'),'count').sort(desc('count')).show(20)

+-----------+-----+
|destination|count|
+-----------+-----+
|        MIA|  396|
|        SFO|  290|
|        LAS|  202|
|        LAX|  162|
|        DFW|  102|
|        DEN|   64|
|        ORD|   57|
|        PHL|   54|
|        IAH|   50|
|        JFK|   45|
|        PHX|   44|
|        FLL|   40|
|        ATL|   36|
|        MCO|   31|
|        BOS|   31|
|        SAN|   27|
|        WAS|   25|
|        CUN|   24|
|        AUS|   22|
|        LON|   22|
+-----------+-----+
only showing top 20 rows



In [None]:
travel_file.map(lambda lines: lines.split('\t')).first()

#### Problem Statement 2


##### Top 20 locations from where people travel the most: We can find the places from where most of the trips are undertaken, based on the booked trip count.

In [57]:
df.groupBy('_c1').count().select(col('_c1').alias('source'),'count').sort(desc('count')).show(20)

+------+-----+
|source|count|
+------+-----+
|   DFW|  504|
|   MIA|  293|
|   LAS|  272|
|   BOM|  167|
|   SFO|  131|
|   ORD|  101|
|   LAX|   72|
|   DEN|   55|
|   PHL|   41|
|   IAH|   37|
|   FLL|   35|
|   PHX|   33|
|   JFK|   31|
|   WAS|   24|
|   HOU|   19|
|   ATL|   19|
|   DXB|   18|
|   BCN|   17|
|   BOS|   17|
|   SAN|   17|
+------+-----+
only showing top 20 rows



#### Problem Statement 3

##### Top 20 cities that generate high airline revenues for travel, so that the site can concentrate on offering discount on booking, to those cities to attract more bookings.

In [58]:
df.filter(df['_c3'] == 1).groupBy('_c2').count().select(col('_c2').alias('destination'),'count').sort(desc('count')).show(20)

+-----------+-----+
|destination|count|
+-----------+-----+
|        MIA|   84|
|        SFO|   68|
|        LAS|   54|
|        LAX|   42|
|        IAH|   24|
|        DFW|   23|
|        PHX|   18|
|        BOS|   17|
|        ORD|   15|
|        NYC|   13|
|        DCA|    9|
|        WAS|    8|
|        AUS|    8|
|        DEN|    7|
|        MEM|    7|
|        JFK|    7|
|        SYD|    6|
|        PHL|    6|
|        ATL|    6|
|        BKK|    5|
+-----------+-----+
only showing top 20 rows



### Clossing spark session

In [None]:
spark.stop()