In [1]:
# Imports are grouped at the top of the cell so every dependency is visible at a glance.
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    avg,
    col,
    count,
    current_date,
    month,
    to_date,
    when,
    year,
)

# Build (or reuse, when the cell is re-run) a Spark session running in local mode.
spark = (
    SparkSession.builder
    .appName("MySparkApp")
    .master("local[*]")
    .getOrCreate()
)

# Bare last expression: display the session summary as the cell output.
spark

In [2]:
# Load the trip records: the first row supplies the column names and
# Spark infers each column's type from the data.
customer_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("Uber/Uber/Customer_table.csv")
)

In [3]:
# Preview the first five rows to sanity-check the load and the column names.
customer_df.show(n=5)

+---------------+--------------+-----------+-------------------+-------------+-------+---------------+---------------+-------------+---------+------------+---------------+--------------+----------------------+---------+----------------+------------+--------------+----------------+--------+-----------+
|Pickup DateTime| Drop DateTime|Driver Name|Driver Phone Number|Trip Distance|Trip ID|Pickup Location|  Drop Location|Trip Duration|Trip Fare|Vehicle Type|   Trip Purpose|Passenger Name|Passenger Phone Number|Scheduled|Passenger Rating|Driver Rated|Payment Method|Payment Discount|Net Fare|Trip Status|
+---------------+--------------+-----------+-------------------+-------------+-------+---------------+---------------+-------------+---------+------------+---------------+--------------+----------------------+---------+----------------+------------+--------------+----------------+--------+-----------+
| 1/1/2016 21:11|1/1/2016 21:17|     Almire|         9298608912|           21| 318886|    F

## 1. Number of customers taking a trip from each pickup location
#### Required Columns : Pickup Location , Count of the number of trips

In [9]:
# Count the trips that start from each pickup location, busiest location first.
# Each row of customer_df is one trip, so a row count per group is the trip count.
(
    customer_df
    .groupBy("Pickup Location")
    .agg(count("*").alias("Number_of_Trips"))
    .orderBy(col("Number_of_Trips").desc())
    .show(truncate=False)  # truncate=False keeps long location names intact
)


+-----------------+---------------+
|Pickup Location  |Number_of_Trips|
+-----------------+---------------+
|Fort Pierce      |108            |
|Midtown          |78             |
|West Palm Beach  |54             |
|Cary             |52             |
|Lower Manhattan  |26             |
|Midtown East     |26             |
|Flatiron District|26             |
|East Harlem      |26             |
|Hudson Square    |26             |
|Jamaica          |26             |
|New York         |26             |
|Elmhurst         |26             |
+-----------------+---------------+



## 2. Top 10 longest trips by distance travelled
#### Required Columns : Passenger Name, Pickup Location, Drop Location, Trip Distance

In [10]:
# Keep only the columns the question asks for and sort the trips from the
# longest distance to the shortest.
longest_trips = (
    customer_df
    .select("Passenger Name", "Pickup Location", "Drop Location", "Trip Distance")
    .orderBy(col("Trip Distance").desc())
)

# Show only the ten longest trips; ties on distance come back in no guaranteed order.
longest_trips.show(10, truncate=False)

+--------------+---------------+-------------+-------------+
|Passenger Name|Pickup Location|Drop Location|Trip Distance|
+--------------+---------------+-------------+-------------+
|Price         |East Harlem    |Whitebridge  |80           |
|Darlleen      |Fort Pierce    |Cary         |80           |
|Virginie      |Elmhurst       |Cary         |80           |
|Charlena      |Midtown East   |Durham       |80           |
|Haskel        |Cary           |Whitebridge  |80           |
|Jacky         |West Palm Beach|Houston      |80           |
|Collette      |Fort Pierce    |Tanglewood   |80           |
|Moss          |Lower Manhattan|Morrisville  |80           |
|Elsy          |New York       |Cary         |79           |
|Daron         |Fort Pierce    |Cary         |79           |
+--------------+---------------+-------------+-------------+
only showing top 10 rows



## 3. Trips where the passenger rating was less than 3.1 and the trip fare was over $40.
#### Required Columns : Passenger Name,Trip Fare,Passenger Rating

# Trips where the passenger rating is below 3.1 AND the fare is above 40.
# Each comparison is parenthesised because `&` binds tighter than `<` / `>` in Python.
(
    customer_df
    .filter((col("Passenger Rating") < 3.1) & (col("Trip Fare") > 40))
    .select("Passenger Name", "Trip Fare", "Passenger Rating")
    .show()
)

## 4. Top 3 most common pickup and drop location pairs.

In [12]:
# Count the trips for every (pickup, drop) pair and keep the three most frequent.
# count("*") is left un-aliased, so Spark names the column "count(1)";
# that generated name is what the sort key refers to.
(
    customer_df
    .groupBy("Pickup Location", "Drop Location")
    .agg(count("*"))
    .orderBy(col("count(1)").desc())
    .show(3)  # top 3 pairs only
)


+---------------+-------------+--------+
|Pickup Location|Drop Location|count(1)|
+---------------+-------------+--------+
|        Midtown|         Cary|      18|
|    Fort Pierce|         Cary|      16|
|           Cary|         Cary|      14|
+---------------+-------------+--------+
only showing top 3 rows



## 5. Check for undercharging (fare less than 10 and distance more than 5 miles)

In [43]:
# Possible undercharging: the fare is below 10 although the trip distance exceeds 5.
# Each comparison is parenthesised because `&` binds tighter than `<` / `>` in Python.
(
    customer_df
    .filter((col("Trip Fare") < 10) & (col("Trip Distance") > 5))
    .select("Pickup Location", "Drop Location", "Trip Fare", "Trip Distance")
    .show()
)


+---------------+-------------+---------+-------------+
|Pickup Location|Drop Location|Trip Fare|Trip Distance|
+---------------+-------------+---------+-------------+
|           Cary|         Cary|     7.84|           68|
|West Palm Beach|         Cary|      7.5|           53|
|    Fort Pierce|      Tribeca|      7.5|           66|
+---------------+-------------+---------+-------------+

