In [1]:
from pyspark import SparkContext, SparkConf 
from pyspark.sql import SparkSession
import pyspark.sql as sql
from pyspark.sql.functions import col

In [2]:
conf = SparkConf().setAppName("Bikes_analysis").setMaster('yarn')

In [3]:
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

In [4]:
tripData = spark.read\
.option("header", True)\
.option("inferSchema", True)\
.option("timestampFormat", 'M/d/y H:m')\
.csv("trips.csv")

In [5]:
stationData = spark.read\
.option("header", True)\
.option("inferSchema", True)\
.option("timestampFormat", 'M/d/y')\
.csv("stations.csv")

In [6]:
tripData.printSchema()
stationData.printSchema()

root
 |-- id: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- start_date: timestamp (nullable = true)
 |-- start_station_name: string (nullable = true)
 |-- start_station_id: integer (nullable = true)
 |-- end_date: string (nullable = true)
 |-- end_station_name: string (nullable = true)
 |-- end_station_id: integer (nullable = true)
 |-- bike_id: integer (nullable = true)
 |-- subscription_type: string (nullable = true)
 |-- zip_code: string (nullable = true)

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- dock_count: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- installation_date: timestamp (nullable = true)



Задание 1 - Найти велосипед с максимальным временем пробега

In [7]:
result = tripData.select(col("bike_id"), col("duration"))\
.groupBy(col("bike_id"))\
.sum("duration")\
.sort("sum(duration)", ascending=True)\
.limit(1)\
.show()

+-------+-------------+
|bike_id|sum(duration)|
+-------+-------------+
|    687|          111|
+-------+-------------+



Задание 2 - Найти наибольшее геодезическое расстояние между станциями

In [8]:
from math import radians, cos, sin, asin, sqrt

def calculate_distance(lat1, lon1, lat2, lon2):
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371 
    return c * r

In [9]:
joined_stations = stationData.crossJoin(stationData)
joined_stations.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- dock_count: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- installation_date: timestamp (nullable = true)
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- dock_count: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- installation_date: timestamp (nullable = true)



In [None]:
result_2 = joined_stations\
.rdd\
.map(lambda x: calculate_distance(x[2], x[3], x[9], x[10]))\
.max()

result_2

Задание 3 - Найти путь велосипеда с максимальным временем пробега через станции

In [None]:
temp = tripData.orderBy(col("duration").desc())\
.limit(1)\
.collect()

result_3 = joined_stations\
.rdd\
.filter(lambda station_name: (station_name[1] == temp[0].start_station_name) 
        and (station_name[8] == temp[0].end_station_name))\
.map(lambda station: calculate_distance(station[2], station[3], station[9], station[10]))

print(result_3.collect()[0])

Задание 4 - Найти количество велосипедов в системе

In [None]:
result_4 = tripData.select(col("bike_id")).distinct().count()

result_4

Задание 5 - Найти пользователей потративших на поездки более 3 часов.

In [None]:
result_5 = tripData.select(col("id"), col("bike_id"), col("duration"))\
.where("duration > 10800")\
.show()