# Reading a CSV file

In [1]:
import os
# Remember the directory the kernel was started from, then report it.
curr_dir = os.getcwd()
startup_message = "Running from {}".format(curr_dir)
print(startup_message)

Running from /workdir/spark-3.0.1-bin-hadoop2.7-hive1.2/spark-ml/src/main/jupyter


In [2]:
from pyspark.sql import SparkSession

# Construct the `SparkSession` through its `builder` attribute: name the
# application "spark", then finish with `getOrCreate()` to obtain the session.
spark = (
    SparkSession.builder
    .appName("spark")
    .getOrCreate()
)

# Show the version of Spark this session is running on:
spark.version

'3.0.1'

### Loading the data

In [3]:
# Location of the raw ride data. The default is a `file://` URI, i.e. a copy on the
# local filesystem (not HDFS). Set the DUOCAR_RIDES_PATH environment variable to
# read from somewhere else, for example an HDFS path such as "/duocar/raw/rides/".
RIDES_PATH = os.environ.get(
    "DUOCAR_RIDES_PATH",
    "file:///workdir/spark-3.0.1-bin-hadoop2.7-hive1.2/spark-ml/duocar/raw/rides",
)

# Use the `csv` method of the `DataFrameReader` class to read the raw ride data
# into a DataFrame. `header=True` takes the column names from the first line of
# the file and `inferSchema=True` asks Spark to infer each column's data type.
rides = spark.read.csv(RIDES_PATH, sep=",", header=True, inferSchema=True)

### Examining the schema of a DataFrame

In [4]:
# Print the schema as a tree: one line per column with its inferred type and nullability.
rides.printSchema()

root
 |-- id: integer (nullable = true)
 |-- driver_id: long (nullable = true)
 |-- rider_id: long (nullable = true)
 |-- date_time: string (nullable = true)
 |-- utc_offset: integer (nullable = true)
 |-- service: string (nullable = true)
 |-- origin_lat: double (nullable = true)
 |-- origin_lon: double (nullable = true)
 |-- dest_lat: double (nullable = true)
 |-- dest_lon: double (nullable = true)
 |-- distance: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- cancelled: integer (nullable = true)
 |-- star_rating: integer (nullable = true)



Access the `columns` attribute to get a list of column names:

In [5]:
# `columns` is an attribute (no parentheses); its value is a Python list of column-name strings.
rides.columns

['id',
 'driver_id',
 'rider_id',
 'date_time',
 'utc_offset',
 'service',
 'origin_lat',
 'origin_lon',
 'dest_lat',
 'dest_lon',
 'distance',
 'duration',
 'cancelled',
 'star_rating']

Access the `dtypes` attribute to get a list of column names and data types:

In [6]:
# Each entry is a (column name, type name) tuple. The type names use a short form,
# e.g. 'bigint' and 'int' where `printSchema` reports 'long' and 'integer'.
rides.dtypes

[('id', 'int'),
 ('driver_id', 'bigint'),
 ('rider_id', 'bigint'),
 ('date_time', 'string'),
 ('utc_offset', 'int'),
 ('service', 'string'),
 ('origin_lat', 'double'),
 ('origin_lon', 'double'),
 ('dest_lat', 'double'),
 ('dest_lon', 'double'),
 ('distance', 'int'),
 ('duration', 'int'),
 ('cancelled', 'int'),
 ('star_rating', 'int')]

Access the `schema` attribute to get the schema as an instance of the `StructType` class:

In [7]:
# The `StructType` contains one `StructField` (name, data type, nullable flag) per column.
rides.schema

StructType(List(StructField(id,IntegerType,true),StructField(driver_id,LongType,true),StructField(rider_id,LongType,true),StructField(date_time,StringType,true),StructField(utc_offset,IntegerType,true),StructField(service,StringType,true),StructField(origin_lat,DoubleType,true),StructField(origin_lon,DoubleType,true),StructField(dest_lat,DoubleType,true),StructField(dest_lon,DoubleType,true),StructField(distance,IntegerType,true),StructField(duration,IntegerType,true),StructField(cancelled,IntegerType,true),StructField(star_rating,IntegerType,true)))

### Computing the number of rows and columns of a DataFrame

Call the `count` method to compute the number of rows:

In [8]:
# Returns the total number of rows in the DataFrame as an integer.
rides.count()

48775

Pass the list of column names to the Python `len` function to compute the number of columns:

In [9]:
# `columns` is a plain Python list, so the built-in `len` applies to it directly.
len(rides.columns)

14

### Examining a few rows of a DataFrame

Call the `show` method to print some rows of a DataFrame:

In [10]:
# Default layout: the first 5 rows as a text table.
rides.show(n=5)
# Same rows, with every displayed value cut down to at most 5 characters.
rides.show(n=5, truncate=5)
# Same rows, printed vertically (one "column | value" line per field).
rides.show(n=5, vertical=True)

+---+------------+------------+----------------+----------+-------+----------+----------+---------+----------+--------+--------+---------+-----------+
| id|   driver_id|    rider_id|       date_time|utc_offset|service|origin_lat|origin_lon| dest_lat|  dest_lon|distance|duration|cancelled|star_rating|
+---+------------+------------+----------------+----------+-------+----------+----------+---------+----------+--------+--------+---------+-----------+
|  1|220200000214|220200000084|2017-02-01 00:14|        -6|   null| 46.850956|-96.902849| 46.86005|-96.825442|   10123|     729|        0|          5|
|  2|220200000107|220200000462|2017-02-01 00:36|        -6|   null| 46.900432|-96.765807|46.840588|-96.868087|   16043|    1299|        0|          5|
|  3|220200000214|220200000489|2017-02-01 02:26|        -6|   Noir| 46.868382|-96.902718|46.815272|-96.862056|    9362|     736|        0|          5|
|  4|220200000067|220200000057|2017-02-01 03:00|        -6|   null| 46.908567|-96.905391| 46.9

Call the `head` or `take` method to get a list of `Row` objects from a DataFrame:

In [11]:
# `head(n)` and `take(n)` each return the first n rows as a list of `Row` objects.
# A notebook cell only renders the value of its final expression, so the `head`
# result is passed to `display` (the IPython built-in available in notebook
# kernels); otherwise it would be computed and silently thrown away.
display(rides.head(5))
rides.take(5)

[Row(id=1, driver_id=220200000214, rider_id=220200000084, date_time='2017-02-01 00:14', utc_offset=-6, service=None, origin_lat=46.850956, origin_lon=-96.902849, dest_lat=46.86005, dest_lon=-96.825442, distance=10123, duration=729, cancelled=0, star_rating=5),
 Row(id=2, driver_id=220200000107, rider_id=220200000462, date_time='2017-02-01 00:36', utc_offset=-6, service=None, origin_lat=46.900432, origin_lon=-96.765807, dest_lat=46.840588, dest_lon=-96.868087, distance=16043, duration=1299, cancelled=0, star_rating=5),
 Row(id=3, driver_id=220200000214, rider_id=220200000489, date_time='2017-02-01 02:26', utc_offset=-6, service='Noir', origin_lat=46.868382, origin_lon=-96.902718, dest_lat=46.815272, dest_lon=-96.862056, distance=9362, duration=736, cancelled=0, star_rating=5),
 Row(id=4, driver_id=220200000067, rider_id=220200000057, date_time='2017-02-01 03:00', utc_offset=-6, service=None, origin_lat=46.908567, origin_lon=-96.905391, dest_lat=46.90438, dest_lon=-96.793999, distance=90

### Some more: sampling rows, converting to pandas, and summary statistics

In [12]:
# Re-print the schema for reference before working with a small sample.
rides.printSchema()

# Restrict the DataFrame to 5 rows and print them as a text table.
r5 = rides.limit(5)
r5.show()

# Convert the 5-row sample to a pandas DataFrame. A bare `rp` in the middle of a
# cell would not be rendered (only the last expression is), so show it explicitly
# with `display`, the IPython built-in available in notebook kernels.
rp = r5.toPandas()
display(rp)

# Build the summary statistics (count, mean, stddev, min, max) once and reuse the
# result for both the text table and the pandas rendering.
r5_summary = r5.describe()
r5_summary.show()
r5_summary.toPandas()

root
 |-- id: integer (nullable = true)
 |-- driver_id: long (nullable = true)
 |-- rider_id: long (nullable = true)
 |-- date_time: string (nullable = true)
 |-- utc_offset: integer (nullable = true)
 |-- service: string (nullable = true)
 |-- origin_lat: double (nullable = true)
 |-- origin_lon: double (nullable = true)
 |-- dest_lat: double (nullable = true)
 |-- dest_lon: double (nullable = true)
 |-- distance: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- cancelled: integer (nullable = true)
 |-- star_rating: integer (nullable = true)

+---+------------+------------+----------------+----------+-------+----------+----------+---------+----------+--------+--------+---------+-----------+
| id|   driver_id|    rider_id|       date_time|utc_offset|service|origin_lat|origin_lon| dest_lat|  dest_lon|distance|duration|cancelled|star_rating|
+---+------------+------------+----------------+----------+-------+----------+----------+---------+----------+--------+------

Unnamed: 0,summary,id,driver_id,rider_id,date_time,utc_offset,service,origin_lat,origin_lon,dest_lat,dest_lon,distance,duration,cancelled,star_rating
0,count,5.0,5.0,5.0,5,5.0,1,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
1,mean,3.0,220200000133.8,220200000220.8,,-6.0,,46.8848402,-96.8565144,46.857864,-96.8269632,9932.8,851.6,0.0,5.0
2,stddev,1.5811388300841898,75.01133530809182,234.12112249859047,,0.0,,0.0242208628500331,0.0660864257378173,0.0331878556402808,0.0379246526773824,3938.490154868995,250.8959943881129,0.0,0.0
3,min,1.0,220200000067.0,220200000012.0,2017-02-01 00:14,-6.0,Noir,46.850956,-96.905391,46.815272,-96.868087,5076.0,721.0,0.0,5.0
4,max,5.0,220200000214.0,220200000489.0,2017-02-01 03:49,-6.0,Noir,46.908567,-96.765807,46.90438,-96.785232,16043.0,1299.0,0.0,5.0


### Stopping the application
Call the `stop` method to stop the application:

In [13]:
# Stop the SparkSession created earlier in this notebook; run this cell last.
spark.stop()