# Reading multiple csv files with Spark


In [1]:
# Import our SparkSession so we can use it
from pyspark.sql import SparkSession
# Create our SparkSession, this can take a couple minutes locally
spark = SparkSession.builder.appName("basics").getOrCreate()

In [2]:
# Load in data
from pyspark import SparkFiles #same
url = "Resources/Turnstile_data/*.csv"
df = spark.read.csv(url, header=True, mode="DROPMALFORMED")
df.show() 

+----+----+--------+--------------+---------+--------+----------+--------+-----------+--------+----------------------------------------------------------+
| C/A|Unit|     SCP|       Station|Line Name|Division|      Date|    Time|Description| Entries|Exits                                                     |
+----+----+--------+--------------+---------+--------+----------+--------+-----------+--------+----------------------------------------------------------+
|A033|R170|02-00-00|14 ST-UNION SQ| LNQR456W|     BMT|12/28/2018|00:00:00|    REGULAR| 2056268|                                                   5177036|
|A033|R170|02-06-01|14 ST-UNION SQ| LNQR456W|     BMT|12/28/2018|00:00:00|    REGULAR|70294362|                                                  20274025|
|A033|R170|02-00-02|14 ST-UNION SQ| LNQR456W|     BMT|12/28/2018|00:00:00|    REGULAR|14197229|                                                  13704110|
|A033|R170|02-00-01|14 ST-UNION SQ| LNQR456W|     BMT|12/28/2018|00:00

In [3]:
# Print our schema
df.printSchema()

root
 |-- C/A: string (nullable = true)
 |-- Unit: string (nullable = true)
 |-- SCP: string (nullable = true)
 |-- Station: string (nullable = true)
 |-- Line Name: string (nullable = true)
 |-- Division: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Entries: string (nullable = true)
 |-- Exits                                                     : string (nullable = true)



In [4]:
# Show the columns
df.columns

['C/A',
 'Unit',
 'SCP',
 'Station',
 'Line Name',
 'Division',
 'Date',
 'Time',
 'Description',
 'Entries',
 'Exits                                                     ']

In [5]:
df['Station']

Column<b'Station'>

In [6]:
df.select('Date', 'Entries').show()

+----------+--------+
|      Date| Entries|
+----------+--------+
|12/28/2018| 2056268|
|12/28/2018|70294362|
|12/28/2018|14197229|
|12/28/2018| 1806541|
|12/28/2018|15598097|
|12/28/2018| 6069026|
|12/28/2018| 4927946|
|12/28/2018| 1806541|
|12/28/2018|  694109|
|12/28/2018|70294362|
|12/28/2018| 2056268|
|12/28/2018| 4927946|
|12/28/2018| 6069026|
|12/28/2018|  694109|
|12/28/2018|14197229|
|12/28/2018|15598097|
|12/28/2018| 1923632|
|12/28/2018| 3687618|
|12/28/2018| 1923632|
|12/28/2018|68007701|
+----------+--------+
only showing top 20 rows



In [7]:
# import pandas as pd
# pandas_df = df.toPandas() 

In [9]:
df.orderBy(df["Entries"].desc()).head(5)

[Row(C/A='R626', Unit='R062', SCP='00-00-00', Station='CROWN HTS-UTICA', Line Name='34', Division='IRT', Date='06/17/2014', Time='08:00:00', Description='REGULAR', Entries='9999999', Exits                                                     ='2242502'),
 Row(C/A='B019', Unit='R149', SCP='00-00-02', Station='NEWKIRK PLAZA', Line Name='BQ', Division='BMT', Date='01/22/2018', Time='20:00:00', Description='REGULAR', Entries='9999997', Exits                                                     ='11600216'),
 Row(C/A='R248', Unit='R178', SCP='00-00-07', Station='77 ST', Line Name='6', Division='IRT', Date='11/29/2017', Time='16:00:00', Description='REGULAR', Entries='9999996', Exits                                                     ='1413693'),
 Row(C/A='H007', Unit='R248', SCP='00-00-00', Station='1 AVE', Line Name='L', Division='BMT', Date='08/08/2015', Time='00:00:00', Description='REGULAR', Entries='9999995', Exits                                                     ='10879540'),
 Row(C

In [10]:
df.count()

49230245

In [11]:
import pandas as pd
import numpy as np

import datetime as dt
import re
import time

import matplotlib.pyplot as plt

import seaborn as sns
