# Reading multiple csv files with Spark


In [2]:
# Import our SparkSession so we can use it
from pyspark.sql import SparkSession
# Create our SparkSession, this can take a couple minutes locally
spark = SparkSession.builder.appName("basics").getOrCreate()

In [7]:
# Load in data
from pyspark import SparkFiles #same
url = "Resources/Turnstile_data/*.csv"
df = spark.read.csv(url, header=True, mode="DROPMALFORMED")
df.show() 

+----+----+--------+-------------+---------+--------+----------+--------+-----------+-------+----------------------------------------------------------+
| C/A|Unit|     SCP|      Station|Line Name|Division|      Date|    Time|Description|Entries|Exits                                                     |
+----+----+--------+-------------+---------+--------+----------+--------+-----------+-------+----------------------------------------------------------+
|A002|R051|02-00-00|LEXINGTON AVE|   NQR456|     BMT|12/31/2014|23:00:00|    REGULAR|4943320|                                                   1674736|
|A002|R051|02-00-00|LEXINGTON AVE|   NQR456|     BMT|12/31/2014|19:00:00|    REGULAR|4943145|                                                   1674709|
|A002|R051|02-00-00|LEXINGTON AVE|   NQR456|     BMT|12/31/2014|15:00:00|    REGULAR|4942439|                                                   1674636|
|A002|R051|02-00-00|LEXINGTON AVE|   NQR456|     BMT|12/31/2014|11:00:00|    REGUL

In [8]:
# Print our schema
df.printSchema()

root
 |-- C/A: string (nullable = true)
 |-- Unit: string (nullable = true)
 |-- SCP: string (nullable = true)
 |-- Station: string (nullable = true)
 |-- Line Name: string (nullable = true)
 |-- Division: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Entries: string (nullable = true)
 |-- Exits                                                     : string (nullable = true)



In [9]:
# Show the columns
df.columns

['C/A',
 'Unit',
 'SCP',
 'Station',
 'Line Name',
 'Division',
 'Date',
 'Time',
 'Description',
 'Entries',
 'Exits                                                     ']

In [10]:
df['Station']

Column<b'Station'>

In [12]:
rider_data = df.select('Date','Station', 'Entries').show()

+----------+-------------+-------+
|      Date|      Station|Entries|
+----------+-------------+-------+
|12/31/2014|LEXINGTON AVE|4943320|
|12/31/2014|LEXINGTON AVE|4943145|
|12/31/2014|LEXINGTON AVE|4942439|
|12/31/2014|LEXINGTON AVE|4942012|
|12/31/2014|LEXINGTON AVE|4941987|
|12/31/2014|LEXINGTON AVE|4941910|
|12/31/2014|LEXINGTON AVE|4941892|
|12/30/2014|LEXINGTON AVE|4941835|
|12/30/2014|LEXINGTON AVE|4941463|
|12/30/2014|LEXINGTON AVE|4940620|
|12/30/2014|LEXINGTON AVE|4940338|
|12/30/2014|LEXINGTON AVE|4940232|
|12/30/2014|LEXINGTON AVE|4940218|
|12/29/2014|LEXINGTON AVE|4940171|
|12/29/2014|LEXINGTON AVE|4939811|
|12/29/2014|LEXINGTON AVE|4938984|
|12/29/2014|LEXINGTON AVE|4938701|
|12/29/2014|LEXINGTON AVE|4938574|
|12/29/2014|LEXINGTON AVE|4938566|
|12/28/2014|LEXINGTON AVE|4938544|
+----------+-------------+-------+
only showing top 20 rows



In [13]:
rider_data.groupby(by='Stations')

AttributeError: 'NoneType' object has no attribute 'groupby'

In [9]:
df.orderBy(df["Entries"].desc()).head(5)

[Row(C/A='R626', Unit='R062', SCP='00-00-00', Station='CROWN HTS-UTICA', Line Name='34', Division='IRT', Date='06/17/2014', Time='08:00:00', Description='REGULAR', Entries='9999999', Exits                                                     ='2242502'),
 Row(C/A='B019', Unit='R149', SCP='00-00-02', Station='NEWKIRK PLAZA', Line Name='BQ', Division='BMT', Date='01/22/2018', Time='20:00:00', Description='REGULAR', Entries='9999997', Exits                                                     ='11600216'),
 Row(C/A='R248', Unit='R178', SCP='00-00-07', Station='77 ST', Line Name='6', Division='IRT', Date='11/29/2017', Time='16:00:00', Description='REGULAR', Entries='9999996', Exits                                                     ='1413693'),
 Row(C/A='H007', Unit='R248', SCP='00-00-00', Station='1 AVE', Line Name='L', Division='BMT', Date='08/08/2015', Time='00:00:00', Description='REGULAR', Entries='9999995', Exits                                                     ='10879540'),
 Row(C

In [10]:
df.count()

49230245

In [11]:
import pandas as pd
import numpy as np

import datetime as dt
import re
import time

import matplotlib.pyplot as plt

import seaborn as sns
