In [1]:
# -*- coding: utf-8 -*-

"""

Created on Jul 06 10:51:18 2020



@author: Wenqing Zhong and Julian Briggs

"""

'''

get_final_station_data(url,numofweek,station)

input: url for the turnstile website, how many weeks' data do you need, which station you want to work with

output: a csv file that includes data for one station

'''





from bs4 import BeautifulSoup

import requests

from urllib.parse import urlparse, urljoin

from itertools import islice

import pandas as pd

import os

import csv

import numpy as np





url = "http://web.mta.info/developers/turnstile.html"# the turnstile data website url



def is_valid(url):#check if the url is valid

    parsed = urlparse(url)

    return bool(parsed.netloc) and bool(parsed.scheme)



def get_all_urls(url, limit):

    #"limit" is the number of urls that you want to open, each url stores data for one week

    if(is_valid(url)==False):

        print("ERROR WITH URL INPUT")

       

    base=url

    soup = BeautifulSoup(requests.get(url).content, "html.parser")    

    urls=[]

    

    for link in islice(soup.find_all('a'),limit+37): 

        #find all internal urls in the trunstile data website

        #usually the first 37 internal urls are uesless, they don't store turnstile data so we skip them       

        href = link.attrs.get("href")   

        if(str(href).startswith("data")== True):#all internal urls that store turnstile data are in html format and they start with the word 'data'

            full=urljoin(base, href)

            urls.append(full)# store turnstile data urls        

    return urls





def get_one_csv(url,i):

    r = requests.get(url)#go to one turnstile data url(each url stores data for one week)

    

    with open('data%s.txt'%i, 'w') as file:#write the data in a text file

        try:

            file.write(r.text) #update the text file if it already exists

        except:

            return False

        

    df = pd.read_csv("data%s.txt"%i,delimiter=',')#convert the text file into a csv file

    df.to_csv('data%s.csv'%i)

    os.remove("data%s.txt"%i)

    # I tried to directly create csv files but for some reasons the format is wrong, so I used text files as intermediates 

    return





def get_all_csv(url,limit):# open 'limit' number of turnstile data urls and store data in csv files

    links=get_all_urls(url, limit)

    for i in range(len(links)):

        get_one_csv(links[i],i)        

    return len(links)





def get_station(station,linename,numofcsv): #get turnstile data of a certain station

    

    with open('%s.csv'%station, 'w',newline='') as csvfile: 

        with open('data0.csv') as head:

            headreader = csv.reader(head)  

            head= next(headreader)

            writer = csv.writer(csvfile, delimiter=",", quoting=csv.QUOTE_NONE) 

            writer.writerow(head)

            

        for i in range(numofcsv):

            with open('data%s.csv'%i) as f_obj:

                reader = csv.reader(f_obj)  

                for line in reader:      

                    if ((station in line) and (linename in line)):

                        writer.writerow(line)

    return





def get_final_station_data(url,numofweek,station,linename):

    #input: url for the turnstile website, how many weeks' data that you need, and which station you want to work with

    numofcsv=get_all_csv(url,numofweek)

    get_station(station,linename,numofcsv)

    

    for i in range(numofcsv):

        os.remove("data%s.csv"%i)

    return



get_final_station_data(url,1,"72 ST","123") #user inputs station name and what lines go to that station


In [2]:
path=r"C:\Users\Julia\Documents\Subway\code\72 ST.csv" #path for where to look for new station CSV
new_data=pd.read_csv(path)
new_data

Unnamed: 0.1,Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,142063,R161A,R452,01-00-00,72 ST,123,IRT,07/04/2020,01:00:00,REGULAR,173770,714636
1,142064,R161A,R452,01-00-00,72 ST,123,IRT,07/04/2020,05:00:00,REGULAR,173770,714646
2,142065,R161A,R452,01-00-00,72 ST,123,IRT,07/04/2020,09:00:00,REGULAR,173777,714700
3,142066,R161A,R452,01-00-00,72 ST,123,IRT,07/04/2020,13:00:00,REGULAR,173785,714806
4,142067,R161A,R452,01-00-00,72 ST,123,IRT,07/04/2020,17:00:00,REGULAR,173818,714951
5,142068,R161A,R452,01-00-00,72 ST,123,IRT,07/04/2020,21:00:00,REGULAR,173856,715040
6,142069,R161A,R452,01-00-00,72 ST,123,IRT,07/05/2020,01:00:00,REGULAR,173878,715108
7,142070,R161A,R452,01-00-00,72 ST,123,IRT,07/05/2020,05:00:00,REGULAR,173878,715111
8,142071,R161A,R452,01-00-00,72 ST,123,IRT,07/05/2020,09:00:00,REGULAR,173887,715160
9,142072,R161A,R452,01-00-00,72 ST,123,IRT,07/05/2020,13:00:00,REGULAR,173899,715282


In [4]:
day_slice=new_data.loc[(new_data['DATE']=='07/08/2020')] #select what day you want to look at
day_slice

Unnamed: 0.1,Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
24,142087,R161A,R452,01-00-00,72 ST,123,IRT,07/08/2020,01:00:00,REGULAR,174458,717219
25,142088,R161A,R452,01-00-00,72 ST,123,IRT,07/08/2020,05:00:00,REGULAR,174458,717220
26,142089,R161A,R452,01-00-00,72 ST,123,IRT,07/08/2020,09:00:00,REGULAR,174470,717455
27,142090,R161A,R452,01-00-00,72 ST,123,IRT,07/08/2020,13:00:00,REGULAR,174503,717690
28,142091,R161A,R452,01-00-00,72 ST,123,IRT,07/08/2020,17:00:00,REGULAR,174603,717874
29,142092,R161A,R452,01-00-00,72 ST,123,IRT,07/08/2020,21:00:00,REGULAR,174669,718033
66,142129,R161A,R452,01-00-01,72 ST,123,IRT,07/08/2020,01:00:00,REGULAR,175659,338980
67,142130,R161A,R452,01-00-01,72 ST,123,IRT,07/08/2020,05:00:00,REGULAR,175659,338981
68,142131,R161A,R452,01-00-01,72 ST,123,IRT,07/08/2020,09:00:00,REGULAR,175673,339101
69,142132,R161A,R452,01-00-01,72 ST,123,IRT,07/08/2020,13:00:00,REGULAR,175704,339245


In [5]:
np.unique(day_slice['TIME'],return_counts=True) #displays how many times each timestamp is repeated, which is the number of turnstiles at station
#Array ([X]) ---> X = number of turnstiles ta station

(array(['01:00:00', '05:00:00', '09:00:00', '13:00:00', '17:00:00',
        '21:00:00'], dtype=object),
 array([22, 22, 22, 22, 22, 22], dtype=int64))

In [25]:
#we need to:
#1. add up all entries and exits from every time slot for each day (add up each turnstile)
#2. subtract entries to get raw number for each time slot
#3. visualize the raw entries somehow

NameError: name 'station_slice' is not defined