In [2]:
"""
Created on Sun May 26 14:00:00 2024

@author: thaissa
"""

#EXTRACT DATA FROM AMAZONAWS AND CREATE THE BASE

import pandas as pd
import zipfile
import io
import requests
import seaborn as sbn
sbn.set_theme(style="darkgrid")

# The source CSVs come with two different header schemes: a capitalised one
# ("Start date", ...) and a snake_case one ("started_at", ...). Both are mapped
# onto the capitalised names, which form the schema of the final table.

# Canonical column name -> dtype enforced on every loaded table.
coltypes = {
    "Start date": "datetime64[ns]",
    "End date": "datetime64[ns]",
    "Start station": "category",
    "End station": "category",
    "Member type": "category",
}

# Raw header -> canonical column name. Canonical headers map onto themselves,
# snake_case headers map onto their canonical equivalent.
colrename = {canonical: canonical for canonical in coltypes}
colrename.update(
    {
        "started_at": "Start date",
        "ended_at": "End date",
        "start_station_name": "Start station",
        "end_station_name": "End station",
        "member_casual": "Member type",
    }
)

# Every raw header worth loading (the union of both schemes): snake_case
# headers first, followed by the canonical ones.
columns = [raw for raw in colrename if raw not in coltypes] + list(coltypes)


def selection(colname):
    """Tell whether a CSV header should be loaded.

    Used as the ``usecols`` callable of ``pd.read_csv``: pandas calls it once
    per header and keeps the column when the result is True.

    Parameters
    ----------
    colname : str
        Header name as it appears in the CSV file.

    Returns
    -------
    bool
        True when ``colname`` is listed in ``columns``, False otherwise.
    """
    return colname in columns

def _read_trip_archive(url, timeout=60):
    """Download one trip-data ZIP archive and return its CSV members as DataFrames.

    Every CSV member is read keeping only the headers accepted by
    ``selection``, renamed through ``colrename`` and cast to ``coltypes``.

    Parameters
    ----------
    url : str
        Address of the ZIP archive to download.
    timeout : float, default 60
        Seconds ``requests`` waits on the server before giving up, so that a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    list of pandas.DataFrame
        One frame per CSV member, in archive order.

    Raises
    ------
    requests.HTTPError
        When the server answers with an error status (e.g. the archive does
        not exist); raised before any attempt to parse the body as a ZIP.
    """
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors as such instead of a confusing BadZipFile raised
    # while trying to unzip an error page.
    response.raise_for_status()

    frames = []
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        for member in archive.namelist():
            # "__MACOSX/" entries are metadata files that merely share the
            # .csv suffix; they are not trip tables.
            if member.startswith("__MACOSX") or not member.endswith(".csv"):
                continue
            with archive.open(member) as handle:
                frame = pd.read_csv(
                    handle, usecols=selection, encoding="unicode_escape"
                )
            frame = frame.rename(columns=colrename)
            # Normalise capitalisation so a member type is spelled the same in
            # every file. The .str accessor propagates missing values, whereas
            # apply(str.capitalize) raises TypeError on the first NaN.
            frame["Member type"] = frame["Member type"].str.capitalize()
            frames.append(frame.astype(coltypes))
    return frames


tables = []

# Yearly archives: 2010 .. 2017. A failure here is not caught and aborts the
# cell.
for year in range(2010, 2018):
    url = f"https://s3.amazonaws.com/capitalbikeshare-data/{year}-capitalbikeshare-tripdata.zip"
    print(url)
    tables.extend(_read_trip_archive(url))

# Monthly archives: 2018-01 .. 2023-12. This part is best-effort: a month that
# cannot be downloaded or parsed is reported and skipped as a whole, and the
# remaining months are still loaded.
for year in range(2018, 2024):
    for month in range(1, 13):
        url = f"https://s3.amazonaws.com/capitalbikeshare-data/{year}{month:02d}-capitalbikeshare-tripdata.zip"
        print(url)
        try:
            tables.extend(_read_trip_archive(url))
        except Exception as error:
            print(f"error with {url} {error}")

# Combine all per-file dataframes into a single dataframe with a fresh
# 0..n-1 index.
df_2010_2023 = pd.concat(tables, ignore_index=True)
del tables  # the per-file frames are no longer needed once combined

# Concatenating categorical columns whose category sets differ between files
# falls back to object dtype, so the categorical dtypes are re-applied once on
# the combined frame to keep its memory footprint low.
df_2010_2023 = df_2010_2023.astype(
    {name: dtype for name, dtype in coltypes.items() if dtype == "category"}
)

# Summarise the combined dataframe
print(df_2010_2023.shape)
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() would only add a stray "None" line.
df_2010_2023.info()
df_2010_2023


https://s3.amazonaws.com/capitalbikeshare-data/2010-capitalbikeshare-tripdata.zip
https://s3.amazonaws.com/capitalbikeshare-data/2011-capitalbikeshare-tripdata.zip
https://s3.amazonaws.com/capitalbikeshare-data/2012-capitalbikeshare-tripdata.zip
https://s3.amazonaws.com/capitalbikeshare-data/2013-capitalbikeshare-tripdata.zip
https://s3.amazonaws.com/capitalbikeshare-data/2014-capitalbikeshare-tripdata.zip
https://s3.amazonaws.com/capitalbikeshare-data/2015-capitalbikeshare-tripdata.zip
https://s3.amazonaws.com/capitalbikeshare-data/2016-capitalbikeshare-tripdata.zip
https://s3.amazonaws.com/capitalbikeshare-data/2017-capitalbikeshare-tripdata.zip
https://s3.amazonaws.com/capitalbikeshare-data/201801-capitalbikeshare-tripdata.zip
https://s3.amazonaws.com/capitalbikeshare-data/201802-capitalbikeshare-tripdata.zip
https://s3.amazonaws.com/capitalbikeshare-data/201803-capitalbikeshare-tripdata.zip
https://s3.amazonaws.com/capitalbikeshare-data/201804-capitalbikeshare-tripdata.zip
https://

Unnamed: 0,Start date,End date,Start station,End station,Member type
0,2010-09-20 11:27:04,2010-09-20 11:43:56,M St & New Jersey Ave SE,4th & M St SW,Member
1,2010-09-20 11:41:22,2010-09-20 11:42:23,1st & N St SE,1st & N St SE,Member
2,2010-09-20 12:05:37,2010-09-20 12:50:27,5th & K St NW,19th St & Pennsylvania Ave NW,Member
3,2010-09-20 12:06:05,2010-09-20 12:29:32,5th & K St NW,Park Rd & Holmead Pl NW,Member
4,2010-09-20 12:10:43,2010-09-20 12:34:17,19th St & Pennsylvania Ave NW,15th & P St NW,Member
...,...,...,...,...,...
38894179,2023-12-16 00:08:39,2023-12-16 00:16:30,11th & S St NW,17th & P St NW,Member
38894180,2023-12-06 18:23:32,2023-12-06 18:27:26,Eastern Market / 7th & North Carolina Ave SE,Massachusetts Ave & 6th St NE,Member
38894181,2023-12-09 17:47:53,2023-12-09 18:03:16,8th & D St NW,16th & R St NW,Casual
38894182,2023-12-09 18:14:52,2023-12-09 18:25:48,5th & F St NW,Lincoln Rd & Seaton Pl NE/Harry Thomas Rec Center,Casual


In [3]:
# Clean-up step: discard exact duplicate rows first, then every row that
# still contains at least one missing value. The surviving rows keep their
# original index labels.
df_2010_2023 = df_2010_2023.drop_duplicates().dropna()
df_2010_2023

Unnamed: 0,Start date,End date,Start station,End station,Member type
0,2010-09-20 11:27:04,2010-09-20 11:43:56,M St & New Jersey Ave SE,4th & M St SW,Member
1,2010-09-20 11:41:22,2010-09-20 11:42:23,1st & N St SE,1st & N St SE,Member
2,2010-09-20 12:05:37,2010-09-20 12:50:27,5th & K St NW,19th St & Pennsylvania Ave NW,Member
3,2010-09-20 12:06:05,2010-09-20 12:29:32,5th & K St NW,Park Rd & Holmead Pl NW,Member
4,2010-09-20 12:10:43,2010-09-20 12:34:17,19th St & Pennsylvania Ave NW,15th & P St NW,Member
...,...,...,...,...,...
38894179,2023-12-16 00:08:39,2023-12-16 00:16:30,11th & S St NW,17th & P St NW,Member
38894180,2023-12-06 18:23:32,2023-12-06 18:27:26,Eastern Market / 7th & North Carolina Ave SE,Massachusetts Ave & 6th St NE,Member
38894181,2023-12-09 17:47:53,2023-12-09 18:03:16,8th & D St NW,16th & R St NW,Casual
38894182,2023-12-09 18:14:52,2023-12-09 18:25:48,5th & F St NW,Lincoln Rd & Seaton Pl NE/Harry Thomas Rec Center,Casual


In [None]:
# Persist the dataframe, as left by the cells above, to "base.parquet"
# (a path relative to the current working directory).
df_2010_2023.to_parquet("base.parquet")