In [3]:
import geopandas as gpd
import pandas as pd
import os

import zipfile
from tqdm.autonotebook import tqdm
import warnings

In [4]:
warnings.filterwarnings("ignore")

In [5]:
data_dir = os.path.join(os.getcwd(), "data")
unzip_files = False  # change if u want to unzip files

In [6]:
if unzip_files:
    for filename in os.listdir(data_dir):
        if filename.endswith(".zip"):
            filepath = os.path.join(data_dir, filename)
            with zipfile.ZipFile(filepath, "r") as zip_ref:
                zip_ref.extractall(data_dir)
                print(f"Extracted {filepath} to data_dir")

In [6]:
def csv_to_parquet(input_dir, verbose=False):
    for root, _, files in os.walk(input_dir):
        for file in files:
            if file.endswith(".csv"):
                csv_path = os.path.join(root, file)

                df = pd.read_csv(csv_path, dtype=object)

                parquet_path = csv_path.replace(".csv", ".parquet")
                df.to_parquet(parquet_path)

                if verbose:
                    print(f"{parquet_path} saved in dir")

                os.remove(csv_path)
                if verbose:
                    print(f"{csv_path} deleted")

In [None]:
csv_to_parquet(
    "C:/Users/Kacper Kozaczko/Desktop/Stuff/PWr/II_semestr/Spatial/GEO_EDA/preprocessing/nyc_bike/data/2013-citibike-tripdata",
    verbose=True,
)

In [81]:
for year_dir in tqdm(
    os.listdir(data_dir), desc="Year directories transfered", colour="magenta"
):
    year_dir_path = os.path.join(data_dir, year_dir)
    csv_to_parquet(year_dir_path, verbose=False)
    print(f"Transfered to .parquet dir: {year_dir_path}")

Year directories transfered:   0%|          | 0/11 [00:00<?, ?it/s]

Transfered to .parquet dir: c:\Users\Kacper Kozaczko\Desktop\Stuff\PWr\II_semestr\Spatial\GEO_EDA\preprocessing\nyc_bike\data\2013-citibike-tripdata
Transfered to .parquet dir: c:\Users\Kacper Kozaczko\Desktop\Stuff\PWr\II_semestr\Spatial\GEO_EDA\preprocessing\nyc_bike\data\2014-citibike-tripdata
Transfered to .parquet dir: c:\Users\Kacper Kozaczko\Desktop\Stuff\PWr\II_semestr\Spatial\GEO_EDA\preprocessing\nyc_bike\data\2015-citibike-tripdata
Transfered to .parquet dir: c:\Users\Kacper Kozaczko\Desktop\Stuff\PWr\II_semestr\Spatial\GEO_EDA\preprocessing\nyc_bike\data\2016-citibike-tripdata
Transfered to .parquet dir: c:\Users\Kacper Kozaczko\Desktop\Stuff\PWr\II_semestr\Spatial\GEO_EDA\preprocessing\nyc_bike\data\2017-citibike-tripdata
Transfered to .parquet dir: c:\Users\Kacper Kozaczko\Desktop\Stuff\PWr\II_semestr\Spatial\GEO_EDA\preprocessing\nyc_bike\data\2018-citibike-tripdata
Transfered to .parquet dir: c:\Users\Kacper Kozaczko\Desktop\Stuff\PWr\II_semestr\Spatial\GEO_EDA\preproce

In [7]:
data_2013 = pd.read_parquet(
    os.path.join(
        data_dir,
        "2013-citibike-tripdata",
        "6_June",
        "201306-citibike-tripdata_1.parquet",
    )
)

In [8]:
data_2013.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,695,2013-06-01 00:00:01,2013-06-01 00:11:36,444,Broadway & W 24 St,40.742354,-73.989151,434.0,9 Ave & W 18 St,40.743174,-74.003664,19678,Subscriber,1983.0,1
1,693,2013-06-01 00:00:08,2013-06-01 00:11:41,444,Broadway & W 24 St,40.742354,-73.989151,434.0,9 Ave & W 18 St,40.743174,-74.003664,16649,Subscriber,1984.0,1
2,2059,2013-06-01 00:00:44,2013-06-01 00:35:03,406,Hicks St & Montague St,40.695128,-73.995951,406.0,Hicks St & Montague St,40.695128,-73.995951,19599,Customer,,0
3,123,2013-06-01 00:01:04,2013-06-01 00:03:07,475,E 15 St & Irving Pl,40.735243,-73.987586,262.0,Washington Park,40.691782,-73.97373,16352,Subscriber,1960.0,1
4,1521,2013-06-01 00:01:22,2013-06-01 00:26:43,2008,Little West St & 1 Pl,40.705693,-74.016777,310.0,State St & Smith St,40.689269,-73.989129,15567,Subscriber,1983.0,1


In [10]:
data_2023 = pd.read_parquet(
    os.path.join(
        data_dir,
        "2023-citibike-tripdata",
        "6_June",
        "202306-citibike-tripdata_1.parquet",
    )
)

In [20]:
data_2023.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,984F50BCBC76DD9A,classic_bike,2023-06-11 06:54:21,2023-06-11 07:12:28,W 84 St & Columbus Ave,7382.04,Amsterdam Ave & W 125 St,7800.03,40.78499979,-73.97283406,40.813358,-73.956461,member
1,03E3D62E7FB76B05,classic_bike,2023-06-19 15:23:11,2023-06-19 16:00:05,E 89 St & York Ave,7204.08,Mott St & Prince St,5561.04,40.777957678,-73.945928335,40.72317958,-73.99480012,member
2,8E7EE421A0B8BBF3,classic_bike,2023-06-06 16:07:05,2023-06-06 16:15:14,E 51 St & 2 Ave,6575.03,E 25 St & 1 Ave,6004.07,40.755357265,-73.967488885,40.7381765,-73.97738662,member
3,24D66A0C46493CB1,classic_bike,2023-06-26 19:52:23,2023-06-26 19:55:47,India St Pier,5794.05,Franklin St & Dupont St,5944.01,40.731734785883454,-73.9612390102593,40.73564,-73.95866,member
4,E944882A074B8F61,classic_bike,2023-06-05 08:57:57,2023-06-05 09:13:36,E 47 St & 2 Ave,6498.1,5 Ave & E 29 St,6248.06,40.75323098,-73.97032517,40.7451677,-73.98683077,member


In [14]:
print(data_2023.member_casual.unique())
print(data_2013.usertype.unique())

['member' 'casual']
['Subscriber' 'Customer']


In [7]:
data_2013.columns

Index(['tripduration', 'starttime', 'stoptime', 'start station id',
       'start station name', 'start station latitude',
       'start station longitude', 'end station id', 'end station name',
       'end station latitude', 'end station longitude', 'bikeid', 'usertype',
       'birth year', 'gender'],
      dtype='object')

In [8]:
data_2023.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual'],
      dtype='object')

In [10]:
len(data_2013.columns)

15

## Analiza

lata 2013-2020 włącznie mają kolumny: ['tripduration', 
'starttime', 'stoptime', 'start station id',
       'start station name', 'start station latitude',
       'start station longitude', 'end station id', 'end station name',
       'end station latitude', 'end station longitude', 'bikeid', 'usertype',
       'birth year', 'gender']

lata 2021-2023 mają kolumny: ['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual']


### W pierwszej grupie nie ma: 

ride_id, 

rideable_type

member_casual



### W drugiej grupie nie ma:

tripduration -> da się wyliczić i dodać

gender

birthyear

bike_id

usertype


# Merge into table per year 2013-2020

In [8]:
for year_dir in tqdm(
    os.listdir(data_dir), desc="Year directories transfered", colour="magenta"
):
    print(year_dir[:4])
    year_dir_path = os.path.join(data_dir, year_dir)

Year directories transfered: 100%|[35m██████████[0m| 13/13 [00:00<?, ?it/s]

2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
chec
chec





In [36]:
default_columns = list(data_2013.columns)


def concatenate_dfs(start_year=2013, end_year=2020, default_columns=default_columns):
    concatenated_df = pd.DataFrame(columns=default_columns)
    dirs = [
        item
        for item in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, item))
    ]
    for year_dir in tqdm(
        dirs,
        desc="Year analyzed",
        colour="magenta",
        total=end_year - start_year + 1,
        leave=False,
    ):
        if start_year <= int(year_dir[:4]) <= end_year:
            year_dir_path = os.path.join(data_dir, year_dir)
            for root, _, files in os.walk(year_dir_path):
                for file in files:
                    if file.endswith(".parquet"):
                        path = os.path.join(root, file)
                        df = pd.read_parquet(path)
                        concatenated_df = pd.concat(
                            [concatenated_df, df], axis=0, ignore_index=True
                        )
            name = (
                f"nyc_bike_{year_dir[:4]}.parquet"
                if start_year == end_year
                else f"nyc_bike{start_year}-{year_dir[:4]}.parquet"
            )
            df.to_parquet(os.path.join(data_dir, name))
    concatenated_df.drop_duplicates(inplace=True)

    return concatenated_df

In [30]:
os.listdir(data_dir)

['2013-citibike-tripdata',
 '2014-citibike-tripdata',
 '2015-citibike-tripdata',
 '2016-citibike-tripdata',
 '2017-citibike-tripdata',
 '2018-citibike-tripdata',
 '2019-citibike-tripdata',
 '2020-citibike-tripdata',
 '2021-citibike-tripdata',
 '2022-citibike-tripdata',
 '2023-citibike-tripdata',
 'checkpoint_nyc_bike2013-2014.parquet',
 'checkpoint_nyc_bike2013-2017.parquet',
 'nyc_bike2013-2013.parquet']

In [31]:
years_array = [
    int(year_dir[:4])
    for year_dir in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, year_dir))
]

In [32]:
years_array

[2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

In [37]:
for year in tqdm(years_array, desc="Year analyzed", colour="magenta"):
    concatenate_dfs(year, year)

Year analyzed:   0%|          | 0/11 [00:00<?, ?it/s]

Year analyzed:   0%|          | 0/1 [00:00<?, ?it/s]

# Load using geopandas

In [9]:
data_2023 = pd.read_parquet(os.path.join(data_dir, "nyc_bike_2023.parquet"))