# combine_files
----

Written in the Python 3.7.9 Environment

By Nicole Lund 

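This Jupyter Notebook combines the CitiBike trip-data CSV files found in its working directory into a single CSV file, adding derived columns for ride duration in minutes, rider age, ride count, and a gender label.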

In [106]:
# Import dependencies
import pandas as pd
import os
from datetime import datetime

In [107]:
# Get list of files in the folder (the current working directory).
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# row order of the combined data reproducible between runs and machines.
file_list = sorted(os.listdir())

In [108]:
# Read every csv file in the folder and stack all of their rows into one
# DataFrame. The per-file frames are collected in a list and concatenated
# once at the end: DataFrame.append was deprecated in pandas 1.4 and removed
# in 2.0, and appending inside the loop re-copies the accumulated rows on
# every pass.
OUTPUT_FILE = "January_citibike_trip_data.csv"  # name written by the export cell

csv_frames = []
for file in file_list:
    if file == OUTPUT_FILE:
        # The export from a previous run sits in this same folder; reading it
        # back in would mix already-processed rows into the raw trip data.
        print(file + " skipped (output of a previous run)")
    elif file[-3:] == "csv":
        csv_frames.append(pd.read_csv(file))
    else:
        print(file + " not a csv")

# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder holds no csv files. Row labels from each file are kept as read.
combined_df = pd.concat(csv_frames) if csv_frames else pd.DataFrame()

citibike small data set.twbx not a csv
combine_files.ipynb not a csv
January_citibike_trip_data.csvold not a csv


In [109]:
# Inspect the combined column names and preview the first three rows
print(combined_df.columns)
combined_df.head(3)

Index(['tripduration', 'starttime', 'stoptime', 'start station id',
       'start station name', 'start station latitude',
       'start station longitude', 'end station id', 'end station name',
       'end station latitude', 'end station longitude', 'bikeid', 'usertype',
       'birth year', 'gender'],
      dtype='object')


Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,970,2018-01-01 13:50:57.4340,2018-01-01 14:07:08.1860,72.0,W 52 St & 11 Ave,40.767272,-73.993929,505.0,6 Ave & W 33 St,40.749013,-73.988484,31956,Subscriber,1992,1
1,723,2018-01-01 15:33:30.1820,2018-01-01 15:45:33.3410,72.0,W 52 St & 11 Ave,40.767272,-73.993929,3255.0,8 Ave & W 31 St,40.750585,-73.994685,32536,Subscriber,1969,1
2,496,2018-01-01 15:39:18.3370,2018-01-01 15:47:35.1720,72.0,W 52 St & 11 Ave,40.767272,-73.993929,525.0,W 34 St & 11 Ave,40.755942,-74.002116,16069,Subscriber,1956,1


In [110]:
# Rename Columns
# Map the raw trip column names to their report-friendly labels. The duration
# values are still in seconds at this point; the next cell converts them.
column_labels = {
    "tripduration": "Ride Duration (minutes)",
    "starttime": "Ride Start Date",
    "stoptime": "Ride End Date",
}
combined_df = combined_df.rename(columns=column_labels)

In [111]:
# Convert Ride Duration from seconds to minutes
# NOTE: the column is overwritten in place, so run this cell only once per
# load; running it again would divide the values by 60 a second time.
SECONDS_PER_MINUTE = 60
duration_col = 'Ride Duration (minutes)'
combined_df[duration_col] = combined_df[duration_col].div(SECONDS_PER_MINUTE)

In [112]:
# Preview the first rows to check the renamed columns and the duration in minutes
combined_df.head()

Unnamed: 0,Ride Duration (minutes),Ride Start Date,Ride End Date,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,16.166667,2018-01-01 13:50:57.4340,2018-01-01 14:07:08.1860,72.0,W 52 St & 11 Ave,40.767272,-73.993929,505.0,6 Ave & W 33 St,40.749013,-73.988484,31956,Subscriber,1992,1
1,12.05,2018-01-01 15:33:30.1820,2018-01-01 15:45:33.3410,72.0,W 52 St & 11 Ave,40.767272,-73.993929,3255.0,8 Ave & W 31 St,40.750585,-73.994685,32536,Subscriber,1969,1
2,8.266667,2018-01-01 15:39:18.3370,2018-01-01 15:47:35.1720,72.0,W 52 St & 11 Ave,40.767272,-73.993929,525.0,W 34 St & 11 Ave,40.755942,-74.002116,16069,Subscriber,1956,1
3,5.1,2018-01-01 15:40:13.3720,2018-01-01 15:45:20.1910,72.0,W 52 St & 11 Ave,40.767272,-73.993929,447.0,8 Ave & W 52 St,40.763707,-73.985162,31781,Subscriber,1974,1
4,5.1,2018-01-01 18:14:51.5680,2018-01-01 18:19:57.6420,72.0,W 52 St & 11 Ave,40.767272,-73.993929,3356.0,Amsterdam Ave & W 66 St,40.774667,-73.984706,30319,Subscriber,1992,1


In [113]:
# Calculate and add Age (ride year minus birth year) to DataFrame
# The whole column is parsed in one vectorized pd.to_datetime call instead of
# a Python-level strptime loop over every row. The explicit format keeps the
# parse strict: a timestamp that does not match still raises.
birth_year = combined_df["birth year"].to_numpy()
ride_year = pd.to_datetime(
    combined_df["Ride Start Date"], format='%Y-%m-%d %H:%M:%S.%f'
).dt.year.to_numpy()
# Plain arrays keep the assignment positional: the combined frame retains each
# source file's own row labels, so its index contains duplicate values.
age = ride_year - birth_year
combined_df['Age'] = age

In [114]:
# Calculate and add Ride Count to DataFrame
# Each row is a single trip record, so the count is the constant 1. Assigning
# the scalar directly removes the dependency on the `age` variable from the
# Age cell and keeps the count at 1 (not NaN) if an age is ever missing.
combined_df['Ride Count'] = 1

In [115]:
# Calculate and add Gender to DataFrame
# Translate the numeric gender codes to labels with a vectorized lookup
# instead of a row-by-row loop. Codes without an entry in the table
# (including missing values) come back as NaN from map() and are labelled
# "Undefined", matching the fall-through branch of the original if/elif chain.
GENDER_LABELS = {0: "z_Unknown", 1: "Men", 2: "Women"}
gender = combined_df["gender"]
# to_numpy() keeps the assignment positional; the combined index is not unique.
gender_alias = gender.map(GENDER_LABELS).fillna("Undefined").to_numpy()
combined_df['Gender'] = gender_alias
# The coded column is replaced by the labelled one. Re-running this cell after
# the drop raises KeyError because 'gender' no longer exists.
combined_df = combined_df.drop(columns=['gender'])

In [119]:
# Display the frame to verify the new Age, Ride Count and Gender columns
combined_df

Unnamed: 0,Ride Duration (minutes),Ride Start Date,Ride End Date,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,Age,Ride Count,Gender
0,16.166667,2018-01-01 13:50:57.4340,2018-01-01 14:07:08.1860,72.0,W 52 St & 11 Ave,40.767272,-73.993929,505.0,6 Ave & W 33 St,40.749013,-73.988484,31956,Subscriber,1992,26,1,Men
1,12.050000,2018-01-01 15:33:30.1820,2018-01-01 15:45:33.3410,72.0,W 52 St & 11 Ave,40.767272,-73.993929,3255.0,8 Ave & W 31 St,40.750585,-73.994685,32536,Subscriber,1969,49,1,Men
2,8.266667,2018-01-01 15:39:18.3370,2018-01-01 15:47:35.1720,72.0,W 52 St & 11 Ave,40.767272,-73.993929,525.0,W 34 St & 11 Ave,40.755942,-74.002116,16069,Subscriber,1956,62,1,Men
3,5.100000,2018-01-01 15:40:13.3720,2018-01-01 15:45:20.1910,72.0,W 52 St & 11 Ave,40.767272,-73.993929,447.0,8 Ave & W 52 St,40.763707,-73.985162,31781,Subscriber,1974,44,1,Men
4,5.100000,2018-01-01 18:14:51.5680,2018-01-01 18:19:57.6420,72.0,W 52 St & 11 Ave,40.767272,-73.993929,3356.0,Amsterdam Ave & W 66 St,40.774667,-73.984706,30319,Subscriber,1992,26,1,Men
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
967282,4.850000,2019-01-31 17:33:26.4490,2019-01-31 17:38:17.8940,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3423.0,West Drive & Prospect Park West,40.661063,-73.979453,19780,Subscriber,1978,41,1,Women
967283,7.283333,2019-01-31 18:57:45.3380,2019-01-31 19:05:02.4970,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3313.0,6 Ave & 12 St,40.666318,-73.985462,34365,Subscriber,1976,43,1,Women
967284,2.883333,2019-01-31 19:11:41.1930,2019-01-31 19:14:34.3350,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3418.0,Plaza St West & Flatbush Ave,40.675021,-73.971115,25889,Subscriber,1977,42,1,Men
967285,4.066667,2019-01-31 20:54:51.1440,2019-01-31 20:58:55.3100,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3346.0,Berkeley Pl & 7 Ave,40.675147,-73.975232,34649,Subscriber,1994,25,1,Men


In [117]:
# Summarize column dtypes and non-null counts for the combined data
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1686281 entries, 0 to 967286
Data columns (total 17 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   Ride Duration (minutes)  1686281 non-null  float64
 1   Ride Start Date          1686281 non-null  object 
 2   Ride End Date            1686281 non-null  object 
 3   start station id         1686263 non-null  float64
 4   start station name       1686263 non-null  object 
 5   start station latitude   1686281 non-null  float64
 6   start station longitude  1686281 non-null  float64
 7   end station id           1686263 non-null  float64
 8   end station name         1686263 non-null  object 
 9   end station latitude     1686281 non-null  float64
 10  end station longitude    1686281 non-null  float64
 11  bikeid                   1686281 non-null  int64  
 12  usertype                 1686281 non-null  object 
 13  birth year               1686281 non-null  

In [121]:
# Export to csv (do not include index)
# NOTE: the relative path places the file in the working directory, which is
# the same folder the input csv files are listed from.
output_path = "January_citibike_trip_data.csv"
combined_df.to_csv(output_path, index=False)