First the imports

In [1]:
import os 
import shutil # to copy files across directories

import pandas as pd
import numpy as np
import csv
import calendar

# Exploration & Cleaning:

We want to have the data in an accessible form. This is done by reading the data into a dictionary of pandas DataFrames. Each element in the dictionary has a key (the name of the month the data was collected in) and a value (the CSV entries as a pandas DataFrame).

In [2]:
# returns a dictionary containing the data from the .csv files
def read_data(dir):
    """Load the monthly trip-data CSV files found in ``dir``.

    Only files named ``YYYYMM-divvy-tripdata.csv`` are read; anything else in
    the directory (Google Sheets exports, unrelated CSVs, ...) is skipped.
    The real file name is used to open each file, so the year prefix is not
    assumed to be 2023.

    Parameters
    ----------
    dir : str or path-like
        Directory containing the monthly CSV exports.

    Returns
    -------
    dict
        Maps the English month name (e.g. ``'January'``) to the DataFrame read
        from that month's file. Keys are inserted in calendar-month order.
        If several files share the same month number, the one whose name sorts
        last wins, because the key is the month name only.
    """
    import re  # local import: only needed by this function

    name_pattern = re.compile(r'^[0-9]{4}([0-9]{2})-divvy-tripdata\.csv$')

    # collect (month number, file name) pairs for every file that matches
    monthly_files = []
    for file_name in os.listdir(dir):
        match = name_pattern.match(file_name)
        if match is None:
            continue
        month_number = int(match.group(1))
        if 1 <= month_number <= 12:
            monthly_files.append((month_number, file_name))

    # sort by month so that the dictionary is filled in calendar order
    dfs = {}
    for month_number, file_name in sorted(monthly_files):
        month = calendar.month_name[month_number]
        dfs[month] = pd.read_csv(os.path.join(dir, file_name))

    return dfs

In [3]:
original_dir = "../../BikeShareData/OriginalData"
data = read_data(original_dir)

In [4]:
# no of files should be 11
no_of_files = len(data)
print(no_of_files)

11


In [51]:
month_names = data.keys()
print(month_names)

dict_keys(['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November'])


In [52]:
# the first and last 5 entries
#data["January"]
#data["February"]
#data["March"]
#data["April"]
data["May"]
#data["June"]
#data["July"]
#data["August"]
#data["September"]
#data["October"]
#data["November"]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,0D9FA920C3062031,electric_bike,2023-05-07 19:53:48,2023-05-07 19:58:32,Southport Ave & Belmont Ave,13229,,,41.939408,-87.663831,41.930000,-87.650000,member
1,92485E5FB5888ACD,electric_bike,2023-05-06 18:54:08,2023-05-06 19:03:35,Southport Ave & Belmont Ave,13229,,,41.939482,-87.663848,41.940000,-87.690000,member
2,FB144B3FC8300187,electric_bike,2023-05-21 00:40:21,2023-05-21 00:44:36,Halsted St & 21st St,13162,,,41.853793,-87.646719,41.860000,-87.650000,member
3,DDEB93BC2CE9AA77,classic_bike,2023-05-10 16:47:01,2023-05-10 16:59:52,Carpenter St & Huron St,13196,Damen Ave & Cortland St,13133,41.894556,-87.653449,41.915983,-87.677335,member
4,C07B70172FC92F59,classic_bike,2023-05-09 18:30:34,2023-05-09 18:39:28,Southport Ave & Clark St,TA1308000047,Southport Ave & Belmont Ave,13229,41.957081,-87.664199,41.939478,-87.663748,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
604822,48BDA26F34445546,electric_bike,2023-05-18 10:26:43,2023-05-18 10:48:00,Clark St & Elmdale Ave,KA1504000148,,,41.990876,-87.669721,42.000000,-87.660000,member
604823,573025E5EDE10DE1,electric_bike,2023-05-17 14:32:48,2023-05-17 14:45:37,State St & 33rd St,13216,,,41.834734,-87.625798,41.830000,-87.620000,member
604824,D88D48898C6FB63E,electric_bike,2023-05-17 07:59:29,2023-05-17 08:04:54,Columbus Dr & Randolph St,13263,,,41.884422,-87.619393,41.880000,-87.630000,member
604825,4692DCD2F87497F5,electric_bike,2023-05-18 08:34:48,2023-05-18 08:38:40,Public Rack - Karlov Ave & Lawrence Ave,1127.0,,,41.970000,-87.730000,41.970000,-87.740000,member


In [57]:
data["May"].dtypes

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

In [54]:
# the number of unique values in May
# unique values for specific columns
print("Column Name:          NUnique")
print(data['May'].nunique())
print(data['May'].rideable_type.unique())
print(data['May'].member_casual.unique())

Column Name:          NUnique
ride_id               604827
rideable_type              3
started_at            503683
ended_at              505259
start_station_name      1287
start_station_id        1250
end_station_name        1254
end_station_id          1210
start_lat             188591
start_lng             185410
end_lat                 4759
end_lng                 4762
member_casual              2
dtype: int64
['electric_bike' 'classic_bike' 'docked_bike']
['member' 'casual']


The count_entries method will do the following:
 * calculate number of entries in each file
 * calculate number of columns in each file
 * calculate the average and total number of entries across all files
 * write these outputs to a file
 * Option to remove duplicates

In [5]:
def count_entries(dataset, file_name, flag):
    """Write per-month row/column counts plus the total and average to a CSV.

    Parameters
    ----------
    dataset : dict
        Maps a month name to a pandas DataFrame.
    file_name : str or path-like
        CSV file to write; it is overwritten if it exists.
    flag : bool
        When truthy, duplicate rows are dropped from each DataFrame before it
        is counted. The DataFrames stored in ``dataset`` are not modified.

    The shape of each (possibly de-duplicated) DataFrame, the average and the
    total are also printed.
    """
    with open(file_name, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Month","No Of Entries","No Of Cols"])

        total = 0

        for month, df in dataset.items():
            if flag:
                df = df.drop_duplicates()

            entries = len(df)
            writer.writerow([month, entries, len(df.columns)])
            total += entries
            print(f"For {month}:{df.shape}")

        # average over the files actually present instead of a fixed 11;
        # an empty dataset yields 0 rather than a ZeroDivisionError
        average = total // len(dataset) if dataset else 0
        writer.writerow(["Total:", total])
        writer.writerow(["Average:", average])
        print(f"Average number of entries per file: {average}")
        print(f"Total number of entries across all files: {total}")


In [47]:
count_entries(data, "Original_BikeRides.csv", False)

For January:(190301, 13)
For February:(190445, 13)
For March:(258678, 13)
For April:(426590, 13)
For May:(604827, 13)
For June:(719618, 13)
For July:(767650, 13)
For August:(771693, 13)
For September:(666371, 13)
For October:(537113, 13)
For November:(362518, 13)
Average number of entries per file: 499618
Total number of entries across all files: 5495804


In [8]:
count_entries(data, "BikeRides_without_Duplicates.csv", True)

For January:(190301, 13)
For February:(190445, 13)
For March:(258678, 13)
For April:(426590, 13)
For May:(604827, 13)
For June:(719618, 13)
For July:(767650, 13)
For August:(771693, 13)
For September:(666371, 13)
For October:(537113, 13)
For November:(362518, 13)
Average number of entries per file: 499618
Total number of entries across all files: 5495804


From the above print out, it is clear that all the files have the same number of columns. So that is a good preliminary check on the consistency of the data across the files. And we can see that in total we have 5.5 Million entries, with an average of 500000 entries per month. All calculated numbers before and after removing duplicates are identical, so the original dataset did not have any duplicates.

Next we will look at the percentage of nulls across the columns:

In [6]:

def check_NAN(dataset, filename):
    """Write the percentage of missing values per column and month to a CSV.

    The first row of the file is a header made of ``"Month"`` followed by the
    column names of the first DataFrame in ``dataset``. Each following row
    holds the month name and one label per column:

    * ``"0"``     - the column has no missing values,
    * ``"< 1%"``  - some values are missing, but fewer than 1%,
    * ``"N%"``    - 1% or more are missing (percentage truncated to an integer).

    Each row is printed as well as written.

    Parameters
    ----------
    dataset : dict
        Maps a month name to a pandas DataFrame.
    filename : str or path-like
        CSV file to write; it is overwritten if it exists.
    """
    def label(x):
        # >= 1 so that exactly 1% is reported as "1%" and not as "< 1%"
        if x >= 1:
            return str(int(x)) + "%"
        return "< 1%" if x > 0 else str(0)

    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        header_written = False

        for month, df in dataset.items():
            if not header_written:
                # write the header for whichever month comes first, so the
                # file has one even when the dataset does not start in January
                column_names = df.columns.insert(0, "Month")
                writer.writerow(column_names)
                header_written = True

            percentage = df.isna().sum()*100/len(df)
            y = percentage.apply(label).values
            values = np.insert(y, 0, month)
            print(values)
            writer.writerow(values)
        

In [73]:
check_NAN(data, "NaN_Percentages_Orig.csv")

['January' '0' '0' '0' '0' '0']
['February' '0' '0' '0' '0' '0']
['March' '0' '0' '0' '0' '0']
['April' '0' '0' '0' '0' '0']
['May' '0' '0' '0' '0' '0']
['June' '0' '0' '0' '0' '0']
['July' '0' '0' '0' '0' '0']
['August' '0' '0' '0' '0' '0']
['September' '0' '0' '0' '0' '0']
['October' '0' '0' '0' '0' '0']
['November' '0' '0' '0' '0' '0']


Drop the columns with NaN values, and convert the `started_at` and `ended_at` columns to datetime.

In [7]:
# 1st phase of clean data with the ended_before_started column to be able to see and count the entries where this is True
def clean_data1(dataset):
    """First cleaning pass: drop unused columns and parse the timestamps.

    For every month a new DataFrame is built that
    * no longer contains the id, station and coordinate columns,
    * has ``started_at`` / ``ended_at`` converted to datetime, and
    * carries an extra boolean column ``ended_before_started`` which is True
      where ``started_at`` is later than ``ended_at``.

    The input DataFrames are left untouched.

    Parameters
    ----------
    dataset : dict
        Maps a month name to the raw pandas DataFrame.

    Returns
    -------
    dict
        Maps each month name to its cleaned DataFrame.
    """
    unused_columns = [
        'ride_id',
        'start_station_name', 'start_station_id',
        'end_station_name', 'end_station_id',
        'start_lat', 'start_lng',
        'end_lat', 'end_lng',
    ]

    def clean_month(raw):
        return raw.drop(columns=unused_columns).assign(
            started_at=pd.to_datetime(raw['started_at']),
            ended_at=pd.to_datetime(raw['ended_at']),
            # evaluated after the two conversions above, so it compares datetimes
            ended_before_started=lambda frame: frame['started_at'] > frame['ended_at'],
        )

    return {month: clean_month(raw) for month, raw in dataset.items()}

In [8]:
cleaned_data1 = clean_data1(data)

In [266]:
cleaned_data1['January'].info()
cleaned_data1['January'].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 190301 entries, 0 to 190300
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   rideable_type         190301 non-null  object        
 1   started_at            190301 non-null  datetime64[ns]
 2   ended_at              190301 non-null  datetime64[ns]
 3   member_casual         190301 non-null  object        
 4   ended_before_started  190301 non-null  bool          
dtypes: bool(1), datetime64[ns](2), object(2)
memory usage: 6.0+ MB


Unnamed: 0,rideable_type,started_at,ended_at,member_casual,ended_before_started
0,electric_bike,2023-01-21 20:05:42,2023-01-21 20:16:33,member,False
1,classic_bike,2023-01-10 15:37:36,2023-01-10 15:46:05,member,False
2,electric_bike,2023-01-02 07:51:57,2023-01-02 08:05:11,casual,False
3,classic_bike,2023-01-22 10:52:58,2023-01-22 11:01:44,member,False
4,classic_bike,2023-01-12 13:58:01,2023-01-12 14:13:20,member,False


In [267]:
df = cleaned_data1['May']
df.loc[df['ended_before_started'] == True]

Unnamed: 0,rideable_type,started_at,ended_at,member_casual,ended_before_started
8308,classic_bike,2023-05-29 17:34:21,2023-05-29 17:34:09,member,True
38552,electric_bike,2023-05-29 16:57:34,2023-05-29 16:57:27,casual,True
103546,electric_bike,2023-05-26 15:39:47,2023-05-26 15:38:17,member,True
103547,electric_bike,2023-05-26 15:38:53,2023-05-26 15:38:17,member,True
209340,classic_bike,2023-05-07 15:54:58,2023-05-07 15:54:47,casual,True
211708,classic_bike,2023-05-23 17:39:38,2023-05-23 17:39:35,casual,True
216859,classic_bike,2023-05-13 18:08:15,2023-05-13 18:08:09,member,True
336480,electric_bike,2023-05-29 11:31:41,2023-05-29 11:31:33,member,True
417351,classic_bike,2023-05-27 05:31:51,2023-05-27 05:31:37,member,True
456170,electric_bike,2023-05-30 07:40:55,2023-05-30 07:39:58,member,True


In [268]:
# calculates total number of entries where ended_before_started == True
# summing a boolean column counts its True values, one count per month
total = [month_df['ended_before_started'].sum() for month_df in cleaned_data1.values()]
print(total, sum(total))

[0, 1, 0, 4, 10, 7, 30, 60, 50, 36, 64] 262


In [21]:
# 2nd version of clean data that will drop the rows where the ride ended before it started (ended_at < started_at)
def clean_data2(dataset):
    """Second cleaning pass: remove the rides flagged by ``clean_data1``.

    Keeps only the rows whose ``ended_before_started`` value equals False and
    then drops that helper column.

    Parameters
    ----------
    dataset : dict
        Maps a month name to a DataFrame that has an
        ``ended_before_started`` column.

    Returns
    -------
    dict
        Maps each month name to the filtered DataFrame without the flag column.
    """
    flag_column = 'ended_before_started'
    return {
        month: df[df[flag_column].eq(False)].drop(columns=[flag_column])
        for month, df in dataset.items()
    }

In [22]:
cleaned_data = clean_data2(cleaned_data1)

In [23]:
# check no of columns and entries after removing
count_entries(cleaned_data, "BikeRides_Cleaned.csv", False)

For January:(190301, 4)
For February:(190444, 4)
For March:(258678, 4)
For April:(426586, 4)
For May:(604817, 4)
For June:(719611, 4)
For July:(767620, 4)
For August:(771633, 4)
For September:(666321, 4)
For October:(537077, 4)
For November:(362454, 4)
Average number of entries per file: 499594
Total number of entries across all files: 5495542


In [272]:
check_NAN(cleaned_data, "NaN_Percentages_Clean.csv")

['January' '0' '0' '0' '0']
['February' '0' '0' '0' '0']
['March' '0' '0' '0' '0']
['April' '0' '0' '0' '0']
['May' '0' '0' '0' '0']
['June' '0' '0' '0' '0']
['July' '0' '0' '0' '0']
['August' '0' '0' '0' '0']
['September' '0' '0' '0' '0']
['October' '0' '0' '0' '0']
['November' '0' '0' '0' '0']


# Preparation:

In [118]:
# converts seconds and days from deltatime into HH:MM:SS format
def format_time(seconds, days):
    """Render a duration given as ``days`` plus ``seconds`` as ``H:MM:SS``.

    Whole days are folded into the hour field, so the hours are not capped
    at 23 (e.g. 20 days and 6 hours become ``486:..``).
    """
    total_minutes, secs = divmod(seconds, 60)
    hours, mins = divmod(total_minutes, 60)
    return '%d:%02d:%02d' % (hours + days*24, mins, secs)

In [124]:
# Spot-check for May: longest ride and the most common start day.
# Both values are computed locally instead of being written into
# cleaned_data['May'], so this cell neither mutates the cleaned data nor
# depends on a 'day_of_week' column that has not been created yet.
may_rides = cleaned_data['May']
may_ride_length = may_rides['ended_at'] - may_rides['started_at']
val = may_ride_length.max()
print(val)
print(format_time(val.seconds, val.days))
print(may_rides['started_at'].dt.day_name().mode()[0])

20 days 06:50:31
486:50:31
Tuesday


In [119]:
def prepare_data(dataset):
    """Add the derived analysis columns to every monthly DataFrame.

    Two columns are added:

    * ``ride_length`` - ``ended_at - started_at`` as a timedelta,
    * ``day_of_week`` - the English day name of ``started_at``.

    A new dictionary holding new DataFrames is returned; neither ``dataset``
    nor the DataFrames inside it are modified, so re-running the cell always
    starts from the same cleaned data.

    Parameters
    ----------
    dataset : dict
        Maps a month name to a DataFrame with datetime ``started_at`` and
        ``ended_at`` columns.

    Returns
    -------
    dict
        Maps each month name to the DataFrame with the two extra columns.
    """
    return {
        month: df.assign(
            ride_length=df['ended_at'] - df['started_at'],
            day_of_week=df['started_at'].dt.day_name(),
        )
        for month, df in dataset.items()
    }

In [120]:
prepared_data = prepare_data(cleaned_data)

In [82]:
prepared_data['May'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 604817 entries, 0 to 604826
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype          
---  ------         --------------   -----          
 0   rideable_type  604817 non-null  object         
 1   started_at     604817 non-null  datetime64[ns] 
 2   ended_at       604817 non-null  datetime64[ns] 
 3   member_casual  604817 non-null  object         
 4   ride_length    604817 non-null  timedelta64[ns]
 5   day_of_week    604817 non-null  object         
dtypes: datetime64[ns](2), object(3), timedelta64[ns](1)
memory usage: 32.3+ MB


In [121]:
prepared_data['May']

Unnamed: 0,rideable_type,started_at,ended_at,member_casual,ride_length,day_of_week
0,electric_bike,2023-05-07 19:53:48,2023-05-07 19:58:32,member,0 days 00:04:44,Sunday
1,electric_bike,2023-05-06 18:54:08,2023-05-06 19:03:35,member,0 days 00:09:27,Saturday
2,electric_bike,2023-05-21 00:40:21,2023-05-21 00:44:36,member,0 days 00:04:15,Sunday
3,classic_bike,2023-05-10 16:47:01,2023-05-10 16:59:52,member,0 days 00:12:51,Wednesday
4,classic_bike,2023-05-09 18:30:34,2023-05-09 18:39:28,member,0 days 00:08:54,Tuesday
...,...,...,...,...,...,...
604822,electric_bike,2023-05-18 10:26:43,2023-05-18 10:48:00,member,0 days 00:21:17,Thursday
604823,electric_bike,2023-05-17 14:32:48,2023-05-17 14:45:37,member,0 days 00:12:49,Wednesday
604824,electric_bike,2023-05-17 07:59:29,2023-05-17 08:04:54,member,0 days 00:05:25,Wednesday
604825,electric_bike,2023-05-18 08:34:48,2023-05-18 08:38:40,member,0 days 00:03:52,Thursday


In [159]:
from statistics import mean

def statistics(dataset, file_name):
    """Write the mean/max ride length and the most common weekday per month.

    One CSV row is written (and printed as a list) for every month in
    ``dataset``; durations are rendered with ``format_time``.

    Parameters
    ----------
    dataset : dict
        Maps a month name to a DataFrame with a timedelta ``ride_length``
        column and a ``day_of_week`` column.
    file_name : str or path-like
        CSV file to write; it is overwritten if it exists.
    """
    with open(file_name, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Month","Mean Ride Length","Max Ride Length", "Day of Week Mode"])

        for month, df in dataset.items():
            mean_length = df['ride_length'].mean()
            max_length = df['ride_length'].max()
            most_common_day = df['day_of_week'].mode()[0]

            row = [
                month,
                format_time(mean_length.seconds, mean_length.days),
                format_time(max_length.seconds, max_length.days),
                most_common_day,
            ]
            writer.writerow(row)
            print(row)

        # TODO: append a summary row with the overall statistics across all months
        

In [160]:
statistics(prepared_data, "Mean_Max_Mode.csv")

['January', '0:13:00', '560:03:44', 'Tuesday']
['February', '0:13:31', '314:25:46', 'Tuesday']
['March', '0:13:04', '280:08:04', 'Wednesday']
['April', '0:17:12', '306:35:29', 'Saturday']
['May', '0:19:02', '486:50:31', 'Tuesday']
['June', '0:19:59', '491:05:58', 'Friday']
['July', '0:21:44', '857:41:24', 'Saturday']
['August', '0:22:25', '1641:29:04', 'Wednesday']
['September', '0:17:52', '25:07:46', 'Saturday']
['October', '0:15:41', '24:59:57', 'Tuesday']
['November', '0:13:49', '25:00:25', 'Thursday']


TypeError: unsupported operand type(s) for +: 'int' and 'Timedelta'