First the imports

In [68]:
import os 
import shutil # to copy files across directories

import pandas as pd
import numpy as np
import csv
import calendar

If there is no CleanedData directory, then create one

In [2]:
# creates a directory CleanedData if it does not exist 
def create_cleaning_directory(clean_dir):
    """Ensure the directory ``clean_dir`` exists, reporting what was done.

    Args:
        clean_dir: Path of the directory to create. Missing parent
            directories are created as well (``os.makedirs``).
    """
    # Guard clause: nothing to do when the path is already present.
    if os.path.exists(clean_dir):
        print(f"Directory '{clean_dir}' already exists.")
        return

    os.makedirs(clean_dir)
    print(f"Directory '{clean_dir}' created successfully.")

In [None]:
# Location for cleaned output, relative to the notebook's working directory.
cleaning_dir = "../CleanedData"
# Create the directory if it is missing; prints which case applied.
create_cleaning_directory(cleaning_dir)

Next, we want to have the data in an accessible form. This is done by reading the data into a dictionary of pandas DataFrames. Each element in the dictionary has a key (the name of the month the data was collected in) and a value (the CSV entries for that month as a pandas DataFrame).

In [3]:
# returns a dictionary containing the data from the .csv files
def read_data(dir):
    """Load the monthly trip CSV files found in ``dir`` into DataFrames.

    File names are expected to start with ``YYYYMM`` (for example
    ``202305-divvy-tripdata.csv``); the month number is taken from
    characters 4-5 of the name. The file that was actually found on disk
    is the one that is read, so the year and the rest of the name are not
    assumed.

    Args:
        dir: Directory containing the ``.csv`` files. (The parameter name
            shadows the ``dir`` builtin; it is kept so existing callers
            continue to work.)

    Returns:
        dict mapping the English month name (``calendar.month_name``) to
        the DataFrame read from that month's file, inserted in calendar
        order. If several files share a month number, the one whose name
        sorts last wins.
    """
    monthly_files = []
    for name in os.listdir(dir):
        # Only .csv files are data; the directory also holds other exports.
        if not name.endswith('.csv'):
            continue

        month_digits = name[4:6]
        # Skip CSV files that do not carry a valid month at positions 4-5
        # instead of crashing on int() or on the month-name lookup.
        if not (month_digits.isdecimal() and 1 <= int(month_digits) <= 12):
            continue

        monthly_files.append((int(month_digits), name))

    dfs = {}
    # Sorting the (month, name) pairs stores the frames in month order.
    for month_number, name in sorted(monthly_files):
        month = calendar.month_name[month_number]
        dfs[month] = pd.read_csv(os.path.join(dir, name))

    return dfs

In [4]:
# Directory holding the raw monthly CSV exports, relative to the notebook.
original_dir = "../../BikeShareData/OriginalData"
# Month name -> DataFrame; this global is read by the cells below.
data = read_data(original_dir)

Here we can see the number of data files (a check that all eleven months were read into the dictionary) and the names of the months. We can also inspect the column summary (`info()`) of a given month, or uncomment one of the lines below to display that month's DataFrame:

In [30]:

# Sanity check: how many monthly frames were loaded, and under which keys.
no_of_files = len(data)
print(no_of_files)
month_names = data.keys()
print(month_names)
# Uncomment any line below to display that month's DataFrame instead.
#data["January"]
#data["February"]
#data["March"]
#data["April"]
# Column names, non-null counts and dtypes for May.
data["May"].info()
#data["June"]
#data["July"]
#data["August"]
#data["September"]
#data["October"]
#data["November"]

11
dict_keys(['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November'])
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 604827 entries, 0 to 604826
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ride_id             604827 non-null  object 
 1   rideable_type       604827 non-null  object 
 2   started_at          604827 non-null  object 
 3   ended_at            604827 non-null  object 
 4   start_station_name  515587 non-null  object 
 5   start_station_id    515587 non-null  object 
 6   end_station_name    509560 non-null  object 
 7   end_station_id      509560 non-null  object 
 8   start_lat           604827 non-null  float64
 9   start_lng           604827 non-null  float64
 10  end_lat             604117 non-null  float64
 11  end_lng             604117 non-null  float64
 12  member_casual       604827 non-null  object 
dtypes: float64(4), object(9)

The count_entries function will do the following:
 * optionally remove duplicate rows before counting (controlled by a flag)
 * calculate the number of entries in each file
 * calculate the number of columns in each file
 * calculate the average and total number of entries across all files
 * write these outputs to a file

In [6]:
def count_entries(file_name, flag, dfs=None):
    """Write per-month row and column counts to a CSV report and print them.

    The report has one row per month (month, number of entries, number of
    columns) followed by a ``Total:`` row and an ``Average:`` row. The
    average is the total divided by the number of months actually present
    (integer part), not by a fixed count.

    Args:
        file_name: Path of the CSV report to write (overwritten if present).
        flag: When true, exact duplicate rows are dropped from each month's
            frame before counting. The frames stored in ``dfs`` are not
            modified.
        dfs: Mapping of month name -> DataFrame to summarise. Defaults to
            the module-level ``data`` dictionary.
    """
    if dfs is None:
        dfs = data

    with open(file_name, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Month","No Of Entries","No Of Cols"])

        total = 0

        for month, df in dfs.items():
            if flag:
                # drop_duplicates returns a new frame; the original is kept.
                df = df.drop_duplicates()

            entries = len(df)
            writer.writerow([month, entries, len(df.columns)])
            total += entries
            print(f"For {month}:{df.shape}")

        # Divide by the real number of months; guard against an empty mapping.
        average = total // len(dfs) if dfs else 0
        writer.writerow(["Total:", total])
        writer.writerow(["Average:", average])
        print(f"Average number of entries per file: {average}")
        print(f"Total number of entries across all files: {total}")


In [7]:
# Count rows/columns of the raw data (duplicates kept) and write the report.
count_entries("Original_BikeRides.csv", False)

For January:(190301, 13)
For February:(190445, 13)
For March:(258678, 13)
For April:(426590, 13)
For May:(604827, 13)
For June:(719618, 13)
For July:(767650, 13)
For August:(771693, 13)
For September:(666371, 13)
For October:(537113, 13)
For November:(362518, 13)
Average number of entries per file: 499618
Total number of entries across all files: 5495804


In [8]:
# Same report, but with duplicate rows dropped from each month before counting.
count_entries("BikeRides_without_Duplicates.csv", True)

For January:(190301, 13)
For February:(190445, 13)
For March:(258678, 13)
For April:(426590, 13)
For May:(604827, 13)
For June:(719618, 13)
For July:(767650, 13)
For August:(771693, 13)
For September:(666371, 13)
For October:(537113, 13)
For November:(362518, 13)
Average number of entries per file: 499618
Total number of entries across all files: 5495804


From the printout above, it is clear that all the files have the same number of columns, which is a good preliminary check on the consistency of the data across the files. In total we have about 5.5 million entries, with an average of roughly 500,000 entries per month. All numbers calculated before and after removing duplicates are identical, so none of the original monthly files contains duplicate rows.

Next we will look at the percentage of nulls across the columns:

In [102]:

def _format_nan_percentage(pct):
    """Return the report label for one column's missing-value percentage."""
    if pct >= 1:
        # Whole percent, truncated (e.g. 14.7 -> "14%").
        return str(int(pct)) + "%"
    if pct > 0:
        return "< 1%"
    return 0


def check_NAN(file_name="NaN_Percentages.csv", dfs=None):
    """Write and print the percentage of missing values per column and month.

    The CSV gets a header row (``Month`` followed by the column names of
    the first frame) and then one row per month. Each cell is ``0`` when
    the column has no missing values, ``"< 1%"`` when fewer than 1% are
    missing, and the truncated whole percentage (e.g. ``"14%"``) otherwise.

    Args:
        file_name: Path of the CSV report to write (overwritten if present).
        dfs: Mapping of month name -> DataFrame to analyse. Defaults to
            the module-level ``data`` dictionary.
    """
    if dfs is None:
        dfs = data

    with open(file_name, 'w', newline='') as file:
        writer = csv.writer(file)
        header_written = False

        for month, df in dfs.items():
            # Write the header for the first frame, whatever its key is.
            if not header_written:
                writer.writerow(["Month", *df.columns])
                header_written = True

            percentage = df.isna().sum()*100/len(df)
            labels = [_format_nan_percentage(pct) for pct in percentage]

            # An object array keeps the mix of strings and ints intact even
            # when every label is 0, and prints like the previous output.
            values = np.array([month, *labels], dtype=object)
            print(values)
            writer.writerow(values)
        

In [103]:
# Print the per-month missing-value percentages and save them to NaN_Percentages.csv.
check_NAN()

['January' 0 0 0 0 '14%' '14%' '14%' '14%' 0 0 '< 1%' '< 1%' 0]
['February' 0 0 0 0 '13%' '13%' '14%' '14%' 0 0 '< 1%' '< 1%' 0]
['March' 0 0 0 0 '13%' '13%' '14%' '14%' 0 0 '< 1%' '< 1%' 0]
['April' 0 0 0 0 '14%' '14%' '16%' '16%' 0 0 '< 1%' '< 1%' 0]
['May' 0 0 0 0 '14%' '14%' '15%' '15%' 0 0 '< 1%' '< 1%' 0]
['June' 0 0 0 0 '16%' '16%' '17%' '17%' 0 0 '< 1%' '< 1%' 0]
['July' 0 0 0 0 '16%' '16%' '16%' '16%' 0 0 '< 1%' '< 1%' 0]
['August' 0 0 0 0 '15%' '15%' '16%' '16%' 0 0 '< 1%' '< 1%' 0]
['September' 0 0 0 0 '15%' '15%' '16%' '16%' 0 0 '< 1%' '< 1%' 0]
['October' 0 0 0 0 '15%' '15%' '16%' '16%' 0 0 '< 1%' '< 1%' 0]
['November' 0 0 0 0 '15%' '15%' '15%' '15%' 0 0 '< 1%' '< 1%' 0]


How many unique values does each column contain? (shown here for January)

In [106]:
# Number of distinct values in each column of the January data.
data['January'].nunique()

ride_id               190301
rideable_type              3
started_at            178872
ended_at              179025
start_station_name       964
start_station_id         944
end_station_name         962
end_station_id           942
start_lat              63345
start_lng              63144
end_lat                  852
end_lng                  839
member_casual              2
dtype: int64