In [1]:
import pandas as pd
import  os
from datetime import datetime
import sqlite3
from dateutil import parser

# Running record of the cleaning pipeline. Counters start at zero and the
# remaining placeholders stay None until a later cell fills them in.
cleaning_log = dict(
    initial_row_count=None,
    rows_removed_negative_duration=0,
    rows_removed_over_24h=0,
    final_row_count=None,
    pct_removed=None,
    output_files_saved=[],  # paths of every file written by this notebook
)

In [2]:
# Folder holding the raw monthly CSV exports.
raw_data_path = '../data/raw/'

# os.listdir returns entries in arbitrary order, so sort the paths to keep the
# row order of the combined frame reproducible across machines and re-runs.
all_files = sorted(
    os.path.join(raw_data_path, f)
    for f in os.listdir(raw_data_path)
    if f.endswith('.csv')
)
if not all_files:
    # Fail early with a clear message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No CSV files found in {raw_data_path!r}")
print("Reading and combining all CSV files...")

# Combine all CSVs into one frame with a fresh 0..n-1 index
df_list = [pd.read_csv(file) for file in all_files]
combined_df = pd.concat(df_list, ignore_index=True)

# Log initial row count
cleaning_log['initial_row_count'] = combined_df.shape[0]
print(f"Successfully combined {len(all_files)} CSV files into DataFrame with {cleaning_log['initial_row_count']:,} rows.")


def safe_parse(x):
    """Parse one value into a datetime, returning ``pd.NaT`` if parsing fails.

    The value is converted with ``str()`` and passed to ``dateutil``'s
    ``parser.parse``. Parsing errors are deliberately swallowed so that a
    single malformed cell does not abort an element-wise ``apply``.

    Parameters
    ----------
    x : object
        The raw value to parse (typically a timestamp string).

    Returns
    -------
    datetime.datetime or pandas.NaT
        The parsed timestamp, or ``pd.NaT`` when the value cannot be parsed.
    """
    try:
        return parser.parse(str(x))
    except Exception:
        # Catch Exception rather than using a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit still propagate; otherwise interrupting
        # a long-running apply would be silently turned into NaT values.
        return pd.NaT


Reading and combining all CSV files...
Successfully combined 12 CSV files into DataFrame with 5,860,568 rows.


In [3]:
# Persist the untouched combined data so later stages can restart from here
# without re-reading every CSV.
checkpoint_path = '../data/clean/01_combined_raw.parquet'
# Create the output folder if it doesn't exist (e.g. on a fresh checkout);
# to_parquet does not create missing parent directories.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
combined_df.to_parquet(checkpoint_path, index=False, engine='pyarrow')
cleaning_log['output_files_saved'].append(checkpoint_path)
print(f'Raw checkpoint saved to {checkpoint_path}.')

Raw checkpoint saved to ../data/clean/01_combined_raw.parquet.


In [4]:
# Work on a copy so the raw combined frame is left untouched.
df_clean = combined_df.copy()

# Convert both timestamp columns element by element; unparseable values become NaT.
for col in ('started_at', 'ended_at'):
    df_clean[col] = df_clean[col].apply(safe_parse)

# Report how many values failed to parse in each column
start_missing = df_clean['started_at'].isna()
end_missing = df_clean['ended_at'].isna()
print("Total rows:", len(df_clean))
print("Started_at NaT:", start_missing.sum())
print("Ended_at NaT:", end_missing.sum())

# If any row has a missing start or end timestamp, show a small sample
mask_bad = start_missing | end_missing
bad_row_count = mask_bad.sum()
if bad_row_count > 0:
    print(f"⚠️ Bad rows detected after parsing: {bad_row_count:,}")
    print(df_clean.loc[mask_bad, ['started_at', 'ended_at']].head(10))

Total rows: 5860568
Started_at NaT: 0
Ended_at NaT: 0


In [5]:
# Derive duration and calendar features from the parsed timestamps.
started = df_clean['started_at']

# Ride duration in minutes (end minus start, seconds / 60).
trip_duration = df_clean['ended_at'] - started
df_clean['ride_length_min'] = trip_duration.dt.total_seconds() / 60

# Numeric weekday plus its name.
# NOTE(review): the column name 'days_of _week_num' contains a space, which
# looks like a typo; it is kept as-is because saved outputs and downstream
# consumers may already depend on this exact name.
weekday_idx = started.dt.day_of_week
df_clean['days_of _week_num'] = weekday_idx
df_clean['day_of_week'] = started.dt.day_name()

# Month as number and as name.
df_clean['month_num'] = started.dt.month
df_clean['month'] = started.dt.month_name()

# Hour of day the ride started, and a weekend flag (weekday index 5 or 6).
df_clean['hour'] = started.dt.hour
df_clean['is_weekend'] = weekday_idx.isin([5, 6])

In [6]:
# Row count before any filtering, used as the denominator for the removal percentage.
initial_rows = len(df_clean)

# Pass 1: drop rides whose duration is zero or negative.
neg_mask = df_clean['ride_length_min'].le(0)
cleaning_log['rows_removed_negative_duration'] = neg_mask.sum()
df_clean = df_clean.loc[~neg_mask]

# Pass 2: drop rides longer than 1440 minutes (24 hours).
long_mask = df_clean['ride_length_min'].gt(1440)
cleaning_log['rows_removed_over_24h'] = long_mask.sum()
df_clean = df_clean.loc[~long_mask]

# Record what is left and the share of rows that was removed.
final_rows = len(df_clean)
cleaning_log['final_row_count'] = final_rows
cleaning_log['pct_removed'] = ((initial_rows - final_rows) / initial_rows) * 100

print(f"Data Cleaning Complete. {cleaning_log['pct_removed']:.2f}% of data removed.")

Data Cleaning Complete. 0.14% of data removed.


In [7]:
# Write the cleaned trips to the master parquet file and record the path in the log.
clean_data_path = '../data/clean/trips_clean.parquet'
df_clean.to_parquet(clean_data_path, engine='pyarrow', index=False)

saved_files = cleaning_log['output_files_saved']
saved_files.append(clean_data_path)
print(f"Master cleaned data saved to {clean_data_path}.")

Master cleaned data saved to ../data/clean/trips_clean.parquet.


In [8]:

# Keep a timestamped copy of the cleaned data alongside the master file.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_path = f'../data/clean/backups/trips_clean_{timestamp}.parquet'
os.makedirs(os.path.dirname(backup_path), exist_ok=True) # Creates the /backups folder if it doesn't exist
# Pin the engine explicitly so this write matches the other parquet writes in
# this notebook instead of depending on pandas' engine auto-detection.
df_clean.to_parquet(backup_path, index=False, engine='pyarrow')
cleaning_log['output_files_saved'].append(backup_path)
print(f"Timestamped backup saved: {backup_path}")

Timestamped backup saved: ../data/clean/backups/trips_clean_20250906_143910.parquet


In [9]:
# Render the cleaning statistics as a small markdown report.
# NOTE(review): the "Negative Duration" figure is whatever was stored under
# 'rows_removed_negative_duration'; the filter in this notebook uses `<= 0`,
# so zero-length rides are counted there as well.
log_content = f"""
# Data Cleaning Log
- Date & Time: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
- Initial Number of Rides: {cleaning_log['initial_row_count']:,}
- Rides Removed (Negative Duration): {cleaning_log['rows_removed_negative_duration']:,}
- Rides Removed (>24 hours): {cleaning_log['rows_removed_over_24h']:,}
- Final Number of Rides: {cleaning_log['final_row_count']:,}
- Percentage of Data Removed: {cleaning_log['pct_removed']:.2f}%
- **Output Files Created:**
"""
# Append one bullet per output file, built in a single join.
log_content += "".join(
    f"  - `{file}`\n" for file in cleaning_log['output_files_saved']
)

log_file_path = './data_cleaning_log.md'
# Write with an explicit encoding so the file does not depend on the
# platform's default text encoding.
with open(log_file_path, 'w', encoding='utf-8') as f:
    f.write(log_content)
print(f"Cleaning log saved to: {log_file_path}")

print(log_content)

Cleaning log saved to: ./data_cleaning_log.md

# Data Cleaning Log
- Date & Time: 2025-09-06 14:39:18
- Initial Number of Rides: 5,860,568
- Rides Removed (Negative Duration): 723
- Rides Removed (>24 hours): 7,596
- Final Number of Rides: 5,852,249
- Percentage of Data Removed: 0.14%
- **Output Files Created:**
  - `../data/clean/01_combined_raw.parquet`
  - `../data/clean/trips_clean.parquet`
  - `../data/clean/backups/trips_clean_20250906_143910.parquet`



In [10]:

# Load the cleaned trips into a SQLite database, replacing any existing table.
db_path = '../data/clean/cyclistic.db'
conn = sqlite3.connect(db_path)
print(f"Database connected: {db_path}")

try:
    df_clean.to_sql('cyclistic_trips', conn, if_exists='replace', index=False)
    print("Data successfully loaded into SQL table 'cyclistic_trips'.")
finally:
    # Always release the connection, even if the load fails part-way, so the
    # database file is not left open by the kernel.
    conn.close()
    print("Database connection closed.")

Database connected: ../data/clean/cyclistic.db
Data successfully loaded into SQL table 'cyclistic_trips'.
Database connection closed.


In [11]:
# Display the cleaned trips frame, including the derived duration and calendar columns.
df_clean

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,ride_length_min,days_of _week_num,day_of_week,month_num,month,hour,is_weekend
0,C1D650626C8C899A,electric_bike,2024-01-12 15:30:27.000,2024-01-12 15:37:59.000,Wells St & Elm St,KA1504000135,Kingsbury St & Kinzie St,KA1503000043,41.903267,-87.634737,41.889177,-87.638506,member,7.533333,4,Friday,1,January,15,False
1,EECD38BDB25BFCB0,electric_bike,2024-01-08 15:45:46.000,2024-01-08 15:52:59.000,Wells St & Elm St,KA1504000135,Kingsbury St & Kinzie St,KA1503000043,41.902937,-87.634440,41.889177,-87.638506,member,7.216667,0,Monday,1,January,15,False
2,F4A9CE78061F17F7,electric_bike,2024-01-27 12:27:19.000,2024-01-27 12:35:19.000,Wells St & Elm St,KA1504000135,Kingsbury St & Kinzie St,KA1503000043,41.902951,-87.634470,41.889177,-87.638506,member,8.000000,5,Saturday,1,January,12,True
3,0A0D9E15EE50B171,classic_bike,2024-01-29 16:26:17.000,2024-01-29 16:56:06.000,Wells St & Randolph St,TA1305000030,Larrabee St & Webster Ave,13193,41.884295,-87.633963,41.921822,-87.644140,member,29.816667,0,Monday,1,January,16,False
4,33FFC9805E3EFF9A,classic_bike,2024-01-31 05:43:23.000,2024-01-31 06:09:35.000,Lincoln Ave & Waveland Ave,13253,Kingsbury St & Kinzie St,KA1503000043,41.948797,-87.675278,41.889177,-87.638506,member,26.200000,2,Wednesday,1,January,5,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5860563,BD56BA20F42E4794,electric_bike,2024-12-11 08:23:46.564,2024-12-11 08:37:34.532,Clybourn Ave & Division St,TA1307000115,,,41.904634,-87.640518,41.880000,-87.630000,member,13.799467,2,Wednesday,12,December,8,False
5860564,3074643A6B60B300,electric_bike,2024-12-09 12:26:15.677,2024-12-09 12:37:32.712,Canal St & Jackson Blvd,13138,,,41.878125,-87.639968,41.900000,-87.620000,member,11.283917,0,Monday,12,December,12,False
5860565,15602635C5DF484E,electric_bike,2024-12-31 17:10:03.113,2024-12-31 17:17:21.838,Albany Ave & Bloomingdale Ave,15655,California Ave & Milwaukee Ave,13084,41.914027,-87.705126,41.922695,-87.697153,member,7.312083,1,Tuesday,12,December,17,False
5860566,F15ABBA961560B75,electric_bike,2024-12-01 14:39:47.216,2024-12-01 14:45:21.268,Albany Ave & Bloomingdale Ave,15655,California Ave & Milwaukee Ave,13084,41.914003,-87.705099,41.922695,-87.697153,member,5.567533,6,Sunday,12,December,14,True


In [12]:
# Display the raw combined frame for comparison with the cleaned one.
combined_df

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,C1D650626C8C899A,electric_bike,2024-01-12 15:30:27,2024-01-12 15:37:59,Wells St & Elm St,KA1504000135,Kingsbury St & Kinzie St,KA1503000043,41.903267,-87.634737,41.889177,-87.638506,member
1,EECD38BDB25BFCB0,electric_bike,2024-01-08 15:45:46,2024-01-08 15:52:59,Wells St & Elm St,KA1504000135,Kingsbury St & Kinzie St,KA1503000043,41.902937,-87.634440,41.889177,-87.638506,member
2,F4A9CE78061F17F7,electric_bike,2024-01-27 12:27:19,2024-01-27 12:35:19,Wells St & Elm St,KA1504000135,Kingsbury St & Kinzie St,KA1503000043,41.902951,-87.634470,41.889177,-87.638506,member
3,0A0D9E15EE50B171,classic_bike,2024-01-29 16:26:17,2024-01-29 16:56:06,Wells St & Randolph St,TA1305000030,Larrabee St & Webster Ave,13193,41.884295,-87.633963,41.921822,-87.644140,member
4,33FFC9805E3EFF9A,classic_bike,2024-01-31 05:43:23,2024-01-31 06:09:35,Lincoln Ave & Waveland Ave,13253,Kingsbury St & Kinzie St,KA1503000043,41.948797,-87.675278,41.889177,-87.638506,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5860563,BD56BA20F42E4794,electric_bike,2024-12-11 08:23:46.564,2024-12-11 08:37:34.532,Clybourn Ave & Division St,TA1307000115,,,41.904634,-87.640518,41.880000,-87.630000,member
5860564,3074643A6B60B300,electric_bike,2024-12-09 12:26:15.677,2024-12-09 12:37:32.712,Canal St & Jackson Blvd,13138,,,41.878125,-87.639968,41.900000,-87.620000,member
5860565,15602635C5DF484E,electric_bike,2024-12-31 17:10:03.113,2024-12-31 17:17:21.838,Albany Ave & Bloomingdale Ave,15655,California Ave & Milwaukee Ave,13084,41.914027,-87.705126,41.922695,-87.697153,member
5860566,F15ABBA961560B75,electric_bike,2024-12-01 14:39:47.216,2024-12-01 14:45:21.268,Albany Ave & Bloomingdale Ave,15655,California Ave & Milwaukee Ave,13084,41.914003,-87.705099,41.922695,-87.697153,member


In [13]:
# Number of raw rows with no start station name, shown against the total row count.
n_missing_start_station = combined_df['start_station_name'].isna().sum()
print(n_missing_start_station, '/', len(combined_df))

1073951 / 5860568


In [15]:
# List the column names of the cleaned frame.
df_clean.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual', 'ride_length_min', 'days_of _week_num', 'day_of_week',
       'month_num', 'month', 'hour', 'is_weekend'],
      dtype='object')