In [8]:
import pandas as pd
import glob
import os

# Directory holding the raw trip CSV exports (relative to the working directory).
data_path = "data/raw"

# Step 1: Load all CSV files with low_memory=False
# glob returns paths in arbitrary order; sorting keeps the row order of the
# concatenated frame reproducible from run to run.
all_files = sorted(glob.glob(os.path.join(data_path, "*.csv")))
if not all_files:
    # Without this guard pd.concat([]) fails with an opaque
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No CSV files found in {data_path!r}")
df_list = [pd.read_csv(file, low_memory=False) for file in all_files]
df = pd.concat(df_list, ignore_index=True)

# Step 2: Drop irrelevant columns
# errors='ignore' tolerates inputs in which these columns are absent.
df = df.drop(columns=['Unnamed: 0', 'rideable_type_duplicate_column_name_1'], errors='ignore')

# Step 3: Clean and preprocess
# Convert timestamps to datetime; format='mixed' infers the format per value,
# so rows with differing timestamp layouts can be parsed together.
df['started_at'] = pd.to_datetime(df['started_at'], format='mixed')
df['ended_at'] = pd.to_datetime(df['ended_at'], format='mixed')

# Drop rows with missing start_station_name (they cannot be attributed to a station)
df = df.dropna(subset=['start_station_name'])

# Step 4: Aggregate by start station and hour
# floor() truncates each timestamp down to the start of its hour (it does not
# round to the nearest hour). The lowercase 'h' alias is used because the
# uppercase 'H' alias is deprecated in recent pandas releases.
df['start_hour'] = df['started_at'].dt.floor('h')
trips_per_station_hour = df.groupby(['start_station_name', 'start_hour']).size().reset_index(name='trip_count')

# Step 5: Select top 3 stations by total trips
total_trips_per_station = df.groupby('start_station_name').size().reset_index(name='total_trips')
# nlargest keeps the first occurrence on ties, so the selection is deterministic
# (sort_values with the default sort algorithm gives no such guarantee).
top_3_stations = total_trips_per_station.nlargest(3, 'total_trips')['start_station_name'].tolist()
print("Top 3 stations:", top_3_stations)

# Step 6: Filter data for top 3 stations and create a new DataFrame
# .copy() makes the dtype conversions below act on an independent frame
# instead of a slice of trips_per_station_hour.
trips_top_3 = trips_per_station_hour[trips_per_station_hour['start_station_name'].isin(top_3_stations)].copy()

# Step 7: Ensure data types are appropriate for Hopsworks
trips_top_3['start_station_name'] = trips_top_3['start_station_name'].astype(str)
trips_top_3['start_hour'] = trips_top_3['start_hour'].astype(str)  # Convert to string for Hopsworks compatibility
trips_top_3['trip_count'] = trips_top_3['trip_count'].astype(int)

# Step 8: Save the processed data locally
trips_top_3.to_csv('processed_trips_top_3.csv', index=False)

# Inspect the processed data
print("\nProcessed data (first 5 rows):")
print(trips_top_3.head())

Top 3 stations: ['W 21 St & 6 Ave', 'Broadway & W 58 St', 'University Pl & E 14 St']

Processed data (first 5 rows):
         start_station_name           start_hour  trip_count
4760911  Broadway & W 58 St  2022-12-31 10:00:00           1
4760912  Broadway & W 58 St  2022-12-31 12:00:00           1
4760913  Broadway & W 58 St  2022-12-31 20:00:00           1
4760914  Broadway & W 58 St  2023-01-01 00:00:00           6
4760915  Broadway & W 58 St  2023-01-01 01:00:00           2


In [7]:
import os

import hopsworks

# Test connection
# The API key is read from the environment so the secret is never stored in the
# notebook. NOTE(review): a key was previously hardcoded in this cell; treat it
# as compromised and revoke/rotate it in Hopsworks.
# NOTE(review): a previous run of this cell recorded
# "TypeError: ForwardRef._evaluate() missing ... 'recursive_guard'" without any
# of the messages printed below, which suggests the failure happened while
# importing hopsworks -- presumably a Python/dependency version mismatch; confirm.
hopsworks_api_key = os.environ.get("HOPSWORKS_API_KEY")
if not hopsworks_api_key:
    raise RuntimeError(
        "HOPSWORKS_API_KEY environment variable is not set; "
        "export it before running this cell."
    )

try:
    print("Attempting to connect to Hopsworks...")
    project = hopsworks.login(
        host='c.app.hopsworks.ai',
        project='CitiBikeTrip',  # Replace with your exact project name
        api_key_value=hopsworks_api_key
    )
    print("Connection established successfully.")
    fs = project.get_feature_store()
    print("Feature Store retrieved successfully.")
except Exception as e:
    # Report the failure in the cell output, then re-raise so the notebook
    # stops here instead of continuing without a project/feature store.
    print(f"Connection failed: {str(e)}")
    raise

TypeError: ForwardRef._evaluate() missing 1 required keyword-only argument: 'recursive_guard'