In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestClassifier
import joblib
import pickle

In [2]:
df = pd.read_csv('final_merged_data.csv', keep_default_na=True, delimiter=',', skipinitialspace=True)
df.dropna(inplace=True)

In [3]:
df.shape

(298946, 78)

In [4]:
df.head(5)

Unnamed: 0,last_reported,station_id,num_bikes_available,num_docks_available,is_installed,is_renting,is_returning,name,address,lat,...,min_humidity_quality_indicator,min_relative_humidity_percent,humidity_std_quality_indicator,relative_humidity_std_deviation,max_pressure_quality_indicator,max_barometric_pressure_hpa,min_pressure_quality_indicator,min_barometric_pressure_hpa,pressure_std_quality_indicator,barometric_pressure_std_deviation
0,2024-12-01 00:10:00,10,15,1,True,True,True,DAME STREET,Dame Street,53.344006,...,0,83.2,0,0.284,0,1002.56,0,1002.26,0,0.083
1,2024-12-01 00:10:00,100,17,8,True,True,True,HEUSTON BRIDGE (SOUTH),Heuston Bridge (South),53.347107,...,0,83.2,0,0.284,0,1002.56,0,1002.26,0,0.083
2,2024-12-01 00:10:00,109,20,9,True,True,True,BUCKINGHAM STREET LOWER,Buckingham Street Lower,53.353333,...,0,83.2,0,0.284,0,1002.56,0,1002.26,0,0.083
3,2024-12-01 00:10:00,11,1,29,True,True,True,EARLSFORT TERRACE,Earlsfort Terrace,53.334293,...,0,83.2,0,0.284,0,1002.56,0,1002.26,0,0.083
4,2024-12-01 00:10:00,114,4,36,True,True,True,WILTON TERRACE (PARK),Wilton Terrace (Park),53.333652,...,0,83.2,0,0.284,0,1002.56,0,1002.26,0,0.083


In [5]:
columns = ['last_reported', 'station_id', 'num_bikes_available',
           'num_docks_available', 'is_installed', 'is_renting', 'is_returning',
           'name', 'address', 'lat', 'lon', 'capacity', 'stno', 'year', 'month',
           'day', 'hour', 'minute']

sub_set = df[columns]
sub_set.head(5)


Unnamed: 0,last_reported,station_id,num_bikes_available,num_docks_available,is_installed,is_renting,is_returning,name,address,lat,lon,capacity,stno,year,month,day,hour,minute
0,2024-12-01 00:10:00,10,15,1,True,True,True,DAME STREET,Dame Street,53.344006,-6.266802,16,175,2024,12,1,0,10
1,2024-12-01 00:10:00,100,17,8,True,True,True,HEUSTON BRIDGE (SOUTH),Heuston Bridge (South),53.347107,-6.292041,25,175,2024,12,1,0,10
2,2024-12-01 00:10:00,109,20,9,True,True,True,BUCKINGHAM STREET LOWER,Buckingham Street Lower,53.353333,-6.249319,29,175,2024,12,1,0,10
3,2024-12-01 00:10:00,11,1,29,True,True,True,EARLSFORT TERRACE,Earlsfort Terrace,53.334293,-6.258503,30,175,2024,12,1,0,10
4,2024-12-01 00:10:00,114,4,36,True,True,True,WILTON TERRACE (PARK),Wilton Terrace (Park),53.333652,-6.248345,40,175,2024,12,1,0,10


In [6]:
df.columns

Index(['last_reported', 'station_id', 'num_bikes_available',
       'num_docks_available', 'is_installed', 'is_renting', 'is_returning',
       'name', 'address', 'lat', 'lon', 'capacity', 'stno', 'year', 'month',
       'day', 'hour', 'minute', 'max_air_temp_quality_indicator',
       'max_air_temperature_celsius', 'min_air_temp_quality_indicator',
       'min_air_temperature_celsius', 'air_temp_std_quality_indicator',
       'air_temperature_std_deviation', 'max_grass_temp_quality_indicator',
       'max_grass_temperature_celsius', 'min_grass_temp_quality_indicator',
       'min_grass_temperature_celsius', 'grass_temp_std_quality_indicator',
       'grass_temperature_std_deviation',
       'max_soil_temp_5cm_quality_indicator',
       'max_soil_temperature_5cm_celsius',
       'min_soil_temp_5cm_quality_indicator',
       'min_soil_temperature_5cm_celsius',
       'soil_temp_std_5cm_quality_indicator',
       'soil_temperature_std_deviation_5cm',
       'max_soil_temp_10cm_quality_in

In [7]:
df["last_reported"] = pd.to_datetime(df["last_reported"])
df["hour"] = df["last_reported"].dt.hour
df["day_of_week"] = df["last_reported"].dt.weekday


In [8]:
# columns_to_keep = ['last_reported', 'station_id', 'num_bikes_available',
#        'num_docks_available', 'is_installed', 'is_renting', 'is_returning',
#        'name', 'address', 'lat', 'lon', 'capacity', 'stno', 'year', 'month',
#        'day', 'hour', 'minute','max_air_temperature_celsius','min_air_temperature_celsius',
#     'max_relative_humidity_percent','min_relative_humidity_percent','max_barometric_pressure_hpa','min_barometric_pressure_hpa','Weekday']
    
# df =df[columns_to_keep]

In [9]:
df.columns

Index(['last_reported', 'station_id', 'num_bikes_available',
       'num_docks_available', 'is_installed', 'is_renting', 'is_returning',
       'name', 'address', 'lat', 'lon', 'capacity', 'stno', 'year', 'month',
       'day', 'hour', 'minute', 'max_air_temp_quality_indicator',
       'max_air_temperature_celsius', 'min_air_temp_quality_indicator',
       'min_air_temperature_celsius', 'air_temp_std_quality_indicator',
       'air_temperature_std_deviation', 'max_grass_temp_quality_indicator',
       'max_grass_temperature_celsius', 'min_grass_temp_quality_indicator',
       'min_grass_temperature_celsius', 'grass_temp_std_quality_indicator',
       'grass_temperature_std_deviation',
       'max_soil_temp_5cm_quality_indicator',
       'max_soil_temperature_5cm_celsius',
       'min_soil_temp_5cm_quality_indicator',
       'min_soil_temperature_5cm_celsius',
       'soil_temp_std_5cm_quality_indicator',
       'soil_temperature_std_deviation_5cm',
       'max_soil_temp_10cm_quality_in

In [10]:
df["wind_speed"] = 0.0 
df["precipitation"] = 0.0
features = ["station_id",
    "max_air_temperature_celsius",
    "max_relative_humidity_percent",
    "max_barometric_pressure_hpa",
    "wind_speed",
    "precipitation",
    "hour",
    "day_of_week"]
target = ['num_bikes_available']
X = df[features]
y = df[target]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a linear regression model
# model = LinearRegression()
model = RandomForestRegressor(n_estimators=30, max_depth=30, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")
print(f"R² Score: {r2}")

  return fit_method(estimator, *args, **kwargs)


Mean Absolute Error: 1.1180305027495117
R² Score: 0.9568825185992972


In [11]:
y_test.shape

(89684, 1)

In [12]:
y_pred.shape

(89684,)

In [13]:
result_df = pd.DataFrame({
    'Actual': y_test.squeeze(),  # Remove extra dimension
    'Predicted': y_pred.squeeze()
})
result_df.head(5)

Unnamed: 0,Actual,Predicted
297259,31,30.466667
14520,23,25.365
113311,4,2.573622
212766,1,1.966667
86232,0,0.2


In [14]:
# Save the model to a .pkl file

import joblib

model_filename = "bike_availability_model_small.pkl"
with open(model_filename, "wb") as file:
    # pickle.dump(model, file)
    joblib.dump(model, file, compress=5)
print(f"Model saved to {model_filename}")

Model saved to bike_availability_model_small.pkl
