In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
import joblib
import pickle

In [2]:
df = pd.read_csv('final_merged_data.csv', keep_default_na=True, delimiter=',', skipinitialspace=True)

In [3]:
df.shape

(298946, 78)

In [4]:
df.head(5)

Unnamed: 0,last_reported,station_id,num_bikes_available,num_docks_available,is_installed,is_renting,is_returning,name,address,lat,...,min_humidity_quality_indicator,min_relative_humidity_percent,humidity_std_quality_indicator,relative_humidity_std_deviation,max_pressure_quality_indicator,max_barometric_pressure_hpa,min_pressure_quality_indicator,min_barometric_pressure_hpa,pressure_std_quality_indicator,barometric_pressure_std_deviation
0,2024-12-01 00:10:00,10,15,1,True,True,True,DAME STREET,Dame Street,53.344006,...,0,83.2,0,0.284,0,1002.56,0,1002.26,0,0.083
1,2024-12-01 00:10:00,100,17,8,True,True,True,HEUSTON BRIDGE (SOUTH),Heuston Bridge (South),53.347107,...,0,83.2,0,0.284,0,1002.56,0,1002.26,0,0.083
2,2024-12-01 00:10:00,109,20,9,True,True,True,BUCKINGHAM STREET LOWER,Buckingham Street Lower,53.353333,...,0,83.2,0,0.284,0,1002.56,0,1002.26,0,0.083
3,2024-12-01 00:10:00,11,1,29,True,True,True,EARLSFORT TERRACE,Earlsfort Terrace,53.334293,...,0,83.2,0,0.284,0,1002.56,0,1002.26,0,0.083
4,2024-12-01 00:10:00,114,4,36,True,True,True,WILTON TERRACE (PARK),Wilton Terrace (Park),53.333652,...,0,83.2,0,0.284,0,1002.56,0,1002.26,0,0.083


In [5]:
columns = ['last_reported', 'station_id', 'num_bikes_available',
           'num_docks_available', 'is_installed', 'is_renting', 'is_returning',
           'name', 'address', 'lat', 'lon', 'capacity', 'stno', 'year', 'month',
           'day', 'hour', 'minute']

sub_set = df[columns]
sub_set.head(5)


Unnamed: 0,last_reported,station_id,num_bikes_available,num_docks_available,is_installed,is_renting,is_returning,name,address,lat,lon,capacity,stno,year,month,day,hour,minute
0,2024-12-01 00:10:00,10,15,1,True,True,True,DAME STREET,Dame Street,53.344006,-6.266802,16,175,2024,12,1,0,10
1,2024-12-01 00:10:00,100,17,8,True,True,True,HEUSTON BRIDGE (SOUTH),Heuston Bridge (South),53.347107,-6.292041,25,175,2024,12,1,0,10
2,2024-12-01 00:10:00,109,20,9,True,True,True,BUCKINGHAM STREET LOWER,Buckingham Street Lower,53.353333,-6.249319,29,175,2024,12,1,0,10
3,2024-12-01 00:10:00,11,1,29,True,True,True,EARLSFORT TERRACE,Earlsfort Terrace,53.334293,-6.258503,30,175,2024,12,1,0,10
4,2024-12-01 00:10:00,114,4,36,True,True,True,WILTON TERRACE (PARK),Wilton Terrace (Park),53.333652,-6.248345,40,175,2024,12,1,0,10


In [6]:
df.columns

Index(['last_reported', 'station_id', 'num_bikes_available',
       'num_docks_available', 'is_installed', 'is_renting', 'is_returning',
       'name', 'address', 'lat', 'lon', 'capacity', 'stno', 'year', 'month',
       'day', 'hour', 'minute', 'max_air_temp_quality_indicator',
       'max_air_temperature_celsius', 'min_air_temp_quality_indicator',
       'min_air_temperature_celsius', 'air_temp_std_quality_indicator',
       'air_temperature_std_deviation', 'max_grass_temp_quality_indicator',
       'max_grass_temperature_celsius', 'min_grass_temp_quality_indicator',
       'min_grass_temperature_celsius', 'grass_temp_std_quality_indicator',
       'grass_temperature_std_deviation',
       'max_soil_temp_5cm_quality_indicator',
       'max_soil_temperature_5cm_celsius',
       'min_soil_temp_5cm_quality_indicator',
       'min_soil_temperature_5cm_celsius',
       'soil_temp_std_5cm_quality_indicator',
       'soil_temperature_std_deviation_5cm',
       'max_soil_temp_10cm_quality_in

In [7]:
df.dtypes
df['last_reported'] = pd.to_datetime(df['last_reported'])

In [8]:
df['Weekday_str'] = df['last_reported'].dt.day_name()



# Map weekday names to numbers (Mon=1, ..., Sun=7)
weekday_map = {
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6,
    'Sunday': 7
}

# Add the numeric representation of the weekday
df['Weekday'] = df['Weekday_str'].map(weekday_map)



df.columns

Index(['last_reported', 'station_id', 'num_bikes_available',
       'num_docks_available', 'is_installed', 'is_renting', 'is_returning',
       'name', 'address', 'lat', 'lon', 'capacity', 'stno', 'year', 'month',
       'day', 'hour', 'minute', 'max_air_temp_quality_indicator',
       'max_air_temperature_celsius', 'min_air_temp_quality_indicator',
       'min_air_temperature_celsius', 'air_temp_std_quality_indicator',
       'air_temperature_std_deviation', 'max_grass_temp_quality_indicator',
       'max_grass_temperature_celsius', 'min_grass_temp_quality_indicator',
       'min_grass_temperature_celsius', 'grass_temp_std_quality_indicator',
       'grass_temperature_std_deviation',
       'max_soil_temp_5cm_quality_indicator',
       'max_soil_temperature_5cm_celsius',
       'min_soil_temp_5cm_quality_indicator',
       'min_soil_temperature_5cm_celsius',
       'soil_temp_std_5cm_quality_indicator',
       'soil_temperature_std_deviation_5cm',
       'max_soil_temp_10cm_quality_in

In [9]:
columns_to_keep = ['last_reported', 'station_id', 'num_bikes_available',
       'num_docks_available', 'is_installed', 'is_renting', 'is_returning',
       'name', 'address', 'lat', 'lon', 'capacity', 'stno', 'year', 'month',
       'day', 'hour', 'minute','max_air_temperature_celsius','min_air_temperature_celsius',
    'max_relative_humidity_percent','min_relative_humidity_percent','max_barometric_pressure_hpa','min_barometric_pressure_hpa','Weekday']
    
df =df[columns_to_keep]

In [10]:
df.columns

Index(['last_reported', 'station_id', 'num_bikes_available',
       'num_docks_available', 'is_installed', 'is_renting', 'is_returning',
       'name', 'address', 'lat', 'lon', 'capacity', 'stno', 'year', 'month',
       'day', 'hour', 'minute', 'max_air_temperature_celsius',
       'min_air_temperature_celsius', 'max_relative_humidity_percent',
       'min_relative_humidity_percent', 'max_barometric_pressure_hpa',
       'min_barometric_pressure_hpa', 'Weekday'],
      dtype='object')

In [11]:
features = ['station_id',
       'num_docks_available', 'is_installed', 'is_renting', 'is_returning','lat', 'lon', 'capacity', 'stno', 'year', 'month',
       'day', 'hour', 'minute', 'max_air_temperature_celsius',
       'min_air_temperature_celsius', 'max_relative_humidity_percent',
       'min_relative_humidity_percent', 'max_barometric_pressure_hpa',
       'min_barometric_pressure_hpa', 'Weekday']
target = ['num_bikes_available']

In [12]:
X = df[features]
y = df[target]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")
print(f"R² Score: {r2}")

Mean Absolute Error: 1.1321464544734214
R² Score: 0.8709444724862798


In [13]:
# Display model coefficients
print("\nModel Coefficients:")
for feature, coef in zip(features, model.coef_):
    print(f"{feature}: {coef}")
print(f"Intercept: {model.intercept_}")


Model Coefficients:
station_id: [ 3.07585310e-04 -9.16617678e-01  3.97459843e-13  2.02576844e-12
  2.88657986e-15  2.43916587e+01  2.31725415e+00  8.90704653e-01
 -3.55271368e-15  1.40776280e-13 -8.88178420e-15  2.13309942e-03
 -1.74692694e-02 -4.11479132e-04  8.84235337e-02 -9.61454915e-02
 -4.79345158e-03  5.39912757e-03 -1.04660265e-01  1.02758490e-01
  5.62701219e-03]
Intercept: [-1283.12924712]


In [14]:
y_test.shape

(89684, 1)

In [15]:
y_pred.shape

(89684, 1)

In [16]:
result_df = pd.DataFrame({
    'Actual': y_test.squeeze(),  # Remove extra dimension
    'Predicted': y_pred.squeeze()
})
result_df.head(5)

Unnamed: 0,Actual,Predicted
297259,31,28.610616
14520,23,21.68192
113311,4,4.005694
212766,1,2.021628
86232,0,0.354279
