In [1]:

import pandas as pd
from datetime import datetime

from tqdm.notebook import tqdm

import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


In [123]:
# Load the raw records; the relative path is resolved against the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='municipality_bus_utilization.csv')

### Datasets
Apply date filters to split the data into a training set and a test set.

In [124]:
# Date boundaries of the split. Each literal parses to midnight of that day.
test_data_start = pd.to_datetime('2017-08-07')
test_data_end = pd.to_datetime('2017-08-19')

train_data_start = pd.to_datetime('2017-06-05')

# Parse once so the comparisons below operate on real Timestamps; running this
# line again on an already-parsed column leaves it unchanged.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Test window: from test_data_start through the END of test_data_end's day.
# test_data_end used to be defined but never applied, leaving the window
# open-ended. The bound is inclusive of the whole end day because the rows of
# 2017-08-19 belong to the test set.
test_data = df[
    (df['timestamp'] >= test_data_start)
    & (df['timestamp'] < test_data_end.normalize() + pd.Timedelta(days=1))
]
print('Test Dataset',len(test_data))
display(test_data)

# Training window: strictly after train_data_start and strictly before the test window.
train_data = df[(df['timestamp'] < test_data_start) & (df['timestamp'] > train_data_start)]
print('Train Dataset',len(train_data))
display(train_data)

Test Dataset 2320


Unnamed: 0,timestamp,municipality_id,usage,total_capacity
10750,2017-08-07 07:59:16,1,223,397
10751,2017-08-07 07:59:16,4,1352,3893
10752,2017-08-07 07:59:16,2,271,697
10753,2017-08-07 07:59:16,3,655,1930
10754,2017-08-07 07:59:16,7,686,2019
...,...,...,...,...
13065,2017-08-19 16:30:35,2,548,697
13066,2017-08-19 16:30:35,8,1193,2947
13067,2017-08-19 16:30:35,7,1354,2019
13068,2017-08-19 16:30:35,6,1680,3113


Train Dataset 10570


Unnamed: 0,timestamp,municipality_id,usage,total_capacity
180,2017-06-05 07:57:17,5,63,587
181,2017-06-05 07:57:17,9,462,1332
182,2017-06-05 07:57:17,1,155,397
183,2017-06-05 07:57:17,2,291,697
184,2017-06-05 07:57:17,4,1113,3893
...,...,...,...,...
10745,2017-08-06 16:29:16,6,868,3113
10746,2017-08-06 16:29:16,7,1111,2019
10747,2017-08-06 16:29:16,0,500,2813
10748,2017-08-06 16:29:16,9,623,1332


## Creating a new dataset with better inputs
<br>1. Each week gets its own id (weeks are counted from the training start date)
<br>2. Each day of the week gets its own id
<br>3. The time of day (hours and minutes) is converted into a single variable (hour*60 + minute)

In [125]:
def create_dataframe(df, week_origin=None):
    """Build the numeric feature table used for modelling.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``timestamp``, ``municipality_id``,
        ``usage`` and ``total_capacity``. ``timestamp`` may be datetime-typed
        or anything ``pd.to_datetime`` can parse.
    week_origin : datetime-like, optional
        Marks day 0 of week 0; only its calendar date is used. When omitted,
        the notebook-level ``train_data_start`` is used, which is the value
        this function relied on before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with a fresh 0..n-1 index and
        int64 columns ``week``, ``week_day``, ``time`` (hour*60 + minute),
        ``mid``, ``usage`` and ``capacity``. An empty input yields an empty
        frame that still has these columns.
    """
    if week_origin is None:
        week_origin = train_data_start
    # Drop the time-of-day part so weeks are counted from midnight of the origin date.
    origin = pd.Timestamp(week_origin).normalize()

    stamps = pd.to_datetime(df['timestamp'])
    days_since_origin = (stamps - origin).dt.days

    # Plain arrays (not Series) are used so the result gets a clean 0..n-1
    # index regardless of the index labels carried by ``df``.
    features = pd.DataFrame({
        # Floor division keeps dates before the origin in their own (negative)
        # weeks instead of folding the preceding week into week 0.
        "week": (days_since_origin // 7).to_numpy(),
        "week_day": stamps.dt.weekday.to_numpy(),
        "time": (stamps.dt.hour * 60 + stamps.dt.minute).to_numpy(),
        "mid": df['municipality_id'].to_numpy(),
        "usage": df['usage'].to_numpy(),
        "capacity": df['total_capacity'].to_numpy(),
    })
    return features.astype('int64')


# Run both splits through the same feature transformation, then show the two tables.
train_df, test_df = (create_dataframe(split) for split in (train_data, test_data))
display(train_df, test_df)
    

Unnamed: 0,week,week_day,time,mid,usage,capacity
0,0,0,477,5,63,587
1,0,0,477,9,462,1332
2,0,0,477,1,155,397
3,0,0,477,2,291,697
4,0,0,477,4,1113,3893
...,...,...,...,...,...,...
10565,8,6,989,6,868,3113
10566,8,6,989,7,1111,2019
10567,8,6,989,0,500,2813
10568,8,6,989,9,623,1332


Unnamed: 0,week,week_day,time,mid,usage,capacity
0,9,0,479,1,223,397
1,9,0,479,4,1352,3893
2,9,0,479,2,271,697
3,9,0,479,3,655,1930
4,9,0,479,7,686,2019
...,...,...,...,...,...,...
2315,10,5,990,2,548,697
2316,10,5,990,8,1193,2947
2317,10,5,990,7,1354,2019
2318,10,5,990,6,1680,3113


### Separating data
Split each dataset into inputs (features) and outputs (the `usage` target).

In [126]:
# Inputs are every column except the target; the target is the observed 'usage'.
X_train = train_df.drop(columns=['usage'])
y_train = train_df['usage']
X_test = test_df.drop(columns=['usage'])
y_test = test_df['usage']

First, I tried the Random Forest algorithm to see its accuracy (for a regressor, `score` reports the R² coefficient of determination).

In [127]:
# Baseline model: a random forest regressor with default hyper-parameters.
# random_state pins the forest's internal randomness so that a fresh
# Restart & Run All reproduces the same fitted model and score.
model = RandomForestRegressor(random_state=42)
# .values passes plain arrays, so the model is fitted without column names.
model.fit(X_train.values, y_train.values)

RandomForestRegressor()

In [128]:
# For a regressor, score() returns the coefficient of determination (R^2),
# not a classification accuracy, so the printed label now says so. The
# variable name is unchanged so any cell that reads it keeps working.
accuracy = model.score(X_test.values, y_test.values)
print(f"R^2 score: {accuracy}")

Accuracy: 0.8288468518288392


Then I tried Gradient Boosting and got better accuracy.

In [129]:
# Second candidate: gradient-boosted trees with default hyper-parameters.
# This rebinds `model`, so the cells below score and predict with this
# estimator. random_state pins the estimator's internal randomness so the
# result is reproducible across runs.
model = GradientBoostingRegressor(random_state=42)
# .values passes plain arrays, so the model is fitted without column names.
model.fit(X_train.values, y_train.values)

GradientBoostingRegressor()

In [130]:
# For a regressor, score() returns the coefficient of determination (R^2),
# not a classification accuracy, so the printed label now says so. The
# variable name is unchanged so any cell that reads it keeps working.
accuracy = model.score(X_test.values, y_test.values)
print(f"R^2 score: {accuracy}")

Accuracy: 0.8887091742635024


## Result
Here are the test data and the model's predictions combined in a single dataset to see the results.

In [131]:
# Predict usage for every test row with the most recently fitted model.
y_pred = model.predict(X_test.values)
# Give the predictions test_df's own index: pd.concat(axis=1) aligns rows by
# index label, so a default 0..n-1 index would only line up with test_df by
# coincidence and could otherwise produce NaN-padded, misaligned rows.
predicted_data = pd.DataFrame({'Predicted': y_pred}, index=test_df.index)
result_data = pd.concat([test_df, predicted_data], axis=1)
# Actual usage and the prediction side by side.
display(result_data)

Unnamed: 0,week,week_day,time,mid,usage,capacity,Predicted
0,9,0,479,1,223,397,182.956967
1,9,0,479,4,1352,3893,1694.439417
2,9,0,479,2,271,697,358.159902
3,9,0,479,3,655,1930,755.271136
4,9,0,479,7,686,2019,774.846617
...,...,...,...,...,...,...,...
2315,10,5,990,2,548,697,483.836913
2316,10,5,990,8,1193,2947,1078.897161
2317,10,5,990,7,1354,2019,1183.791473
2318,10,5,990,6,1680,3113,1014.927181
