In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor

In [2]:
df_train = pd.read_csv('dataset/train_feature_selected.csv')
df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])

df_test = pd.read_csv('dataset/test_feature_selected.csv')
df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

### Split df_train into train data and validation data

In [3]:
new_id_list = df_train.item_id.unique()

In [4]:
df_train_splitted = pd.DataFrame()
df_valid_splitted = pd.DataFrame()

for i in new_id_list:
    trial = df_train.loc[df_train['item_id'] == i]
    trial['index_column'] = trial.reset_index().index
    number_of_row = trial.shape[0]
    split_point = round(0.8*number_of_row)
    trial_train = trial[trial['index_column'] < split_point]
    df_train_splitted = pd.concat([df_train_splitted, trial_train], ignore_index=True)
    trial_valid = trial[trial['index_column'] >= split_point]
    df_valid_splitted = pd.concat([df_valid_splitted, trial_valid], ignore_index=True)
    
#df_train_splitted = df_train.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trial['index_column'] = trial.reset_index().index
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trial['index_column'] = trial.reset_index().index
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trial['index_column'] = trial.reset_index().index
A value is trying to be set on a copy of a slice from a

In [5]:
df_train_splitted.shape

(394087, 9)

## Modeling

In [6]:
all_time_series = df_train_splitted['item_id'].unique()

all_results = []
final_model = {}

for i in tqdm(all_time_series):
    df_subset = df_train_splitted.loc[df_train_splitted['item_id'] == i]

    X = df_subset.drop(['rerata_kecepatan','timestamp','index_column'],axis = 1)
    y = df_subset['rerata_kecepatan']

    # Create a RandomForestRegressor model
    model = RandomForestRegressor()

    # Fit the model on the subset data
    model.fit(X, y)

    # Store results in all_results
    p = df_subset.iloc[0:1].copy()
    p['item_id'] = str(i)
    all_results.append(p)

    final_model[i] = model

100%|███████████████████████████████████████████████████████████████████████| 934/934 [02:05<00:00,  7.42it/s]


In [7]:
concat_results = pd.concat(all_results,axis=0)
concat_results.head()

Unnamed: 0,item_id,timestamp,rerata_kecepatan,mean_rerata_kecepatan_mingguan,mean_rerata_kecepatan_harian,hour,2_per_3_maxspeed,highway,index_column
0,691007296_21390008_1425033102,2020-02-01 01:00:00,29.126,29.165,33.901227,1,19.98,1,0
422,47010584_1677092762_579493410,2020-02-01 01:00:00,46.576,44.204,43.751909,1,19.98,2,0
844,22932408_26486694_1930267566,2020-02-01 01:00:00,36.587,37.125,37.7365,1,13.32,0,0
1266,142479648_1111592522_3775231113,2020-02-01 01:00:00,34.063,33.89575,36.596682,1,13.32,2,0
1688,8504977_5940503398_5940503394,2020-02-01 01:00:00,38.336,39.62725,39.681182,1,19.98,2,0


### Predict Validation Data

In [8]:
all_valid_time_series = df_valid_splitted['item_id'].unique()

all_result_valid = []


for i in tqdm(all_valid_time_series):
    df_subset = df_valid_splitted.loc[df_valid_splitted['item_id'] == i]
    df_subset.head()

    X = df_subset.drop(['rerata_kecepatan','timestamp','index_column'],axis = 1)

    # Predict using the loaded model
    model = final_model[i]
    predictions = model.predict(X)

    # Store results in all_results
    #p = predictions.iloc[0:1].copy()
    p = pd.DataFrame()
    p['predictions'] = predictions
    timestamp = np.array(df_subset['timestamp'])
    p['timestamp'] = pd.to_datetime(timestamp)
    p['item_id'] = str(i)
    #p['predictions'] = predictions
    all_result_valid.append(p)


100%|███████████████████████████████████████████████████████████████████████| 934/934 [00:12<00:00, 75.42it/s]


In [9]:
# Concatenate all individual DataFrames into one DataFrame
concat_result_valid = pd.concat(all_result_valid, axis=0)

In [10]:
#define sMAPE

def smape(actual, forecast):
    return 100 * np.mean(2 * np.abs(actual - forecast) / (np.abs(actual) + np.abs(forecast)))

In [11]:
score_validation = smape(df_valid_splitted['rerata_kecepatan'],concat_result_valid['predictions'])
score_validation

41.046202519624195

### Predict Test Data

In [12]:
all_test_time_series = df_test['item_id'].unique()

all_result_test = []


for i in tqdm(all_test_time_series):
    df_subset = df_test.loc[df_test['item_id'] == i]
    df_subset.head()

    X = df_subset.drop(['timestamp'],axis = 1)

    # Predict using the loaded model
    model = final_model[i]
    predictions = model.predict(X)

    # Store results in all_results
    #p = predictions.iloc[0:1].copy()
    p = pd.DataFrame()
    p['predictions'] = predictions
    timestamp = np.array(df_subset['timestamp'])
    p['timestamp'] = pd.to_datetime(timestamp)
    p['item_id'] = str(i)
    #p['predictions'] = predictions
    all_result_test.append(p)


100%|███████████████████████████████████████████████████████████████████████| 934/934 [00:12<00:00, 76.44it/s]


In [14]:
# Concatenate all individual DataFrames into one DataFrame
concat_result_test = pd.concat(all_result_test, axis=0)


### Create Submission file

In [16]:
submission.head()

Unnamed: 0,id,timestamp,lanes,maxspeed,highway,item_id,rerata_kecepatan
0,0,2020-02-23,2.0,30,1,4004732_32046542_6454026544,0
1,1,2020-02-23,3.0,30,1,182210371_1314925464_1314925496,0
2,2,2020-02-23,1.0,20,0,22932408_1482086782_26481020,0
3,3,2020-02-23,3.0,30,1,182210371_3892883_267337489,0
4,4,2020-02-23,1.0,30,1,66924592_266041030_2592978110,0


In [18]:
submission = pd.read_csv('dataset/test_preprocessed.csv')
submission['timestamp'] = pd.to_datetime(df_test['timestamp'])

submission['rerata_kecepatan'] = 0

for i in range(len(submission)):
    time = submission.loc[i, 'timestamp']
    series_id = submission.loc[i, 'item_id']
    
    condition = (concat_result_test['item_id'] == series_id) & (concat_result_test['timestamp'] == time)
    matching_rows = concat_result_test.loc[condition, 'predictions']
    
    if not matching_rows.empty:
        result = matching_rows.iloc[0]  # Assuming only one matching row
        submission.loc[i, 'rerata_kecepatan'] = result
    else:
        submission.loc[i, 'rerata_kecepatan'] = None  # No match found

print(submission)

            id           timestamp  lanes  maxspeed  highway  \
0            0 2020-02-23 00:00:00    2.0        30        1   
1            1 2020-02-23 00:00:00    3.0        30        1   
2            2 2020-02-23 00:00:00    1.0        20        0   
3            3 2020-02-23 00:00:00    3.0        30        1   
4            4 2020-02-23 00:00:00    1.0        30        1   
...        ...                 ...    ...       ...      ...   
127484  127484 2020-02-29 23:00:00    2.0        30        2   
127485  127485 2020-02-29 23:00:00    1.0        20        2   
127486  127486 2020-02-29 23:00:00    1.0        30        2   
127487  127487 2020-02-29 23:00:00    1.0        20        0   
127488  127488 2020-02-29 23:00:00    3.0        30        1   

                                item_id  rerata_kecepatan  
0           4004732_32046542_6454026544         43.144380  
1       182210371_1314925464_1314925496         38.225021  
2          22932408_1482086782_26481020         37.

In [19]:
submission = submission[['id','rerata_kecepatan']]

In [20]:
submission.head()

Unnamed: 0,id,rerata_kecepatan
0,0,43.14438
1,1,38.225021
2,2,37.19423
3,3,44.563099
4,4,27.713721


In [21]:
submission.to_csv(r'submission/submission_new_1.csv', index=False)