In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler  
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader,TensorDataset
import torch.nn.functional as F
from tqdm.notebook import tqdm
from xgboost import XGBRegressor
from sklearn.model_selection import RepeatedKFold,cross_val_score

In [2]:
# Read the training split into memory and preview its first rows.
train_df = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/train.csv'
)
train_df.head(20)

Unnamed: 0,row_id,time,x,y,direction,congestion
0,0,1991-04-01 00:00:00,0,0,EB,70
1,1,1991-04-01 00:00:00,0,0,NB,49
2,2,1991-04-01 00:00:00,0,0,SB,24
3,3,1991-04-01 00:00:00,0,1,EB,18
4,4,1991-04-01 00:00:00,0,1,NB,60
5,5,1991-04-01 00:00:00,0,1,SB,58
6,6,1991-04-01 00:00:00,0,1,WB,26
7,7,1991-04-01 00:00:00,0,2,EB,31
8,8,1991-04-01 00:00:00,0,2,NB,49
9,9,1991-04-01 00:00:00,0,2,SB,46


In [3]:
# Show column dtypes, non-null counts and memory usage of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 848835 entries, 0 to 848834
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   row_id      848835 non-null  int64 
 1   time        848835 non-null  object
 2   x           848835 non-null  int64 
 3   y           848835 non-null  int64 
 4   direction   848835 non-null  object
 5   congestion  848835 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 38.9+ MB


In [4]:
# Preview the first five rows of the (still unprocessed) training frame.
train_df.head()

Unnamed: 0,row_id,time,x,y,direction,congestion
0,0,1991-04-01 00:00:00,0,0,EB,70
1,1,1991-04-01 00:00:00,0,0,NB,49
2,2,1991-04-01 00:00:00,0,0,SB,24
3,3,1991-04-01 00:00:00,0,1,EB,18
4,4,1991-04-01 00:00:00,0,1,NB,60


In [5]:
def encode(x):
    """Map a direction label to an integer code.

    'EB' -> 1, 'WB' -> 2, 'SB' -> 3; every other value (including 'NB')
    falls through to 4.

    NOTE(review): because 4 is a catch-all, any label other than the three
    listed above shares a code with 'NB' -- confirm the data contains no
    further direction values.
    """
    # Compared in the same order as the label table; first match wins.
    for label, code in (('EB', 1), ('WB', 2), ('SB', 3)):
        if x == label:
            return code
    return 4

In [6]:
def preprocess(df):
    """Turn the raw ``time`` and ``direction`` columns into numeric features.

    Adds ``day``, ``month``, ``minute`` and ``week`` columns derived from
    ``time``, replaces ``direction`` with the integer code returned by
    ``encode``, and drops the ``time`` and ``row_id`` columns.

    The frame is modified in place and the same object is returned. Because
    ``time`` and ``row_id`` are removed, calling this a second time on an
    already-processed frame raises ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time``, ``direction`` and ``row_id`` columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the engineered columns.
    """
    df['time'] = pd.to_datetime(df['time'])
    df['day'] = df['time'].dt.day
    df['month'] = df['time'].dt.month
    df['minute'] = df['time'].dt.minute
    # NOTE(review): the hour of day is not extracted, only the minute --
    # confirm this is intentional.

    # ``Series.dt.week`` was deprecated in pandas 1.1 and removed in 2.0.
    # ``isocalendar().week`` gives the same ISO week number but as nullable
    # UInt32, so cast back to what the old accessor produced: int64, or
    # float64 when missing timestamps are present.
    iso_week = df['time'].dt.isocalendar().week
    df['week'] = iso_week.astype('float64' if iso_week.hasnans else 'int64')

    df['direction'] = df['direction'].apply(encode)
    df.drop(['time', 'row_id'], axis=1, inplace=True)
    return df
    

In [7]:
# Feature-engineer the training frame. preprocess() drops 'time' and 'row_id'
# from the frame it is given, so re-running this cell without reloading
# train_df raises KeyError.
train_df=preprocess(train_df)

  


In [8]:
# Separate the model inputs from the target column.
X = train_df.drop(columns='congestion')
Y = train_df['congestion'].values

In [9]:
# Preview the feature matrix that will be passed to the model.
X.head()

Unnamed: 0,x,y,direction,day,month,minute,week
0,0,0,1,1,4,0,14
1,0,0,4,1,4,0,14
2,0,0,3,1,4,0,14
3,0,1,1,1,4,0,14
4,0,1,4,1,4,0,14


In [10]:
# Gradient-boosted tree regressor; hyper-parameters are fixed by hand here.
model = XGBRegressor(
    n_estimators=100,
    max_depth=7,
    eta=0.1,
    subsample=0.7,
    colsample_bytree=0.8,
)

In [11]:
# Optional repeated k-fold estimate of the MAE (currently disabled):
# cv=RepeatedKFold(n_splits=5,n_repeats=3,random_state=1)
# scores = cross_val_score(model, X, Y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# scores=np.absolute(scores)

In [12]:
# Fit on every training row; no rows are held out for validation here.
model.fit(X,Y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             eta=0.1, gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.100000001,
             max_delta_step=0, max_depth=7, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.7,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [13]:
def mae(x,y):
    """Return the mean absolute difference between ``x`` and ``y``.

    Both arguments must support ``x - y`` and the result of ``abs(...)`` must
    expose ``.mean()`` (e.g. NumPy arrays or pandas Series).
    """
    return abs(x - y).mean()

In [14]:
# Read the test split that predictions will be produced for.
test_df=pd.read_csv('../input/tabular-playground-series-mar-2022/test.csv')

In [15]:
# Preview the raw test rows before feature engineering.
test_df.head()

Unnamed: 0,row_id,time,x,y,direction
0,848835,1991-09-30 12:00:00,0,0,EB
1,848836,1991-09-30 12:00:00,0,0,NB
2,848837,1991-09-30 12:00:00,0,0,SB
3,848838,1991-09-30 12:00:00,0,1,EB
4,848839,1991-09-30 12:00:00,0,1,NB


In [16]:
# Apply the same feature engineering used for the training frame, then
# preview the result.
test_df = preprocess(test_df)
test_df.head()

  


Unnamed: 0,x,y,direction,day,month,minute,week
0,0,0,1,30,9,0,40
1,0,0,4,30,9,0,40
2,0,0,3,30,9,0,40
3,0,1,1,30,9,0,40
4,0,1,4,30,9,0,40


In [17]:
# Predict congestion for every preprocessed test row.
prediction=model.predict(test_df)

In [18]:
# Take the sample submission as a template, overwrite its 'congestion' column
# with the model output, write it out and preview the first rows.
submission = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/sample_submission.csv'
).assign(congestion=prediction)
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,row_id,congestion
0,848835,44.160351
1,848836,34.01511
2,848837,48.49773
3,848838,25.341751
4,848839,60.981831
