In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import plotly.express as px
import plotly.graph_objects as go

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,KFold,cross_val_score,train_test_split

from catboost import CatBoostRegressor, Pool

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the three competition CSV files: the labelled training rows, the rows to
# predict, and the submission template that is copied later when writing results.
train = pd.read_csv("/kaggle/input/tabular-playground-series-mar-2022/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-mar-2022/test.csv")
sample = pd.read_csv("/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv")

In [3]:
train

In [4]:
def create_temporal_feat(df,date_name):
    """Return a copy of ``df`` with calendar features derived from ``date_name``.

    The input frame is left untouched, so the function can be re-run on the
    same raw frame without side effects.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a timestamp column formatted as ``%Y-%m-%d %H:%M:%S``.
    date_name : str
        Name of that timestamp column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where ``date_name`` is parsed to datetime and the columns
        ``year``, ``month``, ``week`` (ISO week), ``day``, ``week_day``
        (Monday = 0), ``hours``, ``minute`` and ``year_month`` (``YYYYMM``
        string) are added.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_datetime`` when a value does not match the format.
    """
    out = df.copy()

    stamps = pd.to_datetime(out[date_name], format = '%Y-%m-%d %H:%M:%S')
    out[date_name] = stamps
    out["year"] = stamps.dt.year
    out["month"] = stamps.dt.month
    out["week"] = stamps.dt.isocalendar().week
    out["day"] = stamps.dt.day
    out["week_day"] = stamps.dt.weekday
    out["hours"] = stamps.dt.hour
    out['minute'] = stamps.dt.minute

    # Zero-pad the month so the key is always six characters (e.g. "199104").
    out["year_month"] = out["year"].astype("str") + out['month'].astype(str).str.zfill(2)
    display(out)
    return out

In [5]:
train = create_temporal_feat(train,"time")
test = create_temporal_feat(test,"time")

In [6]:
# Build a per-roadway key by concatenating x, y and direction (e.g. "01NB").
for frame in (train, test):
    frame["road"] = frame['x'].astype(str) + frame['y'].astype(str) + frame['direction']

In [7]:
train

In [8]:
train.road.value_counts()

In [9]:
road_test = train[train["road"] == "01NB"]

In [10]:
# Congestion over time for the single road selected in `road_test`.
congestion_trace = go.Scatter(x = road_test.time, y = road_test.congestion,name = "congestion")
fig1 = go.Figure(data = [congestion_trace])

fig1

In [11]:
# Distribution of congestion per road, using only rows with a known target.
observed = train[train.congestion.notnull()]
fi = px.box(observed, x="road", y="congestion", color = 'road')
fi

In [12]:
# Mean and median congestion for each road, then a bar chart of the means.
per_road = (
    train.groupby("road")
    .agg(mean_conges = ("congestion", "mean"), median_conges = ("congestion", "median"))
    .reset_index()
)

fig_road = px.bar(per_road, x='road', y= "mean_conges", title= "mean congestion per road")
fig_road.show()

In [13]:
# Mean and median congestion for each direction, then a bar chart of the means.
per_dir = (
    train.groupby("direction")
    .agg(mean_conges = ("congestion", "mean"), median_conges = ("congestion", "median"))
    .reset_index()
)

fig_dir = px.bar(per_dir, x='direction', y= "mean_conges", title= "mean congestion per direction")
fig_dir.show()

In [14]:
# Mean and median congestion for each weekday (0 = Monday), then a bar chart of the means.
per_day = (
    train.groupby("week_day")
    .agg(mean_conges = ("congestion", "mean"), median_conges = ("congestion", "median"))
    .reset_index()
)

fig_day = px.bar(per_day, x='week_day', y= "mean_conges", title= "mean congestion per day")
fig_day.show()

In [15]:
# Mean and median congestion for each ISO week, then a bar chart of the means.
per_week = (
    train.groupby("week")
    .agg(mean_conges = ("congestion", "mean"), median_conges = ("congestion", "median"))
    .reset_index()
)

fig_week = px.bar(per_week, x='week', y= "mean_conges", title= "mean congestion per week")
fig_week.show()

In [16]:
train

In [17]:
# Mean and median congestion for each hour of the day, then a bar chart of the means.
per_hour = (
    train.groupby("hours")
    .agg(mean_conges = ("congestion", "mean"), median_conges = ("congestion", "median"))
    .reset_index()
)

fig_hour = px.bar(per_hour, x='hours', y= "mean_conges", title= "mean congestion per hours")
fig_hour.show()

In [18]:
train.time

In [19]:
keys = ['road', 'week_day','hours', 'minute']

# Historical congestion statistics per (road, weekday, hour, minute) slot.
# Only the target column is aggregated: reducing the whole frame would also try
# to average/median the string and datetime columns, which recent pandas
# versions reject with a TypeError.
# NOTE(review): the statistics are computed from every row of `train`, including
# the rows a later cell holds out as `valid`, so the validation target leaks
# into these features and validation scores will look optimistic.
congestion_stats = (
    train.groupby(by=keys)["congestion"]
    .agg(["mean", "median", "min", "max"])
    .rename(columns=lambda stat: f"{stat} congestion")
    .reset_index()
)
stat_cols = [name for name in congestion_stats.columns if name not in keys]

# Dropping any previous copy of the statistic columns keeps the cell re-runnable
# (a second merge would otherwise create "_x"/"_y" duplicates).
train = train.drop(columns=stat_cols, errors="ignore").merge(congestion_stats, how='left', on=keys)
test = test.drop(columns=stat_cols, errors="ignore").merge(congestion_stats, how='left', on=keys)

display(train)

In [20]:
# Time-based hold-out: everything after this timestamp becomes the validation set.
start_valid_date = "1991-09-01 11:40:00"

# Take explicit copies: both frames get columns overwritten in later cells, and
# assigning into a filtered view triggers pandas' SettingWithCopyWarning.
valid = train[train["time"] > start_valid_date].copy()
train = train[train["time"] <= start_valid_date].copy()

print(train.shape,valid.shape)

In [21]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
cols = ["road","direction"]

# One encoder per column, fitted on the training split only and then reused for
# the other splits. Re-fitting on each split would only produce the same codes
# when every split happens to contain exactly the same set of labels.
# `transform` raises ValueError on a label that was not seen in `train`, which
# surfaces a mismatch instead of silently assigning a different code.
encoders = {col: LabelEncoder().fit(train[col]) for col in cols}
road_encoder = encoders["road"]

for col, encoder in encoders.items():
    train[col] = encoder.transform(train[col])
    valid[col] = encoder.transform(valid[col])
    test[col] = encoder.transform(test[col])


In [22]:
train

In [23]:
# Min-max scaling of the numeric features is prepared here but left disabled:
# the scaling statements sit inside a bare string literal, so they never run and
# `scaler` stays unfitted.
scaler = MinMaxScaler()
subset = ["minute","max congestion","min congestion","median congestion","mean congestion"]

# NOTE(review): if this is re-enabled, the quoted code calls fit_transform on
# valid and test too, which re-fits the scaler on each split separately.
"""
train.loc[:,subset] = scaler.fit_transform(train.loc[:,subset])
valid.loc[:,subset] = scaler.fit_transform(valid.loc[:,subset])
test.loc[:,subset] = scaler.fit_transform(test.loc[:,subset])
"""

In [24]:
valid

## Modeling

In [25]:
# Targets for the two labelled splits.
y_train, y_valid = train.congestion, valid.congestion

# Columns that are never used as model inputs; the labelled splits additionally
# lose the target. The timestamp becomes the index of every feature matrix.
non_features = ["row_id","x","y","year_month","week"]

X_train = train.drop(columns=["congestion"] + non_features).set_index("time")
X_valid = valid.drop(columns=["congestion"] + non_features).set_index("time")
X_test = test.drop(columns=non_features).set_index("time")

## XGBoost

In [26]:
import xgboost as xgb


# Baseline gradient-boosted regressor: up to 500 trees of depth 8 with a small learning rate.
reg = xgb.XGBRegressor(n_estimators=500,
                    learning_rate = 0.01,
                      max_depth = 8)

# RMSE on the hold-out split is reported every 50 rounds; training stops once it
# has not improved for 50 consecutive rounds.
# NOTE(review): newer xgboost releases expect `eval_metric` and
# `early_stopping_rounds` on the constructor rather than on fit() - verify
# against the installed version.
reg.fit(X_train, y_train,
          eval_set=[(X_valid, y_valid)],
          eval_metric="rmse",
          early_stopping_rounds=50,
          verbose=50
       )

# Hold-out predictions (recomputed as a DataFrame in the next cell).
valid_pred = reg.predict(X_valid)

In [27]:
# Hold-out predictions as a one-column frame aligned on the validation index.
valid_pred = pd.DataFrame(reg.predict(X_valid),columns = ["pred"], index = X_valid.index)
# `mse` is sklearn's mean_squared_error, so this prints the MSE, not the RMSE
# that was monitored during training.
print(mse(y_valid,valid_pred))

In [43]:
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt import gp_minimize

n_features = X_train.shape[1]

# Base estimator for the search. The tree count must be spelled `n_estimators`;
# any other spelling is forwarded as an unknown booster parameter and the
# default number of trees is used instead.
xg = xgb.XGBRegressor(
            objective='reg:squarederror',
            n_estimators = 50,
            learning_rate = 0.5,
            subsample  = 0.8,
            eval_metric = 'rmse',
            )


# Search space expressed with XGBoost parameter names. XGBRegressor has no
# max_features / min_samples_split / min_samples_leaf (those belong to sklearn
# trees); the closest XGBoost controls are used instead:
#   colsample_bytree - fraction of features sampled per tree,
#   gamma            - minimum loss reduction required to make a split,
#   min_child_weight - minimum sum of instance weight needed in a leaf.
space  = [Integer(2, 4, name='max_depth'),
          Integer(1, 4 ,name = "reg_lambda"),
          Real(1.0 / n_features, 1.0, name='colsample_bytree'),
          Real(0.0, 5.0, name='gamma'),
          Integer(1, 50, name='min_child_weight')
         ]


In [44]:
space

In [45]:
def baysian_opti(space,model):
    """Tune ``model`` over ``space`` with Gaussian-process optimisation.

    Each candidate is scored with 5-fold cross-validated mean squared error on
    the notebook-level ``X_valid`` / ``y_valid`` frames.

    Parameters
    ----------
    space : list of skopt dimensions
        Named search dimensions; each name must be a parameter of ``model``.
    model : estimator
        Template estimator. It is cloned for every candidate, so the object
        passed in is not modified.

    Returns
    -------
    dict
        Best value found for each dimension, keyed by dimension name.
    """
    from sklearn.base import clone

    @use_named_args(space)
    def objective(**params):
        # Apply the sampled hyper-parameters; without this every call would
        # score the same unchanged model.
        candidate = clone(model).set_params(**params)
        return -np.mean(cross_val_score(candidate,X_valid,y_valid,cv = 5,n_jobs = -1,
                                       scoring = "neg_mean_squared_error"))

    res_gp = gp_minimize(objective, space, n_calls=50, random_state=0)
    print("Best score=%.4f" % res_gp.fun)

    # res_gp.x lists the best values in the same order as `space`.
    return {dim.name: value for dim, value in zip(space, res_gp.x)}
    
    






In [None]:
best_param = baysian_opti(space,xg)

In [None]:
# Final XGBoost model. `best_param` holds the tuned values when the search
# returns them and is None otherwise, in which case only the fixed settings apply.
reg = xg = xgb.XGBRegressor(
            objective='reg:squarederror',
            n_estimators = 50,
            learning_rate = 0.5,
            subsample  = 0.8,
            eval_metric = 'rmse',
            **(best_param or {}),
            )

# The estimator has to be fitted before the next cell calls predict on it.
reg.fit(X_train, y_train)

In [None]:
# Rebuild the test feature matrix (same column selection as for training) and predict.
test_features = test.drop(columns=["row_id","x","y","year_month","week"]).set_index("time")
test_pred = reg.predict(test_features)

In [None]:
# Fill the submission template with the predictions and write it to disk.
submit = sample.assign(congestion=test_pred)
submit.to_csv("submission.csv", index = False)

## CatBoost

In [None]:
# Columns CatBoost should treat as categorical, given to Pool as positional
# indices into the feature matrix.
cat_feat = ["week_day","hours","month"]
cat_num = X_train.columns.get_indexer(cat_feat)
train_pool = Pool(X_train,y_train,cat_features = cat_num)
valid_pool =  Pool(X_valid,y_valid,cat_features = cat_num)


# RMSE regressor: up to 2000 iterations with a small learning rate. The
# commented-out arguments are options that are currently switched off.
CatB = CatBoostRegressor(
            iterations = 2000,
            learning_rate = 0.005,
            max_depth = 6,
#            subsample = 0.8,
            loss_function='RMSE',
#            min_child_samples = 2,
            one_hot_max_size = 6,
#            langevin = False,
            )

# Evaluate on the hold-out pool, log every 50 iterations and stop after 50
# iterations without improvement.
CatB.fit(train_pool,
          eval_set=valid_pool,
          early_stopping_rounds=50,
          verbose=50
       )

In [None]:
# Recompute the categorical column positions against the test matrix, wrap it in
# an unlabelled Pool and predict.
cat_num = X_test.columns.get_indexer(cat_feat)
test_pool = Pool(X_test, cat_features = cat_num)
test_pred = CatB.predict(test_pool)

In [None]:
# Fill the submission template with the CatBoost predictions and write it to disk.
submit = sample.assign(congestion=test_pred)
submit.to_csv("submission.csv", index = False)

## One model per road

In [None]:
from tqdm import tqdm 

In [None]:
col_to_drop = ["row_id","congestion","x","y","direction",
                      "year_month","week","year"]
def create_data(train,valid,test,road_name):
    """Build the model inputs for a single road.

    Rows whose ``road`` equals ``road_name`` are selected from each split. The
    columns listed in ``col_to_drop`` are removed and ``time`` becomes the index
    of every feature matrix.

    Returns
    -------
    list
        ``[X_train, y_train, id_train, X_valid, y_valid, id_valid, X_test, id_test]``
        where the ``id_*`` entries are the ``row_id`` columns of the selected rows.
    """
    def _rows_for_road(frame):
        return frame[frame["road"] == road_name]

    train_rows = _rows_for_road(train)
    valid_rows = _rows_for_road(valid)
    test_rows = _rows_for_road(test)

    # The test split carries no target, so "congestion" is left out of its drop list.
    test_drop = [name for name in col_to_drop if name != "congestion"]

    return [
        train_rows.drop(columns=col_to_drop).set_index("time"),
        train_rows.congestion,
        train_rows.row_id,
        valid_rows.drop(columns=col_to_drop).set_index("time"),
        valid_rows.congestion,
        valid_rows.row_id,
        test_rows.drop(columns=test_drop).set_index("time"),
        test_rows.row_id,
    ]


In [None]:
road_list = list(train.drop_duplicates("road").road.values)
MSE = []
road_predictions = []

# Train one XGBoost model per road and collect its test predictions.
# NOTE: the unpacking below rebinds the notebook-level X_train / y_train /
# X_valid / y_valid / X_test names to the data of the road processed last.
for road in tqdm(road_list):

    [X_train,y_train,id_train,X_valid,y_valid,id_valid,X_test,id_test] = create_data(train,valid,test,road)

    reg = xgb.XGBRegressor(n_estimators=500,
                    learning_rate = 0.01)

    reg.fit(X_train, y_train,
          eval_set=[(X_valid, y_valid)],
          eval_metric="rmse",
          early_stopping_rounds=50,
          verbose=0
       )

    # Hold-out MSE of this road's model.
    MSE.append(mse(y_valid,reg.predict(X_valid)))
    road_predictions.append(pd.DataFrame({"row_id":id_test,"congestion":reg.predict(X_test)}))

# Concatenate once at the end: growing a frame inside the loop copies all
# earlier rows on every iteration, and starting from an empty frame turns the
# columns into object dtype.
if road_predictions:
    prediction = pd.concat(road_predictions, axis = 0)
else:
    prediction = pd.DataFrame(columns = ["row_id","congestion"])


In [None]:
prediction = prediction.sort_values("row_id")
prediction

In [None]:
#prediction.to_csv("submission.csv", index = False)

## Deep Learning

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

In [None]:
# `train["road"]` was label-encoded to integers in an earlier cell, so comparing
# it with "01NB" matches no row. `road_test` was selected for that road before
# the encoding; restrict it to the training period to mirror the split applied to `train`.
train1 = road_test[road_test["time"] <= start_valid_date]

In [None]:
# The last 30 observations of the road are held out; each sequence is reshaped
# to a single-feature column as required by the scaler.
seq_train = train1.congestion.values[:len(train1) - 30].reshape(-1,1)
seq_valid = train1.congestion.values[len(train1) - 30:].reshape(-1,1)


from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range = (0, 1))
# Fit the scaling range on the training part only and reuse it for the held-out
# part, so both sequences are on the same scale and no hold-out information is used.
seq_train = sc.fit_transform(seq_train)
seq_valid = sc.transform(seq_valid)

In [None]:
def split_sequence(sequence, n_steps):
    """Cut ``sequence`` into sliding windows and their next-step targets.

    Window ``i`` is ``sequence[i:i + n_steps]`` and its target is
    ``sequence[i + n_steps]``; windows are produced for as long as the target
    index stays inside the sequence.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` with one row per window. Both are empty arrays when the
        sequence is not longer than ``n_steps``.
    """
    total = len(sequence)
    # Number of start positions whose target still exists; a negative value
    # yields an empty range.
    n_windows = min(total, total - n_steps)
    starts = range(n_windows)

    windows = [sequence[start:start + n_steps] for start in starts]
    targets = [sequence[start + n_steps] for start in starts]
    return np.array(windows), np.array(targets)

In [None]:
# Each sample is a window of `n_steps` past values used to predict the next one.
n_steps = 3
# The scaled training sequence is bound to `seq_train`; the visible cells define
# no `seq_scaled`.
X_train, y_train = split_sequence(seq_train, n_steps)
X_valid, y_valid = split_sequence(seq_valid, n_steps)

In [None]:
y_valid.shape

In [None]:
# Print every validation window next to the value it should predict.
for window, target in zip(X_valid, y_valid):
    print(window, target)

In [None]:
X_valid.shape

In [None]:
n_features = 1
# The LSTM expects input shaped (samples, n_steps, n_features). The windowed
# arrays are named X_train / X_valid (no plain `X` exists); -1 lets numpy infer
# the sample count, which also covers an empty array.
X_train = X_train.reshape((-1, n_steps, n_features))
X_valid = X_valid.reshape((-1, n_steps, n_features))

In [None]:
# Two stacked LSTM layers followed by a single-unit regression head.
regressor = Sequential()

# The first recurrent layer returns the full sequence so the second one can consume it.
regressor.add(LSTM(units = 100, return_sequences = True, input_shape = (n_steps, n_features)))
regressor.add(Dropout(0.1))
# The last recurrent layer returns only its final output, so the Dense head
# emits one value per sample, matching the (samples, 1) targets produced by
# split_sequence. Returning the whole sequence here would give one prediction
# per time step instead.
regressor.add(LSTM(units = 100, return_sequences = False))
regressor.add(Dropout(0.1))

regressor.add(Dense(units = 1))



In [None]:
regressor.compile(optimizer = 'adam', loss = 'mean_squared_error')

In [None]:
regressor.summary()

In [None]:
# Train for 50 epochs in batches of 32 and report the loss on the held-out
# windows after each epoch. shuffle = False keeps the samples in their original order.
regressor.fit(X_train, y_train,
              validation_data = (X_valid,y_valid),
              epochs = 50,
              batch_size = 32,
             shuffle = False)

In [None]:
pred_valid = regressor.predict(X_valid)