In [None]:
#%%capture
#!pip install pycaret[full]

In [None]:
# Intel® Extension for Scikit-learn installation:
!pip install scikit-learn-intelex

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import random
import time
import os
from pathlib import Path

from tqdm.notebook import tqdm

from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, KFold, train_test_split, cross_val_score
from sklearn.metrics import log_loss, accuracy_score, mean_absolute_error, r2_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier
from scipy.stats import mode
import lightgbm as lgb

from xgboost import XGBClassifier, XGBRegressor 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to shrink its memory footprint.

    The frame is modified in place and also returned so the function can be
    used with ``DataFrame.pipe``.

    Per-column behaviour:
      * object / pandas string columns -> ``category``
      * signed integer columns         -> smallest of int8/16/32/64 holding
                                          the observed min and max
      * float columns                  -> float16 or float32 when the observed
                                          range fits, otherwise float64
      * anything else (bool, datetime, category, unsigned or nullable
        extension dtypes) is left untouched, because casting those to a float
        type would either corrupt the values or raise.

    Note: float16 keeps only about three significant decimal digits, so the
    downcast trades precision for memory.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Text columns become categoricals.
        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy signed-int / float columns are safe to downcast by
        # value range; everything else is skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            # Inclusive bounds: a value equal to the type's limit still fits.
            # An empty column yields NaN min/max, no candidate matches and the
            # column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit (or is all-NaN / infinite): use float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a zero-sized frame cannot produce NaN output.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

# **Import**

In [None]:
# Directory holding the competition CSV files.
filepath = "/kaggle/input/tabular-playground-series-mar-2022/"

# Each frame is read with row_id as its index, downcast by reduce_mem_usage,
# and only then has its "time" column parsed into datetimes.
train_df = reduce_mem_usage(pd.read_csv(filepath + "train.csv", index_col='row_id'))
train_df["time"] = pd.to_datetime(train_df["time"])

test_df = reduce_mem_usage(pd.read_csv(filepath + "test.csv", index_col='row_id'))
test_df["time"] = pd.to_datetime(test_df["time"])

In [None]:
train_df.describe()

# **Pre-Processing**

In [None]:
def new_date_features(df):
    """Expand the datetime column ``time`` into calendar and part-of-day features.

    The frame is modified in place: the feature columns are added, the
    original ``time`` column is dropped, and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime64 column named ``time``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the added columns year, quarter, month, week, day, hour,
        minute, weekday, day_of_year, day_of_month, is_weekend, is_friday,
        is_morning, is_afternoon, is_evening, is_night, and without ``time``.
    """
    stamps = df['time'].dt

    # Date features
    df['year'] = stamps.year
    df['quarter'] = stamps.quarter
    df['month'] = stamps.month
    # `Series.dt.week` was removed in pandas 2.0; isocalendar() gives the same
    # ISO week number. Cast back to a plain numpy dtype (float only when NaT
    # is present), matching what `.dt.week` used to return.
    iso_week = stamps.isocalendar().week
    df['week'] = iso_week.astype('float64' if iso_week.hasnans else 'int64')
    df['day'] = stamps.day
    df["hour"] = stamps.hour
    df["minute"] = stamps.minute
    df['weekday'] = stamps.weekday
    df['day_of_year'] = stamps.dayofyear
    # Despite the column name, this holds the number of days in the month.
    df['day_of_month'] = stamps.days_in_month
    df['is_weekend'] = np.where(df['weekday'].isin([5, 6]), 1, 0)
    df['is_friday'] = np.where(df['weekday'] == 4, 1, 0)

    # Time feature - Credits to https://www.kaggle.com/tariqchhussain/xgboost-optuna-k-fold-cv-tps-march-22
    # Each comparison is parenthesised: `&` binds tighter than `<=`, so the
    # unparenthesised form `6 <= hour & (hour < 12)` is not a range test.
    hour = df['hour']
    df['is_morning'] = np.where((hour >= 6) & (hour < 12), 1, 0)
    df['is_afternoon'] = np.where((hour >= 12) & (hour < 18), 1, 0)
    # NOTE(review): hour 23 is covered by neither is_evening nor is_night;
    # kept as in the credited notebook - confirm whether that is intended.
    df['is_evening'] = np.where((hour >= 18) & (hour < 23), 1, 0)
    df['is_night'] = np.where((hour >= 0) & (hour < 6), 1, 0)

    df.drop('time', axis=1, inplace=True)

    return df

In [None]:
def categorical_features(df):
    """Placeholder for one-hot encoding of the ``direction`` column.

    The encoding is currently disabled, so ``df`` is returned unchanged (the
    very same object). The disabled implementation, kept for reference,
    created one 0/1 column per distinct direction and then dropped
    ``direction``:

        for value in df.direction.unique():
            df['is_' + str(value)] = np.where(df['direction'] == str(value), 1, 0)
        df.drop('direction', axis=1, inplace=True)
    """
    return df


In [None]:
# Add categorical features (currently disabled)
#train_df = categorical_features(train_df)
#test_df  = categorical_features(test_df)

In [None]:
# Add date features: new_date_features expands the "time" column into
# calendar / part-of-day columns and drops "time" from each frame.
train_df = new_date_features(train_df)
test_df  = new_date_features(test_df)

In [None]:
train_df.head()

In [None]:
# Feature columns: every column except the target. row_id is the frame index
# (set at load time), so it is excluded here only defensively.
features = [e for e in train_df if e != "row_id" and e != "congestion"]

# Fit the encoder on the training data only, then reuse that exact mapping for
# the test set. Refitting on the test set could assign different integer codes
# to the same direction whenever the two sets do not contain the same labels;
# transform() raises instead if the test set has a direction unseen in train.
le = LabelEncoder()
train_df["direction"] = le.fit_transform(train_df["direction"])
test_df["direction"] = le.transform(test_df["direction"])

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
# Target column: the single column the model is trained to predict.
target = "congestion"

In [None]:
# Split the training frame into the feature matrix X and the target vector y.
X = train_df[features]
y = train_df[target]

# **MODEL-XGBOOST** - Credit to @tariqchhussain
https://www.kaggle.com/tariqchhussain/xgboost-optuna-k-fold-cv-tps-march-22

In [None]:
def get_models():
    """Return the estimators to train, keyed by display name.

    Reads the module-level ``xgb_params`` dict at call time, so that dict must
    be defined before this function is called.
    """
    regressor = XGBRegressor(gpu_id=-1, **xgb_params)
    return {'XGBRegressor': regressor}

In [None]:
## Best parameters
# Keyword arguments unpacked into XGBRegressor by get_models().
# The objective key and value are spelled the way XGBoost expects
# ('objective' / 'reg:squarederror'); a misspelled key is not recognised as
# the objective, so the setting would never be applied.
xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'n_estimators': 40000,
    'learning_rate': 0.15439200843275436,
    'subsample': 0.96,
    'colsample_bytree': 0.9700000000000001,
    'max_depth': 8,
    'booster': 'gbtree',
    'gamma': 10.8,
    'reg_lambda': 0.25369404126606065,
    'reg_alpha': 0.1488746063507415,
    'random_state': 42,
    'n_jobs': 4,
    'min_child_weight': 256,
}

In [None]:
# Random 80/20 hold-out split of the training data (fixed seed).
# NOTE(review): xtrain/xvalid/ytrain/yvalid are not referenced again in the
# visible notebook - the cross-validation loop builds its own folds. Confirm
# whether this cell is still needed.
xtrain,xvalid,ytrain,yvalid = train_test_split(X,y,test_size=0.2,random_state=21)

In [None]:
test_df

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# 10 ordered splits: each fold trains on an initial segment of the rows and
# validates on the rows that follow it.
# NOTE(review): this assumes the rows of X are in chronological order - confirm.
folds = TimeSeriesSplit(10)

# Running average of the test predictions over every (fold, model) pair.
preds = np.zeros(len(test_df))
# Validation MAE of every (fold, model) pair.
scores = []

print(" ** START OFF PREDICTIONS for base models ** ")
for fold, (idx_train, idx_valid) in enumerate(folds.split(X, y)):
    print(f" FOLD : {fold+1}")
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]
    print("X_train shape :",X_train.shape, "y_train shape :",y_train.shape, "X_valid shape :",X_valid.shape, "y_valid shape :",y_valid.shape )
    models = get_models()
    for name, model in models.items():
        # The evaluation metric ('mae') already comes from xgb_params via the
        # constructor; passing it to fit() as well is rejected by XGBoost
        # releases that treat it as a duplicated setting.
        model.fit(
            X_train,
            y_train,
            early_stopping_rounds=100,
            eval_set=[(X_valid, y_valid)],
            verbose=False
        )

        # NOTE: the same validation fold drives early stopping and the score
        # below, so the reported MAE is somewhat optimistic.
        predict_valid = model.predict(X_valid)
        score = mean_absolute_error(y_valid, predict_valid)
        scores.append(score)
        print(f"{name} : %.5f" % score)
        print('-'*40)
        # Select the training feature columns explicitly so the test matrix
        # always has the same columns, in the same order, as X. Divide by the
        # number of (fold, model) pairs so preds stays a true average even if
        # get_models() returns more than one model.
        preds += model.predict(test_df[features]) / (folds.n_splits * len(models))

In [None]:
# Build the submission from the sample file so the row ids and column layout
# match what the competition expects. The file is read from the same
# `filepath` directory as train/test instead of a second, cwd-relative path.
sub = pd.read_csv(filepath + "sample_submission.csv")
# Fail loudly rather than writing misaligned predictions.
assert len(sub) == len(preds), "prediction count does not match sample submission"
# Predictions are rounded to the nearest whole number before writing.
sub[target] = preds.round(decimals=0)
sub.to_csv('submission.csv', index=False)