In [None]:
!pip install catboost

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 
import seaborn as sns

In [None]:
# Read the competition CSVs. Train and test use their `row_id` column as the
# index; the sample submission is read with the default index.
train = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/train.csv',
    index_col='row_id',
)
test = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/test.csv',
    index_col='row_id',
)
submission = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/sample_submission.csv'
)

In [None]:
# Preview the first rows of the training frame.
train.head()

## Preprocessing: time features

In [None]:
import holidays

def create_time_features(df, time_col):
    """Add calendar and clock features derived from a timestamp column.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``time_col``.
    time_col : str
        Name of the timestamp column. It is parsed with ``pd.to_datetime``
        and overwritten with the parsed values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with these columns added: ``week``, ``year``,
        ``quarter``, ``day``, ``dayofyear``, ``weekend``, ``weekday``,
        ``month``, ``hour``, ``minute``, ``is_month_start``,
        ``is_month_end``, ``is_afternoon`` and ``holiday``.
    """
    df[time_col] = pd.to_datetime(df[time_col])
    stamps = df[time_col].dt

    # `Series.dt.week` was removed in pandas 2.0. `isocalendar().week` yields
    # the ISO week number as a nullable UInt32, so cast to a plain integer.
    df['week'] = stamps.isocalendar().week.astype('int64')
    df['year'] = stamps.year
    df['quarter'] = stamps.quarter
    df['day'] = stamps.day
    df['dayofyear'] = stamps.dayofyear
    df['weekend'] = stamps.weekday >= 5  # boolean: Saturday (5) or Sunday (6)
    df['weekday'] = stamps.weekday
    df['month'] = stamps.month
    df['hour'] = stamps.hour
    df['minute'] = stamps.minute
    # These three previously read the hard-coded 'time' column and ignored
    # `time_col`; they now follow the argument like every other feature.
    df['is_month_start'] = stamps.is_month_start.astype('int')
    df['is_month_end'] = stamps.is_month_end.astype('int')
    # Strictly after the 12 o'clock hour: 12:xx is NOT flagged as afternoon.
    df['is_afternoon'] = (stamps.hour > 12).astype('int')

    # The holiday calendar used to be built and then discarded, leaving
    # `holiday` constantly 0. Build it once for the distinct years present and
    # flag rows whose calendar date is a US holiday.
    holiday_calendar = holidays.US(years=stamps.year.unique().tolist())
    holiday_dates = pd.to_datetime(list(holiday_calendar.keys()))
    df['holiday'] = stamps.normalize().isin(holiday_dates).astype('int')
    return df

In [None]:
# Add the time-derived columns to both frames (each frame is modified in place).
# The second call is the cell's last expression, so the test frame is displayed.
create_time_features(train, 'time')
create_time_features(test, 'time')

In [None]:
def calculate_distances(df, x_points=range(3), y_points=range(4)):
    """Add Euclidean distances from each row's (x, y) to a grid of points.

    For every pair ``(gx, gy)`` drawn from ``x_points`` x ``y_points`` a
    column named ``dist_to_{gx}{gy}`` is added. The frame is modified in
    place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric ``x`` and ``y`` columns.
    x_points, y_points : iterable of numbers, optional
        Grid coordinates along each axis. The defaults (0..2 and 0..3)
        reproduce the original twelve columns ``dist_to_00`` ..
        ``dist_to_23`` in the same order.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the distance columns added.
    """
    # Materialise once so a one-shot iterator still works for every gx.
    y_points = tuple(y_points)
    for gx in x_points:
        for gy in y_points:
            df[f'dist_to_{gx}{gy}'] = np.sqrt(
                np.square(df['x'] - gx) + np.square(df['y'] - gy)
            )
    return df

In [None]:
# Add the grid-distance columns to both frames (each frame is modified in place).
# The second call is the cell's last expression, so the test frame is displayed.
calculate_distances(train)
calculate_distances(test)

In [None]:
# Direction code -> two-component step, each component in {-1, 0, 1}.
# Element 0 and element 1 are read separately by feature_engineering().
dir_mapper = {
    'EB': [1, 0],
    'NB': [0, 1],
    'SB': [0, -1],
    'WB': [-1, 0],
    'NE': [1, 1],
    'SW': [-1, -1],
    'NW': [-1, 1],
    'SE': [1, -1],
}

In [None]:
def feature_engineering(df):
    """Add direction-step, time-of-day and concatenated-key features to ``df``.

    Reads the ``direction``, ``time`` (datetime), ``x``, ``y`` and ``hour``
    columns, plus the module-level ``dir_mapper`` lookup. The frame is
    modified in place and also returned.

    String keys are built by plain concatenation of the stringified parts
    with no separator.
    """
    # Numeric step components looked up per direction code (KeyError on an
    # unknown code, as dir_mapper is indexed directly).
    step_0 = df['direction'].map(lambda code: dir_mapper[code][0])
    step_1 = df['direction'].map(lambda code: dir_mapper[code][1])
    clock = df['time'].dt

    # Stringified building blocks, converted once and reused below.
    xs = df['x'].astype('str')
    ys = df['y'].astype('str')
    hours = df['hour'].astype('str')
    heading = df['direction'].astype('str')
    cell = xs + ys
    hour_heading = hours + heading

    # Insertion order here is the order in which columns are added to df.
    derived = {
        'converted_direction_coord_0': step_0,
        'converted_direction_coord_1': step_1,
        'hour+minute': clock.hour * 60 + clock.minute,  # minutes since midnight
        'x+y': cell,
        'x+y+direction': cell + heading,
        'x+y+direction_0': cell + step_0.astype('str'),
        'x+y+direction_1': cell + step_1.astype('str'),
        'hour+direction': hour_heading,
        'hour+x+y': hours + xs + ys,
        'hour+direction+x': hour_heading + xs,
        'hour+direction+y': hour_heading + ys,
        'hour+direction+x+y': hour_heading + xs + ys,
        'hour+x': hours + xs,
        'hour+y': hours + ys,
        'road': cell + df['direction'],
        'moment': clock.hour * 6 + clock.minute // 10,  # 10-minute slot of the day
    }
    for name, values in derived.items():
        df[name] = values
    return df

In [None]:
# Add the engineered columns to both frames (each frame is modified in place).
# The second call is the cell's last expression, so the test frame is displayed.
feature_engineering(train)
feature_engineering(test)

In [None]:
from math import pi

# Sine / cosine lookup per direction code. The entries equal sin(k*pi/4) and
# cos(k*pi/4) for k = 0..7 taken in the order NB, NE, EB, SE, SB, SW, WB, NW;
# the even-k entries are written as exact literals rather than computed.
sin_vals = {
    'NB': 0.0,
    'NE': np.sin(pi / 4),
    'EB': 1.0,
    'SE': np.sin(3 * pi / 4),
    'SB': 0.0,
    'SW': np.sin(5 * pi / 4),
    'WB': -1.0,
    'NW': np.sin(7 * pi / 4),
}

cos_vals = {
    'NB': 1.0,
    'NE': np.cos(pi / 4),
    'EB': 0.0,
    'SE': np.cos(3 * pi / 4),
    'SB': -1.0,
    'SW': np.cos(5 * pi / 4),
    'WB': 0.0,
    'NW': np.cos(7 * pi / 4),
}

# Attach both encodings to each frame; codes missing from the tables map to NaN.
for frame in (train, test):
    frame['sin'] = frame['direction'].map(sin_vals)
    frame['cos'] = frame['direction'].map(cos_vals)

In [None]:
# Stack train on top of test so the per-group statistics computed in the next
# cells are merged onto both sets at once.
all_df = pd.concat([train, test])

In [None]:
# Display the combined frame.
all_df

In [None]:
# Median congestion per (road, weekday, moment) group, truncated to int and
# left-joined back onto every row as a `median` column.
medians = (
    all_df.groupby(['road', 'weekday', 'moment'])['congestion']
    .median()
    .astype(int)
    .rename('median')
    .reset_index()
)
all_df = all_df.merge(medians, how='left', on=['road', 'weekday', 'moment'])

In [None]:
# Maximum congestion per (road, weekday, moment) group, cast to int and
# left-joined back onto every row as a `max` column.
maxs = (
    all_df.groupby(['road', 'weekday', 'moment'])['congestion']
    .max()
    .astype(int)
    .rename('max')
    .reset_index()
)
all_df = all_df.merge(maxs, how='left', on=['road', 'weekday', 'moment'])

In [None]:
# Minimum congestion per (road, weekday, moment) group, cast to int and
# left-joined back onto every row as a `min` column.
mins = (
    all_df.groupby(['road', 'weekday', 'moment'])['congestion']
    .min()
    .astype(int)
    .rename('min')
    .reset_index()
)
all_df = all_df.merge(mins, how='left', on=['road', 'weekday', 'moment'])

In [None]:
# Split the combined frame back into its train and test parts by position.
# The left-merges above must not have changed the row count, otherwise the
# positional split would be wrong.
assert len(all_df) == len(train) + len(test), "row count changed while merging group statistics"
# Take explicit copies: later cells add columns to these frames, and doing so
# on a slice of `all_df` triggers pandas' SettingWithCopyWarning.
# The right-hand side is evaluated in full before either name is rebound, so
# both slices use the original length of `train`.
train, test = all_df.iloc[:len(train)].copy(), all_df.iloc[len(train):].copy()

In [None]:
# Print column dtypes and non-null counts for the training frame.
train.info()

In [None]:
from sklearn.preprocessing import LabelEncoder 

def encode_categorical(train_df, test_df, categ_feat=None):
    """Label-encode categorical columns consistently across two frames.

    For each column in ``categ_feat`` a ``LabelEncoder`` is fitted on the
    values of that column from BOTH frames, so the same value gets the same
    code in train and test. The codes are stored in ``<col>_enc`` and the
    raw column is removed from the returned frames.

    The input frames are not modified.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames that both contain every column listed in ``categ_feat``.
    categ_feat : list of str, optional
        Columns to encode. Defaults to ``['direction']``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train and test frames with the raw columns replaced by their
        ``_enc`` counterparts (appended in ``categ_feat`` order).
    """
    if categ_feat is None:
        categ_feat = ['direction']
    categ_feat = list(categ_feat)

    # `drop` returns new frames, so the encoded columns are added to the
    # results rather than to the caller's objects.
    train_out = train_df.drop(columns=categ_feat)
    test_out = test_df.drop(columns=categ_feat)

    for col in categ_feat:
        print('Encoding: ', col, '...')
        encoder = LabelEncoder()
        # Fit on the function's own arguments. The previous version read the
        # notebook-level `train`/`test` globals here, ignoring what was passed.
        encoder.fit(pd.concat([train_df[col], test_df[col]]))
        train_out[col + '_enc'] = encoder.transform(train_df[col])
        test_out[col + '_enc'] = encoder.transform(test_df[col])

    return train_out, test_out

In [None]:
# Columns to label-encode: the raw direction code plus every concatenated
# string key built in feature_engineering().
categorical_features = [
    'direction',
    'x+y',
    'x+y+direction',
    'x+y+direction_0',
    'x+y+direction_1',
    'hour+direction',
    'hour+x+y',
    'hour+direction+x',
    'hour+direction+y',
    'hour+direction+x+y',
    'hour+x',
    'hour+y',
    'road',
]

# Rebind train/test to the encoded frames returned by the helper.
train, test = encode_categorical(train, test, categorical_features)

In [None]:
# Target column, the columns excluded from the model inputs, and the
# resulting feature list (every remaining training column, in frame order).
target_feature = 'congestion'
ignore = ['row_id', 'time', 'congestion', 'direction', 'year']
features = [column for column in train.columns if column not in ignore]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics  import mean_absolute_error
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import HuberRegressor
import optuna

In [None]:
# Random 80/20 hold-out split of the training rows (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    train[features],
    train[target_feature],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline: Huber regressor fitted on the hold-out training split.
# NOTE(review): `y_pred` from this cell is overwritten by the CatBoost cell
# below before any metric is computed on it.
model = HuberRegressor().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# CatBoost on the same hold-out split, optimising and reporting MAE on GPU.
model = CatBoostRegressor(
    task_type='GPU',
    n_estimators=4500,
    verbose=False,
    eval_metric='MAE',
    loss_function='MAE',
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# MAE of the most recent `y_pred` against the hold-out targets.
mean_absolute_error(y_test, y_pred)

In [None]:
# NOTE(review): rms1, rms2 and scores are not used in the visible cells; kept
# in case other code references them.
rms1, rms2 = [], []
# Per-fold fitted models: models1 = Huber, models2 = CatBoost, models3 = LightGBM.
models1, models2, models3 = [], [], []
scores = []

# NOTE(review): StratifiedKFold stratifies on the target values as if they
# were class labels - confirm this is intended for a regression target.
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
for fold, (trn_id, val_id) in enumerate(folds.split(train[features], train[target_feature])):
    # These rebind the notebook-level hold-out variables to the current fold.
    X_train, y_train = train[features].iloc[trn_id], train[target_feature].iloc[trn_id]
    X_test, y_test = train[features].iloc[val_id], train[target_feature].iloc[val_id]

    lgb = LGBMRegressor(n_estimators=500, device='gpu')
    lgb.fit(X_train, y_train)

    hub = HuberRegressor()
    hub.fit(X_train, y_train)

    ct = CatBoostRegressor(task_type='GPU', verbose=False, eval_metric='MAE',
                           loss_function='MAE', n_estimators=4500)
    ct.fit(X_train, y_train)

    # Was `XGBClassifier`, which is never imported (NameError on the first
    # fold) and is a classifier; the imported regressor matches the task.
    # NOTE(review): this model is fitted but neither stored nor scored below.
    xgb = XGBRegressor(tree_method='gpu_hist', n_estimators=4500)
    xgb.fit(X_train, y_train)

    models1.append(hub)
    models2.append(ct)
    models3.append(lgb)

    # Out-of-fold MAE for each stored model.
    valid_pred = hub.predict(X_test)
    valid_pred_lgb = lgb.predict(X_test)
    valid_pred_ct = ct.predict(X_test)
    score = mean_absolute_error(y_test, valid_pred)
    score_ct = mean_absolute_error(y_test, valid_pred_ct)
    score_lgb = mean_absolute_error(y_test, valid_pred_lgb)
    print("Fold:", fold  + 1, "MAE HUB: ", score ,  "MAE CAT: " , score_ct , "MAE LGB: ", score_lgb)

In [None]:
# Display the list of per-fold Huber models.
models1

In [None]:
# Average the per-fold models of each family on the test features.

# Huber models.
preds = []
for model in models1:
    pred = model.predict(test[features])
    preds.append(pred)
model1_pred = np.mean(preds, axis=0)

# CatBoost models.
preds = []
for model in models2:
    pred = model.predict(test[features])
    preds.append(pred)
model2_pred = np.mean(preds, axis=0)

# LightGBM models. The list must be emptied first: without this reset the
# CatBoost predictions collected above were averaged into `model3_pred` too.
preds = []
for model in models3:
    pred = model.predict(test[features])
    preds.append(pred)
model3_pred = np.mean(preds, axis=0)

In [None]:
# Display the fold-averaged Huber predictions.
model1_pred

In [None]:
# Display the fold-averaged CatBoost predictions.
model2_pred

In [None]:
# Single-model predictions from the last LightGBM fold model. This used to
# read the leftover loop variable `model`, which pointed at this same object
# only when the cells were run in order; the reference is now explicit.
# NOTE: the blended submission in the next cell overwrites this column.
pred = models3[-1].predict(test[features])
submission['congestion'] = pred
submission['congestion'] = submission['congestion'].round().astype(int)
# Sanity-check that the rounded predictions stay within 0..100.
assert (submission['congestion'] >= 0).all()
assert (submission['congestion'] <= 100).all()

In [None]:
# Weighted blend of the fold-averaged predictions:
# 0.2 * Huber (model1) + 0.2 * CatBoost (model2) + 0.6 * LightGBM (model3).
blended = 0.2*model1_pred + 0.2 * model2_pred + 0.6 * model3_pred
submission['congestion'] = blended
submission['congestion'] = submission['congestion'].round().astype(int)
# Sanity-check that the rounded predictions stay within 0..100.
assert (submission['congestion'] >= 0).all()
assert (submission['congestion'] <= 100).all()

In [None]:
# Display the submission frame.
submission

In [None]:
# Write the submission to `sub2.csv` without the DataFrame index.
submission.to_csv('sub2.csv',index=False)