In [None]:
import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    found = [os.path.join(root, name) for name in names]
    if found:
        print(*found, sep='\n')

In [None]:
import numpy as np 
import pandas as pd
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  StandardScaler
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [None]:
# All three competition files live in the same dataset directory.
DATA_DIR = '../input/tabular-playground-series-aug-2021'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Keep the regression target aside, then strip the identifier and target
# columns so that only predictor columns remain in `train` and `test`.
# The drops happen in place, so this cell cannot be re-run without reloading the data.
y = train['loss']
train.drop(columns=['id', 'loss'], inplace=True)
test.drop(columns=['id'], inplace=True)

In [None]:
# Every column of `train` that is not an identifier or the target is a feature.
not_features = ['id', 'loss']
features = [col for col in train.columns if col not in not_features]

In [None]:
# Learn the scaling statistics from the training features only, then apply
# the same fitted scaler to both the training and the test features.
scaler = StandardScaler()
scaler.fit(train[features])
train[features] = scaler.transform(train[features])
test[features] = scaler.transform(test[features])

In [None]:
# `X` is an alias of the `train` frame (same object, not a copy).
X = train

In [None]:
# Fit a 42-component LDA on the full training data, using the target values
# as class labels, and keep the projected features.
lda = LDA(solver='svd', n_components=42)
X_lda = lda.fit(X, y).transform(X)

# Report the share of variance explained by each discriminant component.
EVR = lda.explained_variance_ratio_
for rank, ratio in enumerate(EVR, start=1):
    pct = np.round(ratio * 100, 2)
    print("Component {}: {}% var".format(rank, pct))

In [None]:
def objective(trial):
    """Optuna objective for XGBRegressor: hold-out MSE on LDA-projected features.

    Splits the module-level ``X`` / ``y`` 70/30 with a fixed seed, fits a
    40-component LDA on the training split, projects both splits with that
    same fitted LDA, trains an ``XGBRegressor`` with the hyper-parameters
    suggested by ``trial`` and returns the mean squared error on the hold-out
    split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean squared error of the model on the hold-out split (lower is better).
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # The projection is learned from the training split only and then applied
    # unchanged to the hold-out split. Refitting on the hold-out split would
    # use its labels and put the two splits in different feature spaces.
    lda = LDA(n_components=40, solver='svd')
    X_train = lda.fit_transform(X_train, y_train)
    X_test = lda.transform(X_test)

    param = {
        'max_depth': trial.suggest_int('max_depth', 6, 10),
        'n_estimators': trial.suggest_int('n_estimators', 400, 7000, step=200),
        'eta': trial.suggest_float('eta', 0.007, 0.013),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 0.9, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 0.9, 0.1),
        'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.2, 0.9, 0.1),
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-4, 1e4), # I've had trouble with LB score until tuning this.
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-4, 1e4),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-4, 1e4), # L1 regularization
        'gamma': trial.suggest_loguniform('gamma', 1e-4, 1e4),
    }
    model = XGBRegressor(**param, tree_method='gpu_hist')
    # The hold-out split doubles as the early-stopping evaluation set.
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=100, verbose=False)
    y_preds = model.predict(X_test)
    return mean_squared_error(y_test, y_preds)

In [None]:
OPTUNA_OPTIMIZATION = True

# Search the XGBoost hyper-parameters by minimising the objective over 25 trials.
study = optuna.create_study(direction="minimize")
study.optimize(objective, show_progress_bar=True, n_trials=25)

n_finished = len(study.trials)
best_xgb_trial = study.best_trial
print('Number of finished trials:', n_finished)
print('Best trial: score {}, params {}'.format(best_xgb_trial.value, best_xgb_trial.params))

In [None]:
# Hyper-parameters of the best XGBoost trial.
study.best_params

In [None]:
from catboost import CatBoostRegressor

In [None]:
def objective_cat(trial):
    """Optuna objective for CatBoostRegressor: hold-out MSE on LDA-projected features.

    Splits the module-level ``X`` / ``y`` 70/30 with a fixed seed, fits a
    40-component LDA on the training split, projects both splits with that
    same fitted LDA, trains a ``CatBoostRegressor`` with the hyper-parameters
    suggested by ``trial`` and returns the mean squared error on the hold-out
    split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean squared error of the model on the hold-out split (lower is better).
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # The projection is learned from the training split only and then applied
    # unchanged to the hold-out split. Refitting on the hold-out split would
    # use its labels and put the two splits in different feature spaces.
    lda = LDA(n_components=40, solver='svd')
    X_train = lda.fit_transform(X_train, y_train)
    X_test = lda.transform(X_test)

    param = {
        'iterations': trial.suggest_int('iterations', 400, 7000, step=200),
        # Each hyper-parameter needs its own trial name: reusing
        # "colsample_bylevel" here would tie the learning rate to that value
        # and leave it out of the study's best parameters.
        'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.1),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "subsample": trial.suggest_float('subsample', 0.5, 0.95),
        "depth": trial.suggest_int("depth", 1, 15),
        'loss_function': 'RMSE',
    }
    model = CatBoostRegressor(**param)
    # The hold-out split doubles as the early-stopping evaluation set.
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=100, verbose=False)
    y_preds = model.predict(X_test)
    return mean_squared_error(y_test, y_preds)

In [None]:
OPTUNA_OPTIMIZATION = True

# Search the CatBoost hyper-parameters by minimising the objective over 25 trials.
study_cat = optuna.create_study(direction="minimize")
study_cat.optimize(objective_cat, show_progress_bar=True, n_trials=25)

n_finished_cat = len(study_cat.trials)
best_cat_trial = study_cat.best_trial
print('Number of finished trials:', n_finished_cat)
print('Best trial: score {}, params {}'.format(best_cat_trial.value, best_cat_trial.params))

In [None]:
# Hyper-parameters of the best CatBoost trial.
study_cat.best_params

In [None]:
from sklearn.linear_model import SGDRegressor

In [None]:
def objective_sgd(trial):
    """Optuna objective for SGDRegressor: hold-out MSE on LDA-projected features.

    Splits the module-level ``X`` / ``y`` 70/30 with a fixed seed, fits a
    40-component LDA on the training split, projects both splits with that
    same fitted LDA, trains an ``SGDRegressor`` with the hyper-parameters
    suggested by ``trial`` and returns the mean squared error on the hold-out
    split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean squared error of the model on the hold-out split (lower is better).
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # The projection is learned from the training split only and then applied
    # unchanged to the hold-out split. Refitting on the hold-out split would
    # use its labels and put the two splits in different feature spaces.
    lda = LDA(n_components=40, solver='svd')
    X_train = lda.fit_transform(X_train, y_train)
    X_test = lda.transform(X_test)

    param = {
        'eta0': trial.suggest_float('eta0', 0.001, 0.013),
        'alpha': trial.suggest_loguniform('alpha', 1e-4, 1),
        'max_iter': trial.suggest_int('max_iter', 400, 7000, step=200),
    }
    model = SGDRegressor(**param)
    model.fit(X_train, y_train)
    y_preds = model.predict(X_test)
    return mean_squared_error(y_test, y_preds)

In [None]:
OPTUNA_OPTIMIZATION = True

# Search the SGD hyper-parameters by minimising the objective over 25 trials.
study_sgd = optuna.create_study(direction="minimize")
study_sgd.optimize(objective_sgd, show_progress_bar=True, n_trials=25)

n_finished_sgd = len(study_sgd.trials)
best_sgd_trial = study_sgd.best_trial
print('Number of finished trials:', n_finished_sgd)
print('Best trial: score {}, params {}'.format(best_sgd_trial.value, best_sgd_trial.params))

In [None]:
# Hyper-parameters of the best SGD trial.
study_sgd.best_params

In [None]:
from sklearn.ensemble import StackingRegressor
# ElasticNetCV is the meta-learner passed as `final_estimator` to the stacking model.
from sklearn.linear_model import ElasticNetCV

In [None]:
# ElasticNetCV is the stacking meta-learner used in this cell; importing it here
# keeps the cell from depending on another cell to bring the name into scope.
from sklearn.linear_model import ElasticNetCV

# Number of CV folds; also the divisor used to average the per-fold test predictions.
N_SPLITS = 10

# Work on copies of the tuned parameters so that adding `tree_method` does not
# modify the dictionary held by the trial object.
xgb_params = dict(study.best_trial.params)
xgb_params['tree_method'] = 'gpu_hist'
cat_params = dict(study_cat.best_trial.params)
sgd_params = dict(study_sgd.best_trial.params)

# NOTE(review): the Optuna objectives tuned these parameters on 40-component LDA
# projections, while this loop trains on the scaled features directly — confirm
# that this is intended.
test_preds = None

print("\033[93mTraining........")

kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
for fold, (tr_index, val_index) in enumerate(kf.split(X.values, y.values)):

    print("⁙" * 10)
    print(f"Fold {fold + 1}")

    X_train, X_val = X.values[tr_index], X.values[val_index]
    y_train, y_val = y.values[tr_index], y.values[val_index]

    # Fresh base learners for every fold, blended by an ElasticNetCV meta-learner.
    xgb = XGBRegressor(**xgb_params)
    cat = CatBoostRegressor(**cat_params)
    sgd = SGDRegressor(**sgd_params)
    model = StackingRegressor(estimators=[('cb', cat), ('xgb', xgb), ('sgd', sgd)],
                              final_estimator=ElasticNetCV(),
                              cv=5,
                              n_jobs=-1)
    model.fit(X_train, y_train)

    # Fold RMSE on the held-out part of the training data.
    val_preds = model.predict(X_val)
    print(np.sqrt(mean_squared_error(y_val, val_preds)))

    # Accumulate the test predictions; they are averaged after the loop.
    fold_test_preds = model.predict(test.values)
    if test_preds is None:
        test_preds = fold_test_preds
    else:
        test_preds += fold_test_preds

print("-" * 50)
print("\033[95mTraining Done")

test_preds /= N_SPLITS

In [None]:
# Display the fold-averaged test predictions.
test_preds

In [None]:
# Overwrite the `loss` column of the sample submission with the model predictions.
submission['loss']=test_preds

In [None]:
# Display the submission frame before writing it out.
submission

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
submission.to_csv('submission_xgb_7.csv', index=False)