In [None]:
###### Basic packages
import os
from pathlib import Path
import pickle

#Data and visualisation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Validation Strategy
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold, train_test_split, KFold

#Encoding
import category_encoders as ce

#Imputation
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler


#Metrics
from sklearn.metrics import roc_auc_score, roc_curve, log_loss, mean_squared_error

#Pipeline
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA, NMF
from sklearn.manifold import TSNE
from umap import UMAP
from scipy.cluster.hierarchy import dendrogram, ward

import warnings
from sklearn.exceptions import DataConversionWarning
# Silence scikit-learn's DataConversionWarning for the whole notebook.
# (The identical call used to be issued twice; once is enough.)
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

# Shared plot styling and colour palette.
sns.set_theme(style = 'white', palette = 'viridis')
pal = sns.color_palette('viridis')

# Let pandas display up to 100 rows before truncating output.
pd.set_option('display.max_rows', 100)

In [None]:
#Regression Models

import xgboost as xgb
import lightgbm as lgb
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor, LassoCV
from sklearn.linear_model import PassiveAggressiveRegressor, ARDRegression, RidgeCV, ElasticNetCV
from sklearn.linear_model import TheilSenRegressor, RANSACRegressor, HuberRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor

from sklearn.cross_decomposition import PLSRegression
from catboost import CatBoost, CatBoostRegressor, CatBoostClassifier
from catboost import Pool

In [None]:
# Locations of the input files, the output directory and the target column name

# Competition data: the file names below are joined onto data_dir
data_dir = "/kaggle/input/playground-series-s3e15"
train_file, submission_file = "data.csv", "sample_submission.csv"
#test_file = "test.csv"

# Additional dataset, given as a full path
orig_file = "/kaggle/input/predicting-heat-flux/Data_CHF_Zhao_2020_ATE.csv"

# Directory used when reading pickled objects back
working_dir = "/kaggle/working/"

# Name of the column to predict
target = "target"

In [None]:
# test_data = Path(data_dir)/test_file
# train_data = Path(data_dir)/train_file
# submission_data = Path(data_dir)/submission_file

In [None]:
#train = pd.read_csv("/kaggle/input/playground-series-s3e15/data.csv")
# test = pd.read_csv(test_data)
# submission_df = pd.read_csv(submission_data)

In [None]:
def get_data(data_dir = data_dir,train_file = train_file,  submission_file = submission_file, orig_file = orig_file):
    """Read the three CSV files used by the notebook.

    Parameters
    ----------
    data_dir : str or Path
        Directory that contains `train_file` and `submission_file`.
    train_file : str
        Name of the main data CSV inside `data_dir`.
    submission_file : str
        Name of the sample-submission CSV inside `data_dir`.
    orig_file : str or Path
        Full path of the additional dataset CSV. Previously this was read
        from the module-level `orig_file`; it is now a parameter whose default
        is that same value, so existing calls behave as before.

    Returns
    -------
    tuple of pandas.DataFrame
        `(train, submission_df, orig_train)`, in that order.
    """
    base_dir = Path(data_dir)
    train = pd.read_csv(base_dir / train_file)
    orig_train = pd.read_csv(Path(orig_file))
    submission_df = pd.read_csv(base_dir / submission_file)
    return train, submission_df, orig_train
    

In [None]:
train, submission_df, orig_train = get_data()

In [None]:
train.info()

In [None]:
def create_EDA_summary (df = None):
    """Build a one-row-per-column overview of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns of `df`, with the fields
        "dtype" (column dtype), "NROW" (number of rows in `df`),
        "Unique_values" (distinct values, a missing value counts as one) and
        "Percent_missing" (share of null entries, in percent).
    """
    n_rows = df.shape[0]
    distinct_counts = pd.Series(
        [df[name].unique().shape[0] for name in df.columns],
        index = df.columns,
    )
    summary = pd.DataFrame(
        {
            "dtype": df.dtypes,
            "NROW": n_rows,
            "Unique_values": distinct_counts,
            "Percent_missing": (df.isnull().sum() / n_rows) * 100,
        }
    )
    return summary

In [None]:
create_EDA_summary(df = train)

In [None]:
train.head()

In [None]:
# Give the label column a short name that the rest of the notebook refers to.
train = train.rename(columns = {'x_e_out [-]': 'target'})

In [None]:
train.describe().T

In [None]:
#Flag every row: a missing target marks a row to be predicted ("Test"), all others are "Train"
train["Train/Test"] = np.where(train["target"].isna(), "Test", "Train")

In [None]:
train["Train/Test"].value_counts(normalize=True)

In [None]:
# Separate the rows by their flag; both selections are taken from the same
# un-split frame before either name is rebound.
test, train = (
    train.loc[train["Train/Test"].eq("Test")],
    train.loc[train["Train/Test"].eq("Train")],
)

In [None]:
train.shape


In [None]:
#Building a basic first model

# Index the training rows by their "id" column.
# The guard keeps this cell re-runnable: once "id" has become the index it is
# no longer a column, and an unguarded set_index("id") would raise KeyError.
if "id" in train.columns:
    train = train.set_index("id")
train.head(3)

In [None]:
#Start with defining the initial imputation strategy for each feature

In [None]:
train["author"].value_counts()
#use most frequent for author

In [None]:
train["geometry"].value_counts()
#Use most frequent for geometry

In [None]:
train.columns.to_list()

In [None]:
#Column groups, one per preprocessing route

# Categorical columns (imputed with the most frequent value)
most_freq = ['author', 'geometry']

# Numeric columns
num_features = [
    'pressure [MPa]', 'mass_flux [kg/m2-s]',
    'D_e [mm]', 'D_h [mm]',
    'length [mm]', 'chf_exp [MW/m2]',
]

In [None]:
# Candidate categorical encoders from category_encoders, one entry per line.
# NOTE(review): nothing in the visible cells reads this list - confirm it is still needed.
encoders = [
    ce.OneHotEncoder(),
    ce.CatBoostEncoder(drop_invariant = True, return_df = True),
    ce.OrdinalEncoder(drop_invariant = True),
    ce.TargetEncoder(drop_invariant = True),
    ce.WOEEncoder(
        verbose = 0,
        cols = None,
        drop_invariant = False,
        return_df = True,
        handle_unknown = 'value',
        handle_missing = 'value',
        random_state = None,
        randomized = False,
        sigma = 0.05,
        regularization = 1.0,
    ),
]

In [None]:
#Feature engineering steps for cross-validation

# Categorical route: fill gaps with the most frequent value, then one-hot encode.
most_freq_pipe = Pipeline([
    ('most_freq', SimpleImputer(strategy = "most_frequent")),
    ("one_hot", ce.OneHotEncoder()),
])

# Numeric route: fill gaps with the column mean, then standardise.
# (Step name corrected from the misspelt "standard_scsaler".)
num_mean_imputation = Pipeline([
    ('mean', SimpleImputer(strategy = "mean")),
    ("standard_scaler", StandardScaler()),
])

#Preprocessing pipeline: each route is applied to its own list of columns
preprocessor = ColumnTransformer([
    ('most_freq', most_freq_pipe, most_freq),
    ('mean_impute', num_mean_imputation, num_features),
])

In [None]:
#Candidate regressors keyed by a short label; each is built with its library defaults
models = dict(
    xgb = xgb.XGBRegressor(),
    lgb = lgb.LGBMRegressor(),
    cat = CatBoostRegressor(),
)

# Disabled candidates, kept for reference (they rely on parameter dicts and a
# `self` object that are not defined in the visible cells):
#   "xgb_exact": xgb.XGBRegressor(**xgb_exact_params),
#   "xgb_approx": xgb.XGBRegressor(**xgb_approx_params),
#   "lgb2": lgb.LGBMRegressor(**lgb2_params),
#   "lgb3": lgb.LGBMRegressor(**lgb3_params),
#   "cat2": CatBoostRegressor(**cb2_params),
#   "cat3": CatBoostRegressor(**cb3_params),
#   "cat_sym": CatBoostRegressor(**cb_sym_params),
#   "cat_loss": CatBoostRegressor(**cb_loss_params),
#   "Ridge": RidgeCV(),
#   "Lasso": LassoCV(),
#   "RandomForestRegressor": RandomForestRegressor(n_estimators=200, random_state=self.random_state, n_jobs=-1),
#   "PLSRegression": PLSRegression(n_components=10, max_iter=2000),
#   "PassiveAggressiveRegressor": PassiveAggressiveRegressor(max_iter=3000, tol=1e-3, n_iter_no_change=30, random_state=self.random_state),
#   "GradientBoostingRegressor": GradientBoostingRegressor(n_estimators=2000, learning_rate=0.05, loss="absolute_error", random_state=self.random_state),
#   "HistGradientBoostingRegressor": HistGradientBoostingRegressor(max_iter=self.n_estimators, learning_rate=0.01, loss="absolute_error", n_iter_no_change=300,random_state=self.random_state),
#   "ARDRegression": ARDRegression(n_iter=1000),
#   "HuberRegressor": HuberRegressor(max_iter=3000),
#   "KNeighborsRegressor": KNeighborsRegressor(n_neighbors=5, n_jobs=-1)


In [None]:
#Cross-validation strategy: shuffled K-fold with a fixed seed
seed, splits = 42, 3
#cv = RepeatedStratifiedKFold(n_splits = splits, n_repeats = 5, random_state = seed)
cv = KFold(shuffle = True, n_splits = splits, random_state = seed)


In [None]:
#Feature frame (everything except the label and the Train/Test flag) and the label as a NumPy array
train_X = train.drop(columns = ["target", "Train/Test"])
train_y = train[target].to_numpy()

In [None]:
train_X 

In [None]:
type(train_y)

In [None]:
train_y.shape, train_X.shape

In [None]:
train_X.head()

In [None]:
#Cross-validation training loop

def cross_val_pipe(model, train_X = train_X, train_y = train_y, cv = cv, label = ''):
    """Cross-validate `model` behind the shared `preprocessor` and report RMSE.

    A pipeline of the module-level `preprocessor` followed by `model` is
    re-fitted on every fold produced by `cv.split(train_X)`.

    Parameters
    ----------
    model : estimator
        Regressor placed at the end of the pipeline.
    train_X : pandas.DataFrame
        Feature frame; rows are selected positionally with `.iloc`.
    train_y : array-like
        Targets, aligned positionally with `train_X`.
    cv : splitter
        Object exposing `split(X)` that yields (train_idx, val_idx) pairs.
    label : str
        Name shown in the printed summary.

    Returns
    -------
    train_predictions : numpy.ndarray
        For each row, the mean of the predictions made in the folds where the
        row was part of the fitting data. (Previously these were summed across
        folds and never averaged.)
    val_predictions : numpy.ndarray
        Out-of-fold predictions.
    train_score, val_score : list of float
        Per-fold RMSE on the fitting and validation parts.
    """
    # Size the buffers from the data actually passed in, not from the global `train`.
    n_rows = train_X.shape[0]
    y = np.asarray(train_y)  # guarantees positional indexing below

    train_predictions, val_predictions = np.zeros(n_rows), np.zeros(n_rows)
    times_fitted = np.zeros(n_rows)
    train_score, val_score = [], []

    pipe = make_pipeline(preprocessor, model)

    # Fit on each fold, predict both parts, and record the RMSE of each.
    for train_idx, val_idx in cv.split(train_X):

        pipe.fit(train_X.iloc[train_idx], y[train_idx])

        train_pred_ = pipe.predict(train_X.iloc[train_idx])
        val_pred_ = pipe.predict(train_X.iloc[val_idx])

        train_predictions[train_idx] += train_pred_
        times_fitted[train_idx] += 1
        val_predictions[val_idx] += val_pred_

        # sqrt(MSE) instead of `squared=False`, which newer scikit-learn
        # releases deprecate and later remove.
        train_score.append(np.sqrt(mean_squared_error(y[train_idx], train_pred_)))
        val_score.append(np.sqrt(mean_squared_error(y[val_idx], val_pred_)))

    # Turn the accumulated in-fold sums into per-row means.
    fitted_rows = times_fitted > 0
    train_predictions[fitted_rows] /= times_fitted[fitted_rows]

    print(f"The cross-validation score for {label} is {np.mean(val_score)}")
    print(f"The standard deviation for the RMSE is {np.std(val_score)}")
    return train_predictions, val_predictions, train_score, val_score

In [None]:
#model = xgb.XGBRegressor()
model = lgb.LGBMRegressor()

In [None]:
train_predictions, val_predictions, train_score, val_score = cross_val_pipe(model = model)

In [None]:
#Fine-tune a LGBM model

# Candidate values for manual tuning.
# num_leaf follows powers of two; the last entry used to read 2028, which
# breaks that sequence and is taken to be a typo for 2048.
num_leaf = [16,32,64,128,256, 512, 1024, 2048]
# NOTE(review): despite the name, these look like tree-depth candidates - confirm.
max_leaf = [4,5,6,7,8,9,10]
min_data_in_leaves_list = [20,40,60,80,100]

In [None]:
def manual_tune_pipe(model, train_X = train_X, train_y = train_y, params = None, cv = cv, label = ''):
    """Apply `params` to `model`, then cross-validate it and report RMSE.

    The parameters are set with `model.set_params`, which changes the estimator
    that was passed in; the caller's object keeps these settings afterwards.
    The estimator is then placed behind the module-level `preprocessor` and
    re-fitted on every fold produced by `cv.split(train_X)`.

    Parameters
    ----------
    model : estimator
        Regressor to configure and evaluate.
    train_X : pandas.DataFrame
        Feature frame; rows are selected positionally with `.iloc`.
    train_y : array-like
        Targets, aligned positionally with `train_X`.
    params : dict or None
        Keyword settings forwarded to `model.set_params`. `None` (the default)
        leaves the model untouched instead of failing on `**None`.
    cv : splitter
        Object exposing `split(X)` that yields (train_idx, val_idx) pairs.
    label : str
        Name shown in the printed summary.

    Returns
    -------
    train_predictions : numpy.ndarray
        For each row, the mean of the predictions made in the folds where the
        row was part of the fitting data. (Previously these were summed across
        folds and never averaged.)
    val_predictions : numpy.ndarray
        Out-of-fold predictions.
    train_score, val_score : list of float
        Per-fold RMSE on the fitting and validation parts.
    """
    params = {} if params is None else params

    # Size the buffers from the data actually passed in, not from the global `train`.
    n_rows = train_X.shape[0]
    y = np.asarray(train_y)  # guarantees positional indexing below

    train_predictions, val_predictions = np.zeros(n_rows), np.zeros(n_rows)
    times_fitted = np.zeros(n_rows)
    train_score, val_score = [], []

    model = model.set_params(**params)
    pipe = make_pipeline(preprocessor, model)

    # Fit on each fold, predict both parts, and record the RMSE of each.
    for train_idx, val_idx in cv.split(train_X):

        pipe.fit(train_X.iloc[train_idx], y[train_idx])

        train_pred_ = pipe.predict(train_X.iloc[train_idx])
        val_pred_ = pipe.predict(train_X.iloc[val_idx])

        train_predictions[train_idx] += train_pred_
        times_fitted[train_idx] += 1
        val_predictions[val_idx] += val_pred_

        # sqrt(MSE) instead of `squared=False`, which newer scikit-learn
        # releases deprecate and later remove.
        train_score.append(np.sqrt(mean_squared_error(y[train_idx], train_pred_)))
        val_score.append(np.sqrt(mean_squared_error(y[val_idx], val_pred_)))

    # Turn the accumulated in-fold sums into per-row means.
    fitted_rows = times_fitted > 0
    train_predictions[fitted_rows] /= times_fitted[fitted_rows]

    print(f"The cross-validation score for {label} is {np.mean(val_score)}")
    # The message used to say "number of leaves" although the whole dict is shown.
    print(f"The tuned parameters are {params}")
    return train_predictions, val_predictions, train_score, val_score

In [None]:
# Sweep min_data_in_leaf over the candidate list while max_depth and num_leaves
# stay fixed. The loop variable is now actually passed to the model; before,
# every iteration re-fitted the same min_data_in_leaf=60 configuration.
tuning_scores = {}  # candidate value -> mean validation RMSE
for param in min_data_in_leaves_list:
    train_predictions, val_predictions, train_score, val_score = manual_tune_pipe(
        model = model,
        params = {"max_depth":8,"num_leaves":32,"min_data_in_leaf":param },
        label = "LGBM",
    )
    tuning_scores[param] = np.mean(val_score)

In [None]:
val_score

In [None]:

    # NOTE(review): this statement is indented although it is not inside any
    # block; it would be a syntax error if the cell were exported to a plain
    # script - consider dedenting. It runs manual_tune_pipe once with fixed
    # parameters (max_depth=8, num_leaves=32, min_data_in_leaf=60).
    train_predictions, val_predictions, train_score, val_score = manual_tune_pipe(model = model, params = {"max_depth":8,"num_leaves":32,"min_data_in_leaf":60 }, label = "LGBM")

In [None]:
np.mean(val_score)

In [None]:
#Optuna - Hyperparameter

#LGBM Optuna
def objective(trial):
    """Optuna objective: hold-out RMSE of a LightGBM pipeline for one trial.

    Samples a LightGBM configuration from `trial`, fits it behind the
    module-level `preprocessor` on 75% of `train_X`/`train_y`, and scores it on
    the remaining 25%.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        RMSE on the held-out part (lower is better).
    """
    # One seed for both the model and the split, so every trial is scored on
    # the same hold-out rows and differences come from the parameters alone.
    random_state = 48

    lgb_params = {
        # Fixed for every trial. These are not part of `trial.params`.
        'metric': 'rmse',
        'random_state': random_state,
        'n_estimators': 10000,
        # Sampled. suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log = True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log = True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [4,6,10,15]),
        'num_leaves' : trial.suggest_int('num_leaves', 10, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        # The suggestion name must equal the model keyword: it becomes the key
        # in `study.best_params`, which is later unpacked into LGBMRegressor.
        # It used to be recorded as 'min_data_per_groups'.
        'cat_smooth' : trial.suggest_int('cat_smooth', 1, 100)
    }

    # `train_X` / `train_y` are only read here, so no `global` statement is needed.
    train_x, valid_x, y_train, y_valid = train_test_split(
        train_X, train_y, test_size=0.25, shuffle = True, random_state = random_state
    )

    model = lgb.LGBMRegressor(**lgb_params)
    pipe = make_pipeline(preprocessor, model)

    # All n_estimators rounds are trained; no early stopping is configured.
    pipe.fit(train_x, y_train)
    val_pred = pipe.predict(valid_x)

    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases
    # deprecate and later remove.
    val_metric = np.sqrt(mean_squared_error(y_valid, val_pred))

    return val_metric

In [None]:
import optuna
from optuna.integration import CatBoostPruningCallback

# Seed the sampler (with the notebook-wide `seed`) so that a fresh
# Restart & Run All proposes the same sequence of trials.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study.optimize(objective, n_trials=40)

print(f"Number of finished trials: {len(study.trials)}")

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
lgb_best_params = study.best_params

In [None]:
def pickle_save(object = None, file_name= "lgbm_params"):
    """Pickle `object` into the file '<file_name>.pkl'.

    Parameters
    ----------
    object : any picklable value
        Value to serialise.
    file_name : str
        Path without extension; '.pkl' is appended. A relative path is
        resolved against the current working directory.
    """
    destination = file_name + '.pkl'
    # Binary mode: pickle produces bytes, not text.
    with open(destination, mode = 'wb') as handle:
        pickle.Pickler(handle).dump(object)

In [None]:
def pickle_load(file_name= "Dummy.pkl", variable_name = "xxx"):
    """Load a pickled object from `working_dir`, print it, and return it.

    Parameters
    ----------
    file_name : str
        File name (including extension) inside the module-level `working_dir`.
    variable_name : str
        Unused; kept only so existing calls keep working.

    Returns
    -------
    object
        The unpickled value. Previously it was bound to a local name and
        lost, so callers had no way to use what was loaded.
    """
    # SECURITY: unpickling can execute arbitrary code - only load files you created.
    with open(Path(working_dir)/file_name, 'rb') as f:  # binary mode: pickle data is bytes
        loaded = pickle.load(f)
    print(loaded)
    return loaded

In [None]:
#Save the best hyperparameters found by the Optuna study
pickle_save(object = lgb_best_params, file_name = "lgbm_best_params_v2")

In [None]:
# `study.best_params` holds only the values suggested by the trial. The settings
# that the objective fixes for every trial (metric, random_state, n_estimators)
# are not in it and must be re-applied, otherwise the final model silently
# falls back to LightGBM's default number of trees.
final_params = {"metric": "rmse", "random_state": 48, "n_estimators": 10000, **lgb_best_params}
# Studies that recorded cat_smooth under the suggestion name
# "min_data_per_groups" are mapped back to the real keyword.
if "min_data_per_groups" in final_params:
    final_params["cat_smooth"] = final_params.pop("min_data_per_groups")
final_model = lgb.LGBMRegressor(**final_params)

In [None]:
# Call the method so the cell shows the parameter dict rather than the bound-method repr.
final_model.get_params()

In [None]:
# Cross-validate the Optuna-tuned `final_model` (not the manually tuned `model`),
# and keep the results in variables instead of echoing the arrays as cell output.
final_train_predictions, final_val_predictions, final_train_score, final_val_score = cross_val_pipe(
    model = final_model, label = "LGBM (Optuna)"
)

In [None]:
#final_data = lgb.Dataset(train_X, train_y)

In [None]:
final_pipeline = make_pipeline(preprocessor,final_model)

In [None]:
final_pipeline

In [None]:
final_pipeline_fitted = final_pipeline.fit(train_X, train_y)

In [None]:
# Rows to predict: drop the flag and the (empty) target, and index by "id" like the training frame.
test_final = test.drop(columns = ["Train/Test", "target"]).set_index("id")

In [None]:
predictions = pd.DataFrame(final_pipeline_fitted.predict(test_final))

In [None]:
index = pd.DataFrame(test_final.index)

In [None]:
sub = pd.concat([ index, predictions], axis="columns")
sub.columns = ["id","x_e_out [-]"]

In [None]:
sub.to_csv("submission_lgb_opt_25.csv", index = False) 