# First Model and XGBoost Model of the Abalone Dataset
## Import Modules and Load Data

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import StratifiedShuffleSplit
from xgboost import XGBRegressor
import optuna

# Load the competition's train and test splits.
# The folder is named once so that moving the data only requires editing DATA_DIR.
DATA_DIR = '/home/tomruge/Schreibtisch/Data/Kaggle/playground-series-s4e4'
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')


## Preprocessing

In [20]:
# One-hot encode the categorical columns of both frames.
df_train_encoded = pd.get_dummies(df_train)
df_test_encoded = pd.get_dummies(df_test)

# Encoding each frame on its own can yield different dummy columns when a
# category is absent from one of them. Force the test frame to carry exactly
# the dummy columns created for the training frame: columns missing from the
# test frame are added as all-False, dummy columns the training frame never
# produced are dropped.
dummy_columns = [col for col in df_train_encoded.columns if col not in df_train.columns]
passthrough_columns = [col for col in df_test_encoded.columns if col in df_test.columns]
df_test_encoded = df_test_encoded.reindex(
    columns=passthrough_columns + dummy_columns, fill_value=False
)

# Re-running this cell on already encoded frames is a no-op: no new dummy
# columns are created, so the reindex keeps the test frame unchanged.
df_train, df_test = df_train_encoded, df_test_encoded

In [21]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only adds a stray "None" to the cell output.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90615 entries, 0 to 90614
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              90615 non-null  int64  
 1   Length          90615 non-null  float64
 2   Diameter        90615 non-null  float64
 3   Height          90615 non-null  float64
 4   Whole weight    90615 non-null  float64
 5   Whole weight.1  90615 non-null  float64
 6   Whole weight.2  90615 non-null  float64
 7   Shell weight    90615 non-null  float64
 8   Rings           90615 non-null  int64  
 9   Sex_F           90615 non-null  bool   
 10  Sex_I           90615 non-null  bool   
 11  Sex_M           90615 non-null  bool   
dtypes: bool(3), float64(7), int64(2)
memory usage: 6.5 MB


None

## Model Class Definition

In [38]:


class model_class:
    """XGBoost regressor wrapper that averages predictions over stratified shuffle splits.

    The training frame is split repeatedly with ``StratifiedShuffleSplit``
    (stratified on the target column(s)); one ``XGBRegressor`` is trained per
    split and the per-split predictions for the test frame are averaged.

    Attributes:
        df_train: Training frame with the ``drop`` columns removed (still
            contains the target column(s)).
        df_test: Test frame with the ``drop`` columns removed.
        target: Column name(s) of the regression target.
        drop: Column names removed from both frames.
    """

    def __init__(self, df_train, df_test, target=None, drop=None):
        """Store the frames with the ``drop`` columns removed.

        Args:
            df_train: Training data including the target column(s).
            df_test: Data to predict on.
            target: Target column name(s); defaults to an empty list.
            drop: Columns to remove from both frames; defaults to an empty list.
        """
        # None instead of a list literal as default: a list default is created
        # once and would be shared (and mutated) across all instances.
        self.target = [] if target is None else target
        self.drop = [] if drop is None else drop
        self.df_train = df_train.drop(columns=self.drop)
        self.df_test = df_test.drop(columns=self.drop)

    def feature_engineering(self):
        """Placeholder for feature engineering; currently does nothing."""
        pass

    def _features_and_labels(self):
        """Return the training features and labels, both cast to float64."""
        features = self.df_train.drop(columns=self.target).astype('float64')
        labels = self.df_train[self.target].astype('float64')
        return features, labels

    def _split_indices(self, n_splits, random_state):
        """Yield positional (train, validation) index arrays, stratified on the target."""
        splitter = StratifiedShuffleSplit(n_splits=n_splits, random_state=random_state)
        return splitter.split(self.df_train, self.df_train[self.target])

    def fit(self, params, n_splits=5, n_repeats=1):
        """Train one model per split and return the averaged test predictions.

        Args:
            params: Keyword arguments forwarded to ``XGBRegressor``.
            n_splits: Number of shuffle splits per repeat.
            n_repeats: Number of repeats; repeat ``r`` seeds the splitter with
                ``random_state=r``.

        Returns:
            1-D numpy array with one averaged prediction per row of ``df_test``.
        """
        features, labels = self._features_and_labels()
        test_features = self.df_test.astype('float64')
        # Allocated once, with one column per (repeat, split) pair.
        result = np.zeros((test_features.shape[0], n_repeats * n_splits))
        for repeat in range(n_repeats):
            for fold, (train_index, _) in enumerate(self._split_indices(n_splits, repeat)):
                model = XGBRegressor(**params)
                # The splitter yields positions, not labels, hence iloc.
                model.fit(features.iloc[train_index], labels.iloc[train_index])
                result[:, repeat * n_splits + fold] = model.predict(test_features)

        return np.mean(result, axis=1)

    def _validation_score(self, params, n_splits=5, random_state=0):
        """Return the mean RMSLE over the held-out part of each shuffle split.

        Args:
            params: Keyword arguments forwarded to ``XGBRegressor``.
            n_splits: Number of shuffle splits to evaluate.
            random_state: Seed for the splitter.
        """
        features, labels = self._features_and_labels()
        fold_scores = []
        for train_index, valid_index in self._split_indices(n_splits, random_state):
            model = XGBRegressor(**params)
            model.fit(features.iloc[train_index], labels.iloc[train_index])
            # log1p is undefined below -1; clip so a stray negative prediction
            # cannot turn the score into NaN.
            valid_pred = np.clip(model.predict(features.iloc[valid_index]), 0, None)
            fold_scores.append(
                self.root_mean_squared_log_error(labels.iloc[valid_index], valid_pred)
            )
        return float(np.mean(fold_scores))

    def root_mean_squared_log_error(self, y_true, y_pred):
        """Return sqrt(mean((log1p(y_true) - log1p(y_pred))**2)).

        Both inputs are flattened to 1-D float arrays first, so a
        single-column DataFrame can be compared with a plain 1-D array.
        """
        y_true = np.asarray(y_true, dtype='float64').ravel()
        y_pred = np.asarray(y_pred, dtype='float64').ravel()
        return np.sqrt(np.mean(np.square(np.log1p(y_true) - np.log1p(y_pred))))

    def objective(self, trial):
        """Optuna objective: sample XGBoost parameters and return their validation RMSLE.

        The score is computed on held-out training rows; test-set predictions
        have no labels to compare against.
        """
        # xgboost regressor parameters (starting search ranges; adjust as needed)
        xgbc_params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        }
        return self._validation_score(xgbc_params)

    def predict(self, params):
        """Alias for ``fit``: train on the splits and return averaged test predictions."""
        return self.fit(params)

    def find_params(self, n_trials=10, n_jobs=7):
        """Run an Optuna study minimising ``objective`` and return the best parameters.

        Args:
            n_trials: Number of trials to run.
            n_jobs: Number of parallel jobs passed to ``study.optimize``.
        """
        study = optuna.create_study(direction='minimize')
        study.optimize(self.objective, n_trials=n_trials, n_jobs=n_jobs)
        return study.best_params
    

In [39]:
# Train on the shuffle splits, predict the test set and write the submission file.
target_columns = ['Rings']
drop_columns = ['id']
model = model_class(df_train, df_test, target_columns, drop_columns)

# NOTE(review): an earlier comment here said n_estimators "was 445" and had been
# reduced to 50, but the value actually in use is 500 - TODO confirm the intent.
xgb_params = {
    'n_estimators': 500,
    'max_depth': 6,
    'learning_rate': 0.0116,
    'colsample_bytree': 1,
    'subsample': 0.6085,
    'min_child_weight': 9,
    'reg_lambda': 4.879e-07,
    'max_bin': 431,
    'n_jobs': -1,
    'eval_metric': 'mae',
    'objective': "reg:absoluteerror",
    'tree_method': 'hist',
    'verbosity': 0,
    'random_state': 42,
}

# make predictions
prediction = model.fit(params=xgb_params)

# Build the frame column by column: stacking ids and predictions into a single
# numpy array first would force both into one dtype and lose the integer ids.
predictions = pd.DataFrame({'id': df_test['id'].to_numpy(), 'Rings': prediction})

# save predictions
SUBMISSION_PATH = '/home/tomruge/Schreibtisch/Data/Kaggle/playground-series-s4e4/v1_simple_xgboost.csv'
predictions.to_csv(SUBMISSION_PATH, index=False)


### This performed a little worse than my simple XGBoost model, so iterating over subsets and combining them does not improve the integrated ROC curve.