In [None]:
%%capture
!pip -q install autogluon

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_log_error,root_mean_squared_log_error
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor,CatBoostClassifier
import optuna
from sklearn.model_selection import train_test_split,RepeatedKFold
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor, ExtraTreesRegressor

from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.core.metrics import make_scorer

In [None]:
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of `y_pred` against `y_true`.

    Computed as the square root of scikit-learn's `mean_squared_log_error`.
    """
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# AutoGluon scorer wrapping scikit-learn's RMSLE.
# It is an error metric: smaller values are better and 0 is the optimum.
RMSLE_scorer = make_scorer(
    score_func=root_mean_squared_log_error,
    name='root_mean_squared_log_error',
    optimum=0,
    greater_is_better=False,
)

In [None]:
# Read train.csv from the playground-series-s4e4 Kaggle input directory.
data = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv')
# Preview the first three rows.
data.head(3)

In [None]:
# Read test.csv from the playground-series-s4e4 Kaggle input directory.
test = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv')

In [None]:
from sklearn.mixture import GaussianMixture

def get_gmm_class_features(feat, n):
    """Add a GMM cluster-label column for `feat` to the global `data` and `test` frames.

    A one-dimensional `GaussianMixture` with `n` components is fitted on the
    training values of `feat`. The predicted component index of every row is
    written to a new `{feat}_class` column in both frames (mutated in place).

    Missing values are imputed with the *training* median in both frames, so
    test rows go through exactly the same preprocessing the mixture was
    fitted with.

    Args:
        feat: name of a numeric column present in both `data` and `test`.
        n: number of mixture components.
    """
    # Imputation statistic comes from the training data only.
    train_median = data[feat].median()
    train_values = data[feat].fillna(train_median).values.reshape(-1, 1)
    test_values = test[feat].fillna(train_median).values.reshape(-1, 1)

    gmm = GaussianMixture(n_components=n, random_state=42)
    gmm.fit(train_values)

    data[f'{feat}_class'] = gmm.predict(train_values)
    test[f'{feat}_class'] = gmm.predict(test_values)
    
    
# (feature, number of mixture components) pairs, processed in this order.
for _feat, _n_components in [
    ('Length', 5),
    ('Diameter', 4),
    ('Height', 5),
    ('Whole weight', 5),
    ('Whole weight.1', 6),
    ('Whole weight.2', 6),
    ('Shell weight', 5),
]:
    get_gmm_class_features(_feat, _n_components)

In [None]:
# Count missing values in every column of the training frame.
data.isna().sum()

In [None]:
# Horizontal bar chart of how often each Rings value occurs in the training data.
data['Rings'].value_counts().plot.barh()

In [None]:
class Model:
    """Cross-validated AutoGluon regressor for the `Rings` target.

    One `TabularPredictor` is trained per CV fold. Each fold's predictor is
    stored in `model_dict` and its predictions on the test set are appended
    to `test_predict_list`.
    """

    def __init__(self, train, test):
        """Store the training frame and wrap the test features for AutoGluon.

        Args:
            train: training DataFrame containing the `Rings` label; an `id`
                column, if present, is excluded when fitting.
            test: test DataFrame; its `id` column is dropped before it is
                wrapped in a `TabularDataset`.
        """
        self.train = train
        self.test = TabularDataset(test.drop('id', axis=1))
        self.model_dict = dict()
        self.test_predict_list = list()

    def fit(self, n_splits=20, time_limit=2000, presets='best_quality',
            random_state=256, n_iterations=1):
        """Train one predictor per fold and collect validation/test predictions.

        Args:
            n_splits: number of CV folds per iteration.
            time_limit: AutoGluon time budget (seconds) for each fold's fit.
            presets: AutoGluon presets passed to `TabularPredictor.fit`.
            random_state: seed for the fold split; iteration `i` uses
                `random_state + i` (or `None` if `random_state` is `None`).
            n_iterations: how many times the whole K-fold procedure is run.

        Returns:
            A tuple `(fold_scores, test_predict_list)`: the validation RMSLE of
            every trained fold and, in the same order, that fold's predictions
            on the test set.

        Raises:
            ValueError: if `n_iterations` is smaller than 1.
        """
        if n_iterations < 1:
            raise ValueError("n_iterations must be at least 1")

        target = 'Rings'
        # The label stays in the fold frames: TabularPredictor reads it from
        # the training data it is given.
        train_cols = [col for col in self.train.columns if col != 'id']
        features = self.train[train_cols]
        y_true = self.train[target]

        # Start from a clean slate so that calling fit() again cannot leave
        # stale models/predictions that no longer line up with fold_scores.
        self.model_dict = dict()
        self.test_predict_list = list()

        scores = list()
        fold_scores = list()

        for i in range(n_iterations):
            seed = None if random_state is None else random_state + i
            splitter = RepeatedKFold(n_splits=n_splits, n_repeats=1, random_state=seed)
            oof_valid_preds = np.zeros(features.shape[0])

            for train_idx, valid_idx in splitter.split(features, y_true):
                X_train = TabularDataset(features.iloc[train_idx])
                X_valid = TabularDataset(features.iloc[valid_idx])

                model = TabularPredictor(
                    label=target,
                    eval_metric=RMSLE_scorer,
                    problem_type="regression",
                ).fit(X_train, presets=presets, time_limit=time_limit)

                valid_preds = model.predict(X_valid)
                oof_valid_preds[valid_idx] = valid_preds
                self.test_predict_list.append(model.predict(self.test))
                # Running counter keeps keys unique across iterations while
                # still producing fold_0..fold_{n_splits-1} for a single one.
                self.model_dict[f'fold_{len(self.model_dict)}'] = model
                fold_scores.append(rmsle(y_true.iloc[valid_idx], valid_preds))

            oof_score = rmsle(y_true, oof_valid_preds)
            print(f"The OOF RMSLE for iteration {i+1} is {oof_score}")
            scores.append(oof_score)

        # Average over every iteration's OOF score, not just the last one.
        print(f"The average RMSLE is {np.mean(scores)}")
        return fold_scores, self.test_predict_list

In [None]:
# Build the cross-validation trainer from the training and test frames.
model = Model(data,test)
# scores: validation RMSLE of each fold; preds: each fold's test-set predictions.
scores,preds = model.fit()

## Reference for model weighting:

https://www.kaggle.com/code/beezus666/ensemble-weighted-average

In [None]:
def model_weight(model_loss, worst_loss, best_loss):
    """Linearly map a loss to an ensemble weight.

    The best loss maps to 1.0 and the worst loss to 0.0; any other loss is
    interpolated linearly between those two anchors.

    Args:
        model_loss: loss of the model being weighted.
        worst_loss: loss that should receive weight 0.0.
        best_loss: loss that should receive weight 1.0.

    Returns:
        The weight as a float.
    """
    # Checked first, so best_loss == worst_loss returns 1.0 instead of
    # reaching the division below.
    if model_loss == best_loss:
        return 1.0
    if model_loss == worst_loss:
        return 0.0

    gap_to_best = best_loss - model_loss
    loss_span = best_loss - worst_loss
    return 1.0 - (gap_to_best / loss_span)

In [None]:
# Anchor the weighting on the best (lowest) and worst (highest) fold losses.
best_loss, worst_loss = min(scores), max(scores)

# Raw weight per fold, then rescaled so that all weights sum to 1.
fold_weights = [model_weight(fold_loss, worst_loss, best_loss) for fold_loss in scores]
total_weight = sum(fold_weights)
fold_weights = [raw_weight / total_weight for raw_weight in fold_weights]

# Accumulate each fold's test predictions scaled by its normalized weight.
weighted_test_predict = np.zeros(test.shape[0])
for fold_pred, fold_weight in zip(preds, fold_weights):
    weighted_test_predict += fold_pred * fold_weight

In [None]:
# Read an external submission file from the gluon-preds input dataset
# (presumably predictions from a separate AutoGluon run — confirm).
gluon_preds = pd.read_csv('/kaggle/input/gluon-preds/submission.csv')
# Everything except the id column, as a NumPy array.
gluon_preds_array = np.array(gluon_preds.drop('id',axis=1))

In [None]:
# Blend: 40% fold-weighted CV predictions, 60% external predictions.
prediction = 0.40 * weighted_test_predict + 0.60 * gluon_preds_array.ravel()

# Assemble the submission (predictions rounded to two decimals) and write it out.
submit = pd.DataFrame({'id': test['id'], 'Rings': prediction.round(2)})
submit.to_csv('submission.csv', index=False)
submit