# Introduction
Hey, thanks for viewing my Kernel!

If you like my work, please leave an upvote: it would be really appreciated, and it would motivate me to offer more content to the Kaggle community! :)

Submission functions were compared in this [notebook](https://www.kaggle.com/code/hasanbasriakcay/tpsjun22-10xfastersubmissionfunction) 👑

In [None]:
import pandas as pd
import numpy as np
import warnings 
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
warnings.simplefilter("ignore")

In [None]:
# Load the competition table and the sample submission, then preview the first rows of each.
data = pd.read_csv("../input/tabular-playground-series-jun-2022/data.csv")
sub = pd.read_csv("../input/tabular-playground-series-jun-2022/sample_submission.csv")
display(data.head())
display(sub.head())

In [None]:
print("data.shape:", data.shape)
print("sub.shape:", sub.shape)

In [None]:
display(data.isna().sum().sum())

In [None]:
def nan_analysis(df_train):
    """Render a colour-graded HTML table of missing-value counts and rates.

    Only columns of ``df_train`` that contain at least one NaN are listed.
    For each one the table shows ``nan_counts`` (number of NaNs) and
    ``nan_rate`` (count divided by the number of rows), sorted by rate in
    descending order and shaded with a red gradient.

    Returns an ``IPython`` ``HTML`` object wrapping the styled table.
    """
    from IPython.core.display import HTML

    per_column_nans = df_train.isna().sum()
    nan_cols = df_train.columns[per_column_nans > 0]

    summary = pd.DataFrame(index=nan_cols)
    summary['nan_counts'] = df_train[nan_cols].isna().sum()
    summary['nan_rate'] = summary['nan_counts'] / len(df_train)
    summary = summary.sort_values("nan_rate", ascending=False)

    red_scale = sns.light_palette("red", as_cmap=True)
    styled_tables = [summary.style.background_gradient(cmap=red_scale)]

    cells = []
    for styled in styled_tables:
        cells.append('<td>' + styled._repr_html_() + '</td>')
    return HTML(f"<table><tr> {''.join(cells)} </tr></table>")

In [None]:
nan_analysis(data)

In [None]:
display(data.duplicated().sum())

In [None]:
# Split the column names by dtype family (integer / float / object).
int_features = data.select_dtypes(include='int').columns.tolist()
float_features = data.select_dtypes(include='float').columns.tolist()
object_features = data.select_dtypes(include='object').columns.tolist()

# Report how many columns fall into each family.
print("int len featres:", len(int_features))
print("float len featres:", len(float_features))
print("object len featres:", len(object_features))

In [None]:
low_variance_features = list(data.columns[data.nunique() < 100])
print("low_variance_features:",low_variance_features)
print("int_features:",int_features)

In [None]:
from IPython.core.display import HTML
def value_counts_all(df, columns):
    """Show the value counts of several columns side by side.

    For every name in ``columns`` the value counts of ``df[name]`` are turned
    into a small DataFrame; the rendered tables are placed next to each other
    in a single-row HTML table, which is returned as an ``HTML`` object.

    Note: this sets the global pandas option ``display.max_rows`` to 50.
    """
    pd.set_option('display.max_rows', 50)

    cells = []
    for name in columns:
        counts_frame = pd.DataFrame(df[name].value_counts())
        cells.append('<td>' + counts_frame._repr_html_() + '</td>')

    return HTML(f"<table><tr> {''.join(cells)} </tr></table>")

In [None]:
value_counts_all(data, int_features)

## Insights 1
* Integer features can be treated as categorical (object) features.

# Distributions

In [None]:
def clf_plot_distributions(data, features, hue='target', ncols=3, method='hist'):
    """Plot one subplot per feature on a grid and show the figure.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns named in ``features``.
    features : iterable of str
        Column names to plot, one subplot each.
    hue : str, optional
        Accepted for interface compatibility only; it is not used by any of
        the plots drawn here.
    ncols : int, optional
        Number of subplot columns in the grid.
    method : {'hist', 'cdf', 'box', 'bar', 'hbar'}, optional
        Plot type: ``'hist'`` draws a KDE curve, ``'cdf'`` an empirical CDF,
        ``'box'`` a box plot, ``'bar'`` / ``'hbar'`` vertical / horizontal
        bars of per-value counts.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    supported = ('hist', 'cdf', 'box', 'bar', 'hbar')
    if method not in supported:
        raise ValueError(f"Unknown method {method!r}; expected one of {supported}.")

    features = list(features)
    if not features:
        # Nothing to draw; avoid creating an empty figure.
        return

    # Ceiling division: exactly as many rows as needed (no spare empty row
    # when len(features) is a multiple of ncols).
    nrows = -(-len(features) // ncols)
    fig, axes = plt.subplots(
        nrows, ncols,
        figsize=(16, max(1, round(nrows * 16 / ncols))),
        squeeze=False,  # always a 2-D array, even for a 1x1 grid
    )
    flat_axes = axes.ravel()

    for ax, feature in zip(flat_axes, features):
        if method == 'hist':
            sns.kdeplot(data=data, x=feature, ax=ax)
        elif method == 'cdf':
            sns.ecdfplot(data=data, x=feature, ax=ax)
        elif method == 'box':
            sns.boxplot(data=data, x=feature, ax=ax)
        else:
            # Count occurrences of each value of this feature only, instead
            # of copying the whole frame for every subplot.
            counts = (
                data[feature]
                .value_counts()
                .sort_index()
                .rename_axis(feature)
                .reset_index(name='counts')
            )
            if method == 'bar':
                sns.barplot(data=counts, x=feature, y='counts', ax=ax)
            else:
                sns.barplot(data=counts, y=feature, x='counts', ax=ax)

    # Hide the unused cells in the last row of the grid.
    for ax in flat_axes[len(features):]:
        ax.set_visible(False)

    fig.tight_layout()
    plt.show()

In [None]:
clf_plot_distributions(data, float_features, ncols=5, method='hist')

In [None]:
clf_plot_distributions(data, float_features, ncols=5, method='box')

## Insights 2
* F_1_7, F_1_12, F_1_13, F_3_19, F_3_21, F_4_2, F_4_3, F_4_8, F_4_9, F_4_10 and F_4_14 have outliers.

In [None]:
import gc

gc.collect()
#clf_plot_distributions(data, int_features, ncols=5, method='bar')

# Outlier Handling

In [None]:
# Clip the outlier-heavy features to [-2.5, 2.5].
# The result is assigned back explicitly: calling `.clip(..., inplace=True)` on
# `data[col]` mutates a temporary Series, which does not update `data` when
# pandas Copy-on-Write is active.
outlier_features = ['F_1_7', 'F_1_12', 'F_1_13', 'F_3_19', 'F_3_21', 'F_4_2',
                    'F_4_3', 'F_4_8', 'F_4_9', 'F_4_10', 'F_4_14']
data[outlier_features] = data[outlier_features].clip(-2.5, 2.5)

# Correlations

In [None]:
corr = data.corr()

In [None]:
# np.triu keeps the upper triangle (diagonal included) of `corr` and zeroes the
# rest. Passed as `mask`, it makes seaborn hide the cells with non-zero entries,
# so effectively only the lower triangle is drawn.
# NOTE: `matrix` is reused as the mask of the p-value heatmap further down.
matrix = np.triu(corr)
fig, ax = plt.subplots(figsize=(32, 32))
sns.heatmap(corr,  xticklabels=corr.columns, yticklabels=corr.columns, annot=True, mask=matrix, ax=ax, fmt='.1f');
ax.set_title("Correlations");

## Insights 3
* F_2 group has correlations with each other.
* F_4 group has correlations with each other.

In [None]:
def calculate_p_values_2d(df, columns, th=0.05):
    """Return the matrix of pairwise Pearson-correlation p-values.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    columns : sequence of str
        Columns of ``df`` to compare pairwise.
    th : float, optional
        Unused; kept so existing calls that pass a threshold keep working.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by ``columns``. Entry ``(a, b)`` is
        the two-sided p-value of ``pearsonr`` computed on the rows where both
        ``a`` and ``b`` are present, rounded to 4 decimals. The diagonal is
        0.0.
    """
    from scipy.stats import pearsonr

    labels = list(columns)
    n_labels = len(labels)
    p_values = np.zeros((n_labels, n_labels))

    # The p-value is symmetric in its two arguments, so each unordered pair is
    # evaluated once and mirrored across the diagonal.
    for x in range(n_labels):
        c_1 = labels[x]
        for y in range(x + 1, n_labels):
            c_2 = labels[y]
            if c_1 == c_2:
                # Same column listed twice: treated like the diagonal (0.0).
                continue
            # dropna() returns a new frame, so `df` itself is never touched.
            pair = df[[c_1, c_2]].dropna(axis=0)
            p = round(pearsonr(pair[c_1], pair[c_2])[1], 4)
            p_values[x, y] = p
            p_values[y, x] = p

    return pd.DataFrame(p_values, columns=columns, index=columns)

In [None]:
p_values_df = calculate_p_values_2d(data, data.columns, th=0.05)

In [None]:
fig, ax = plt.subplots(figsize=(32, 32))
sns.heatmap(p_values_df,  xticklabels=p_values_df.columns, yticklabels=p_values_df.columns, annot=True, mask=matrix, ax=ax, fmt='.1f');
ax.set_title("P-Values");

## Insights 4
* F_4 group has relationships with each other except F_4_3-F_4_4 and F_4_8-F_4_14.

# Missing Value Visualizations

In [None]:
import missingno as msno

sorted_data = data.sort_values("row_id")
msno.matrix(sorted_data);

In [None]:
# F_4_11 - F_4_8 
sorted_data = data.sort_values("F_4_11")
msno.matrix(sorted_data);

In [None]:
# F_4_11 - F_4_4
sorted_data = data.sort_values("F_4_4")
msno.matrix(sorted_data);

# Baseline Models

In [None]:
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.base import clone

def rmse_score_multicol(df_y_true, df_y_pred):
    """Sum the per-column RMSE between two frames, ignoring missing pairs.

    For every column of ``df_y_pred`` the matching column of ``df_y_true`` is
    compared positionally (indexes are ignored). Rows where either value is
    NaN are left out of that column's RMSE.

    Parameters
    ----------
    df_y_true, df_y_pred : pandas.DataFrame
        Frames with the same number of rows; ``df_y_true`` must contain every
        column of ``df_y_pred``.

    Returns
    -------
    float
        Sum over columns of the root mean squared error (0 when
        ``df_y_pred`` has no columns).

    Raises
    ------
    ValueError
        If a column has no row where both values are present.
    """
    total_rmse = 0
    for col in df_y_pred.columns:
        y_true = df_y_true[col].to_numpy(dtype=float)
        y_pred = df_y_pred[col].to_numpy(dtype=float)
        valid = ~(np.isnan(y_true) | np.isnan(y_pred))
        if not valid.any():
            raise ValueError(
                f"Column {col!r} has no rows where both the true and the predicted value are present."
            )
        # RMSE is computed directly with numpy so it does not depend on
        # scikit-learn's deprecated `squared=False` option.
        residuals = y_true[valid] - y_pred[valid]
        total_rmse += np.sqrt(np.mean(residuals ** 2))
    return total_rmse

def missing_transform_score(model_ori, data, features, nfold=10):
    """Cross-validate an imputer by hiding ``features`` in each test fold.

    For every fold of an unshuffled ``KFold``: a fresh clone of ``model_ori``
    is fitted on the training rows, all ``features`` of the test rows are
    replaced by NaN, the clone imputes them, and the imputed values are scored
    against the original test values with ``rmse_score_multicol``.

    Parameters
    ----------
    model_ori : estimator
        Unfitted imputer exposing ``fit`` / ``transform``; it is cloned per
        fold and never fitted itself.
    data : pandas.DataFrame
        Full data set (may already contain NaNs).
    features : list of str
        Columns to blank out and score.
    nfold : int, optional
        Number of folds.

    Returns
    -------
    float
        Sum over all folds of the summed per-column RMSE (a total, not an
        average).
    """
    kf = KFold(n_splits=nfold)
    total_score = 0
    for train_index, test_index in kf.split(data):
        # KFold yields positional indices, so rows are selected with .iloc;
        # .loc would only be correct for a default 0..n-1 index.
        X_train = data.iloc[train_index]
        X_test_ori = data.iloc[test_index]
        X_test = X_test_ori.copy()
        X_test.loc[:, features] = np.nan

        model = clone(model_ori)
        model.fit(X_train)
        X_test_tr = pd.DataFrame(model.transform(X_test), columns=X_test.columns)

        total_score += rmse_score_multicol(X_test_ori[features], X_test_tr[features])

        # Release the per-fold frames before the next fold allocates its own.
        del X_train, X_test_tr, X_test_ori, X_test, model
        gc.collect()
    return total_score

## All Groups

In [None]:
# Score every SimpleImputer strategy on all float features.
for strategy, label in (('mean', "Mean Strategy Score:"),
                        ('median', "Median Strategy Score:"),
                        ('most_frequent', "Most Frequent Strategy Score:")):
    imp_mean = SimpleImputer(strategy=strategy)
    score = missing_transform_score(imp_mean, data, float_features, nfold=10)
    print(label, score)

## F_1 Group

In [None]:
# Names of the 15 columns of the F_1 group: F_1_0 ... F_1_14.
f_1_features = [f'F_1_{idx}' for idx in range(15)]

In [None]:
# Score every SimpleImputer strategy on the F_1 group.
for strategy, label in (('mean', "Mean Strategy Score:"),
                        ('median', "Median Strategy Score:"),
                        ('most_frequent', "Most Frequent Strategy Score:")):
    imp_mean = SimpleImputer(strategy=strategy)
    score = missing_transform_score(imp_mean, data, f_1_features, nfold=10)
    print(label, score)

## F_3 Group

In [None]:
# Names of the 25 columns of the F_3 group: F_3_0 ... F_3_24.
f_3_features = [f'F_3_{idx}' for idx in range(25)]

In [None]:
# Score every SimpleImputer strategy on the F_3 group.
for strategy, label in (('mean', "Mean Strategy Score:"),
                        ('median', "Median Strategy Score:"),
                        ('most_frequent', "Most Frequent Strategy Score:")):
    imp_mean = SimpleImputer(strategy=strategy)
    score = missing_transform_score(imp_mean, data, f_3_features, nfold=10)
    print(label, score)

## F_4 Group

In [None]:
# Names of the 15 columns of the F_4 group: F_4_0 ... F_4_14.
f_4_features = [f'F_4_{idx}' for idx in range(15)]

In [None]:
# Score every SimpleImputer strategy on the F_4 group.
for strategy, label in (('mean', "Mean Strategy Score:"),
                        ('median', "Median Strategy Score:"),
                        ('most_frequent', "Most Frequent Strategy Score:")):
    imp_mean = SimpleImputer(strategy=strategy)
    score = missing_transform_score(imp_mean, data, f_4_features, nfold=10)
    print(label, score)

## Insights 5
* Mean imputation generally scores better than median or most-frequent imputation for all groups.

# Modeling

In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor
import xgboost
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import time

'''
imputer = IterativeImputer(estimator=HistGradientBoostingRegressor(learning_rate=0.05, max_leaf_nodes=25,
                                                                   max_iter=1000, min_samples_leaf=500,
                                                                   l2_regularization=1,
                                                                   validation_fraction=0.05,
                                                                   max_bins=63,
                                                                   random_state=3, verbose=0),
                           verbose=2, max_iter=20, initial_strategy='mean', imputation_order='random')
'''

# Iterative imputation with a GPU XGBoost regressor as the per-feature estimator.
imputer = IterativeImputer(estimator=xgboost.XGBRegressor(n_estimators=1000, learning_rate=0.05, tree_method='gpu_hist', predictor="gpu_predictor"),
                           verbose=2, max_iter=20, initial_strategy='mean')

# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer for measuring elapsed time.
start_time = time.perf_counter()
imputed_df = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
print(time.perf_counter() - start_time, "seconds")
imputed_df.head()

In [None]:
def automated_sub_func_melt(data, sub):
    """Fill ``sub['value']`` from ``data`` using a melt + sort.

    ``data`` is melted to one row per (row_id, feature), keyed as
    ``"<row_id>-<feature>"``, and filtered to the keys present in
    ``sub['row-col']``. The remaining rows are ordered by row id, then by the
    two numeric parts of the feature name (``F_<group>_<index>``), and their
    values are written positionally into ``sub['value']``.

    The assignment is positional, so ``sub`` must already be ordered by row
    id, feature group and feature index, and every key of ``sub`` must exist
    in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``row_id`` column and all F_1 / F_3 / F_4 feature columns.
    sub : pandas.DataFrame
        Submission frame with a ``row-col`` column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``sub`` object with its ``value`` column filled.
    """
    features = ([f'F_1_{i}' for i in range(15)]
                + [f'F_3_{i}' for i in range(25)]
                + [f'F_4_{i}' for i in range(15)])
    melt_data = pd.melt(data, id_vars='row_id', value_vars=features, var_name='Column', value_name='Value')
    # row_id may be stored as float (e.g. after imputation); cast so the key
    # reads "12-F_1_0" rather than "12.0-F_1_0".
    melt_data['row_id'] = melt_data['row_id'].astype(np.int32)
    melt_data['row-col'] = melt_data['row_id'].astype(str) + '-' + melt_data['Column']
    # Explicit copy: new columns are added below and must not target a view
    # of the unfiltered frame.
    melt_data = melt_data.loc[melt_data['row-col'].isin(sub['row-col']), :].copy()
    # Split the feature name once and reuse both numeric parts as sort keys,
    # so that e.g. F_3_2 is ordered before F_3_11.
    name_parts = melt_data['Column'].str.split('_', expand=True)
    melt_data['sort1'] = name_parts[1].astype(np.int8)
    melt_data['sort2'] = name_parts[2].astype(np.int8)
    melt_data.sort_values(['row_id', 'sort1', 'sort2'], ascending=True, inplace=True)
    sub['value'] = melt_data['Value'].values
    return sub

def automated_sub_func_apply(data, sub):
    """Fill ``sub['value']`` by looking up each ``"<row_id>-<column>"`` key in ``data``.

    Each entry of ``sub['row-col']`` is split at the first ``'-'`` into a row
    id and a column name; the value stored in ``data`` for that row id and
    column is written to ``sub['value']``. If a row id occurs more than once
    in ``data``, its first occurrence is used.

    The lookup is vectorized (one positional index per key) instead of
    scanning ``data`` once per submission row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``row_id`` column and the referenced feature columns.
    sub : pandas.DataFrame
        Submission frame with a ``row-col`` column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``sub`` object with its ``value`` column filled.

    Raises
    ------
    KeyError
        If a key names a column that is not in ``data``.
    IndexError
        If a key names a row id that is not in ``data``.
    """
    if len(sub) == 0:
        sub['value'] = pd.Series(index=sub.index, dtype=float)
        return sub

    keys = sub['row-col'].str.split('-', n=1, expand=True)
    rows = keys[0].astype(int)
    cols = keys[1]

    # "First matching row wins" when row ids repeat; skip the copy otherwise.
    if data['row_id'].is_unique:
        lookup = data
    else:
        lookup = data.drop_duplicates(subset='row_id', keep='first')

    col_pos = lookup.columns.get_indexer(cols)
    if (col_pos < 0).any():
        missing = sorted(set(cols[col_pos < 0].astype(str)))
        raise KeyError(f"Columns not found in data: {missing}")

    row_pos = pd.Index(lookup['row_id']).get_indexer(rows)
    if (row_pos < 0).any():
        missing = sorted(set(rows[row_pos < 0]))
        raise IndexError(f"row_id values not found in data: {missing}")

    # One fancy-indexing pass picks the (row, column) cell for every key.
    sub['value'] = lookup.to_numpy()[row_pos, col_pos]
    return sub

In [None]:
import gc
# The raw frame is no longer needed once the imputed copy exists.
del data
gc.collect()

# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer for measuring elapsed time.
start_time = time.perf_counter()
new_sub = automated_sub_func_apply(imputed_df, sub.copy())
print(time.perf_counter() - start_time, "seconds")
new_sub.to_csv("submission.csv", index=False)
new_sub.head()

# Insights


## Insights 1
* Integer features can be treated as categorical (object) features.

## Insights 2
* F_1_7, F_1_12, F_1_13, F_3_19, F_3_21, F_4_2, F_4_3, F_4_8, F_4_9, F_4_10 and F_4_14 have outliers.

## Insights 3
* F_2 group has correlations with each other. Therefore, the F_2 group can be ordinal features.
* F_4 group has correlations with each other.

## Insights 4
* F_4 group has relationships with each other except F_4_3-F_4_4 and F_4_8-F_4_14.

## Insights 5
* Mean imputation generally scores better than median or most-frequent imputation for all groups.