In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# STATISTICS
from statsmodels.graphics.gofplots import qqplot

# HYPOTHESIS TESTING
from scipy.stats import ks_2samp

# Sklearn preprocessing
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


from xgboost import XGBRegressor

# Notebook settings: never truncate columns when a frame is displayed,
# and silence every warning for the rest of the session.
pd.options.display.max_columns = None
warnings.simplefilter('ignore')

In [2]:
# Load data: each CSV is read with its first column as the row index
csv_paths = ('data/train.csv', 'data/test.csv', 'data/sample_submission.csv')
train, test, sub = (pd.read_csv(csv_path, index_col=0) for csv_path in csv_paths)

In [3]:
# Per-column overview of the training frame: dtype, null count, distinct
# values and non-null count. 'Duplicated' is the number of fully duplicated
# rows in `train`, repeated on every line of the table.
summary_table = pd.DataFrame({
    'dtypes': train.dtypes,
    'Missing': train.isnull().sum(),
    'Unique': train.nunique(),
    'Count': train.count(),
    'Duplicated': train.duplicated().sum(),
})
summary_table

Unnamed: 0,dtypes,Missing,Unique,Count,Duplicated
Sex,object,0,3,90615,0
Length,float64,0,157,90615,0
Diameter,float64,0,126,90615,0
Height,float64,0,90,90615,0
Whole weight,float64,0,3175,90615,0
Whole weight.1,float64,0,1799,90615,0
Whole weight.2,float64,0,979,90615,0
Shell weight,float64,0,1129,90615,0
Rings,int64,0,28,90615,0


In [4]:
# Relative frequency of each category among the object-typed columns
# (only `Sex` in the summary table above)
categorical_features = train.select_dtypes(include='object').copy()
categorical_features.value_counts(normalize=True)

Sex
I      0.365204
M      0.342405
F      0.292391
Name: proportion, dtype: float64

In [5]:
# Independent copy of every non-object column of the training frame
numerical_features = train.select_dtypes(exclude='object').copy()

In [6]:
def plot_numerical_features(data):
    """
    Plot the distribution, boxplot, and Gaussianity of numerical features in the given dataset.

    One figure with three side-by-side panels is created per numerical
    column: a histogram with a KDE overlay, a boxplot, and a Q-Q plot
    against a fitted normal line. The target column ``Rings`` is left out
    when it is present, and object-typed columns are skipped.

    Args:
        data (pandas.DataFrame): The dataset containing numerical features.
            The frame passed in is not modified.

    Returns:
        None
    """
    # Drop the target variable if it is there; errors='ignore' lets the helper
    # also be called on frames that do not carry the target column.
    # DataFrame.drop returns a new frame, so the caller's data stays untouched.
    X_num = (
        data
        .drop(columns=['Rings'], errors='ignore')
        .select_dtypes(exclude='object')
    )

    # Iterate over the numerical columns only: looping over every column of
    # `data` would ask seaborn for columns that X_num does not contain.
    for numerical_feature in X_num.columns:

        # Creating three subplots per numerical_feature
        fig, ax = plt.subplots(1, 3, figsize=(15, 3))

        # Histogram to get an overview of the distribution of each numerical_feature
        ax[0].set_title(f"Distribution of: {numerical_feature}")
        sns.histplot(data=X_num, x=numerical_feature, kde=True, ax=ax[0])

        # Boxplot to detect outliers
        ax[1].set_title(f"Boxplot of: {numerical_feature}")
        sns.boxplot(data=X_num, x=numerical_feature, ax=ax[1])

        # Analyzing whether a feature is normally distributed or not
        ax[2].set_title(f"Gaussianity of: {numerical_feature}")
        qqplot(X_num[numerical_feature], line='s', ax=ax[2])


In [7]:
#plot_numerical_features(numerical_features)

In [8]:
def choose_scaler_for_features(data):
    """
    Recommend one scaler name for every column of a DataFrame.

    Each column is rescaled with MinMaxScaler, StandardScaler and
    RobustScaler. Every rescaled copy is compared with the raw values using
    the two-sample Kolmogorov-Smirnov statistic, and the scaler whose output
    yields the smallest statistic is recommended. Ties go to "minmax" first,
    then "standard"; "robust" is the fallback.

    Parameters:
    data (DataFrame): DataFrame containing numerical features.

    Returns:
    dict: Dictionary mapping feature names to recommended scalers
          ("minmax", "standard" or "robust").
    """

    def ks_after_scaling(scaler, raw_values, as_column):
        # KS statistic between the raw values and their rescaled version
        rescaled = scaler.fit_transform(as_column).flatten()
        return ks_2samp(raw_values, rescaled).statistic

    recommended = {}

    for column_name in data.columns:
        raw_values = data[column_name]
        # The scalers expect a 2-D input, hence the single-column reshape
        as_column = raw_values.values.reshape(-1, 1)

        ks_minmax = ks_after_scaling(MinMaxScaler(), raw_values, as_column)
        ks_standard = ks_after_scaling(StandardScaler(), raw_values, as_column)
        ks_robust = ks_after_scaling(RobustScaler(), raw_values, as_column)

        # Pick the lowest statistic; the comparison order fixes the tie-break
        minmax_wins = ks_minmax <= ks_standard and ks_minmax <= ks_robust
        standard_wins = ks_standard <= ks_minmax and ks_standard <= ks_robust

        if minmax_wins:
            choice = "minmax"
        elif standard_wins:
            choice = "standard"
        else:
            choice = "robust"

        recommended[column_name] = choice

    return recommended

In [9]:
# Predictors: every column except the target
X = train.drop(columns='Rings')

# Target on the log1p scale; predictions are mapped back with expm1 later on
y = train['Rings'].pipe(np.log1p)

In [10]:
# Numerical predictors only (object-typed columns are left out)
X_num = X.select_dtypes(exclude=['object'])

# One recommended scaler name per numerical feature
scalers = choose_scaler_for_features(data=X_num)

print("Recommended scalers for each feature:")
scalers

Recommended scalers for each feature:


{'Length': 'minmax',
 'Diameter': 'standard',
 'Height': 'minmax',
 'Whole weight': 'standard',
 'Whole weight.1': 'minmax',
 'Whole weight.2': 'minmax',
 'Shell weight': 'minmax'}

In [11]:
# Group the numerical features by the scaler recommended for them in `scalers`.
# Slicing the keys by position would ignore those recommendations and hand
# features to the wrong scaler.
features_by_scaler = {
    scaler_name: [feature for feature, choice in scalers.items() if choice == scaler_name]
    for scaler_name in ('minmax', 'standard', 'robust')
}

# Preprocessing pipeline for numerical features.
# A scaler whose feature list is empty is simply given no columns.
numerical_pipeline = Pipeline([
    ('scaler', ColumnTransformer([
        ('minmax', MinMaxScaler(), features_by_scaler['minmax']),
        ('standard', StandardScaler(), features_by_scaler['standard']),
        ('robust', RobustScaler(), features_by_scaler['robust'])
    ])
    )
])

# Preprocessing pipeline for categorical features
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
categorical_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine numerical and categorical pipelines
preprocessing_baseline = ColumnTransformer([
    ('numerical', numerical_pipeline, X_num.columns),
    ('categorical', categorical_pipeline, X.select_dtypes(include='object').columns)
])

# Fit and transform the training data
X_preprocessed = preprocessing_baseline.fit_transform(X)

# Transform the test data with the statistics learned on the training data
X_test_preprocessed = preprocessing_baseline.transform(test)

In [12]:
# Display the preprocessing transformer fitted in the previous cell
preprocessing_baseline

In [13]:
# Baseline model: the preprocessing step followed by an XGBoost regressor
# with a fixed seed
baseline_steps = [
    ('preprocessing', preprocessing_baseline),
    ('model', XGBRegressor(random_state=42)),
]
pipe_baseline = Pipeline(steps=baseline_steps)

pipe_baseline

In [14]:
# 5-fold shuffled cross-validation with a fixed seed. The scorer returns the
# negated RMSE, so the mean is multiplied by -1 to report a positive error.
# `y` is log1p-transformed, so this RMSE is measured on the log scale.
cv_splitter = KFold(n_splits=5, shuffle=True, random_state=42)
neg_rmse_per_fold = cross_val_score(
    pipe_baseline,
    X,
    y,
    cv=cv_splitter,
    scoring='neg_root_mean_squared_error',
)
score_baseline = -1 * neg_rmse_per_fold.mean()

score_baseline

0.15060099735787963

In [15]:
# Baseline predictions: refit on the full training data, predict the test set
# on the log1p scale, then invert the transform with expm1
log_predictions = pipe_baseline.fit(X, y).predict(test)

y_pred_baseline = np.expm1(log_predictions)

y_pred_baseline

array([ 9.59538  ,  9.54032  ,  9.979922 , ..., 11.797348 , 13.086581 ,
        7.9991846], dtype=float32)

In [16]:
# Create a DataFrame for the submission data
submission_data = pd.DataFrame(y_pred_baseline, index=test.index, columns=['Rings'])

# Save the DataFrame to a CSV file
submission_data.to_csv('outputs/submission_baseline_log.csv')

Kaggle result for this baseline submission: `0.14855`