In [139]:
from ucimlrepo import fetch_ucirepo 

# fetch dataset 
rice_cammeo_and_osmancik = fetch_ucirepo(id=545) 
  
# data (as pandas dataframes) 
X = rice_cammeo_and_osmancik.data.features 
y = rice_cammeo_and_osmancik.data.targets
   
# Combine X and y into a single DataFrame
df = pd.concat([X, y], axis=1)

# Rename the target column to 'Y'
df = df.rename(columns={df.columns[-1]: 'Y'})

In [140]:
import pandas as pd
import numpy as np

Y_column = df['Y'].copy()
df.drop('Y', axis=1, inplace=True)

# Identify categorical data (change this based on your actual data)
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# Standardize only the continuous (non-categorical) columns
continuous_cols = df.columns.difference(categorical_cols)  # Gets the difference, i.e., continuous cols
df[continuous_cols] = (df[continuous_cols] - df[continuous_cols].mean()) / df[continuous_cols].std()

# Filter out outliers in continuous data (|z-score| > 5)
mask = (np.abs(df[continuous_cols]) < 10).all(axis=1)
df = df[mask]

# Reattach the target variable 'Y' to the DataFrame
df['Y'] = Y_column[mask]

In [141]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder object
le = LabelEncoder()

# Apply Label Encoding to each categorical column
for column in categorical_cols:
    # Ensure the column is of type object (string) or category
    if df[column].dtype == 'object' or df[column].dtype.name == 'category':
        df[column] = le.fit_transform(df[column])
        
df['Y'], unique = pd.factorize(df['Y'])


In [142]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, PowerTransformer, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.decomposition import KernelPCA
from sklearn.metrics import accuracy_score, roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.base import BaseEstimator, TransformerMixin

# Custom transformer for Box-Cox (handles non-positive data)
class SafeBoxCoxTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.lambdas = None
        self.min_values = None

    def fit(self, X, y=None):
        self.min_values = X.min(axis=0)
        X_shifted = X - self.min_values + 1  # Shift data to be strictly positive
        _, self.lambdas = stats.boxcox(X_shifted)
        return self

    def transform(self, X):
        X_shifted = X - self.min_values + 1
        return np.column_stack([stats.boxcox(X_shifted[:, i], self.lambdas[i]) for i in range(X.shape[1])])

def run_analysis(X, y, is_binary):
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Identify numeric and categorical columns
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object', 'category']).columns

    # Define transformations
    transformations = {
        'standard': StandardScaler(),
        'minmax': MinMaxScaler(),
        'robust': RobustScaler(),
        'yeojohnson': PowerTransformer(method='yeo-johnson'),
        'mice': IterativeImputer(),
        'kernel_pca': KernelPCA(n_components=min(5, len(numeric_features)), kernel='rbf'),
        'no_scaling': None
    }

    # Define models with configurations for both binary and multi-class
    models = {
        'Logistic Regression': LogisticRegression(multi_class='ovr' if not is_binary else 'auto', max_iter=1000),
        'KNN': KNeighborsClassifier(),
        'Decision Tree': DecisionTreeClassifier(),
        'Random Forest': RandomForestClassifier(),
        'Gradient Boosting': GradientBoostingClassifier(),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss' if is_binary else 'mlogloss'),
        'LightGBM': LGBMClassifier(objective='binary' if is_binary else 'multiclass'),
        'CatBoost': CatBoostClassifier(verbose=0)
    }

    results = []

    for trans_name, transformer in transformations.items():
        for model_name, model in models.items():
            try:
                # Create pipeline
                if transformer:
                    numeric_transformer = Pipeline(steps=[
                        ('imputer', SimpleImputer(strategy='median')),
                        ('scaler', transformer)
                    ])
                    
                    categorical_transformer = Pipeline(steps=[
                        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                        ('onehot', OneHotEncoder(handle_unknown='ignore'))
                    ])
                    
                    preprocessor = ColumnTransformer(
                        transformers=[
                            ('num', numeric_transformer, numeric_features),
                            ('cat', categorical_transformer, categorical_features)
                        ])
                    
                    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                               ('classifier', model)])
                else:
                    pipeline = model
                
                # Fit and predict
                pipeline.fit(X_train, y_train)
                y_pred = pipeline.predict(X_test)
                
                # Calculate metrics
                accuracy = accuracy_score(y_test, y_pred)
                
                # For AUC, we need probability predictions
                if hasattr(pipeline, "predict_proba"):
                    y_pred_proba = pipeline.predict_proba(X_test)
                    if is_binary:
                        # Binary classification
                        auc = roc_auc_score(y_test, y_pred_proba[:, 1])
                    else:
                        # Multi-class classification
                        auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro')
                else:
                    auc = np.nan
                
                results.append({
                    'Transformation': trans_name,
                    'Model': model_name,
                    'Accuracy': accuracy,
                    'AUC': auc
                })
            except Exception as e:
                print(f"Error with {trans_name} transformation and {model_name}: {str(e)}")
                continue

    return pd.DataFrame(results)

# Assume df is your DataFrame and 'Y' is your target column
X = df.drop('Y', axis=1)
y = df['Y']

# Encode target variable if it's categorical
if y.dtype == 'object':
    le = LabelEncoder()
    y = le.fit_transform(y)

# Get number of classes
n_classes = len(np.unique(y))
is_binary = n_classes == 2

# Run analysis
results_df = run_analysis(X, y, is_binary)

# Display results
print(results_df)

# Find best performing combination
best_accuracy = results_df.loc[results_df['Accuracy'].idxmax()]
best_auc = results_df.loc[results_df['AUC'].idxmax()]

print("\nBest Accuracy:")
print(best_accuracy)
print("\nBest AUC:")
print(best_auc)


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 1768, number of negative: 1280
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000297 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 3048, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.580052 -> initscore=0.322989
[LightGBM] [Info] Start training from score 0.322989


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 1768, number of negative: 1280
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000234 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 3048, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.580052 -> initscore=0.322989
[LightGBM] [Info] Start training from score 0.322989


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 1768, number of negative: 1280
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000280 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 3048, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.580052 -> initscore=0.322989
[LightGBM] [Info] Start training from score 0.322989


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 1768, number of negative: 1280
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000196 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 3048, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.580052 -> initscore=0.322989
[LightGBM] [Info] Start training from score 0.322989


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 1768, number of negative: 1280
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000222 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 3048, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.580052 -> initscore=0.322989
[LightGBM] [Info] Start training from score 0.322989


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 1768, number of negative: 1280
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000195 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 3048, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.580052 -> initscore=0.322989
[LightGBM] [Info] Start training from score 0.322989


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 1768, number of negative: 1280
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000306 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 3048, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.580052 -> initscore=0.322989
[LightGBM] [Info] Start training from score 0.322989
   Transformation                Model  Accuracy       AUC
0        standard  Logistic Regression  0.929134  0.982240
1        standard                  KNN  0.905512  0.959702
2        standard        Decision Tree  0.868766  0.866602
3        standard        Random Forest  0.923885  0.977705
4        standard    Gradient Boosting  0.923885  0.979386
5        standard              XGBoost  0.918635  0.974355
6        standard             LightGBM  0.918635  0.975978
7        standard             CatBoost  

In [143]:
import pandas as pd
import numpy as np

# Assuming results_df is your DataFrame with the results

# Function to compare with no_scaling
def compare_with_no_scaling(df):
    no_scaling = df[df['Transformation'] == 'no_scaling'].set_index('Model')
    
    def diff(group):
        transformation = group['Transformation'].iloc[0]
        if transformation == 'no_scaling':
            group['Accuracy_Diff'] = 0
            group['AUC_Diff'] = 0
        else:
            model = group['Model'].iloc[0]
            group['Accuracy_Diff'] = group['Accuracy'] - no_scaling.loc[model, 'Accuracy']
            group['AUC_Diff'] = group['AUC'] - no_scaling.loc[model, 'AUC']
        return group
    
    return df.groupby(['Transformation', 'Model']).apply(diff).reset_index(drop=True)

# Compare each method with no_scaling
compared_results = compare_with_no_scaling(results_df)

# Calculate average improvement for each transformation
avg_improvement = compared_results[compared_results['Transformation'] != 'no_scaling'].groupby('Transformation').agg({
    'Accuracy_Diff': 'mean',
    'AUC_Diff': 'mean'
}).sort_values('Accuracy_Diff', ascending=False)

print("Average improvement by transformation:")
print(avg_improvement)

# Find transformations that increase performance on average
improved_transformations = avg_improvement[(avg_improvement['Accuracy_Diff'] > 0) | (avg_improvement['AUC_Diff'] > 0)]

print("\nTransformations that increase performance on average:")
print(improved_transformations)

# Find the best transformation for each model
best_transformations = compared_results[compared_results['Transformation'] != 'no_scaling'].groupby('Model').apply(
    lambda x: x.loc[x['Accuracy_Diff'].idxmax()]
).reset_index(drop=True)

print("\nBest transformation for each model:")
print(best_transformations[['Model', 'Transformation', 'Accuracy_Diff', 'AUC_Diff']])

# Overall best performing model and transformation
best_overall = compared_results.loc[compared_results['Accuracy'].idxmax()]
print("\nOverall best performing model and transformation:")
print(best_overall[['Transformation', 'Model', 'Accuracy', 'AUC', 'Accuracy_Diff', 'AUC_Diff']])

Average improvement by transformation:
                Accuracy_Diff  AUC_Diff
Transformation                         
minmax               0.001312  0.000299
yeojohnson           0.001312  0.000931
mice                 0.000492  0.000353
robust              -0.000328  0.000324
standard            -0.000492 -0.000875
kernel_pca          -0.002953 -0.002209

Transformations that increase performance on average:
                Accuracy_Diff  AUC_Diff
Transformation                         
minmax               0.001312  0.000299
yeojohnson           0.001312  0.000931
mice                 0.000492  0.000353
robust              -0.000328  0.000324

Best transformation for each model:
                 Model Transformation  Accuracy_Diff  AUC_Diff
0             CatBoost           mice       0.000000  0.000000
1        Decision Tree     kernel_pca       0.010499  0.010784
2    Gradient Boosting           mice       0.000000 -0.000055
3                  KNN     kernel_pca       0.013123 -0.0