<a href="https://www.kaggle.com/code/shreeyashah/pca-optuna-digits-recognizer?scriptVersionId=285862707" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file below the read-only Kaggle input directory and print
# its full path, one per line.
input_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing the Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Importing the Data

In [None]:
# Load the labelled training set from the competition input folder and
# preview its first rows.
df_train = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
df_train.head()

In [None]:
# Load the test set (the rows predictions are later generated for) and
# preview its first rows.
df_test = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')
df_test.head()

## Exploratory Data Analysis

In [None]:
# Show the (rows, columns) dimensions of the training frame.
df_train.shape

In [None]:
# Print a summary of the training frame's columns and dtypes.
df_train.info()

In [None]:
# Count missing values in each column of the training frame.
df_train.isna().sum()

In [None]:
# Bar chart of how many training rows carry each value of the 'label' column.
ax = sns.countplot(data=df_train, x='label')
ax.set_title('Digits Distribution')
plt.show()

In [None]:
# Separate the target from the feature columns. Selecting by the 'label'
# column name (rather than by position) keeps the split correct even if the
# column order of the CSV changes.
X_train = df_train.drop(columns='label')
y_train = df_train['label']
# The test frame is used as the feature matrix as-is.
X_test = df_test

## Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA
# Fit a PCA with no limit on the number of components so the explained
# variance of each component can be inspected before choosing a cut-off.
pca = PCA(n_components=None)
# NOTE(review): X_train_temp is not referenced again in the visible notebook;
# pca.fit(X_train) would avoid materialising the projection — confirm nothing
# else depends on it before removing.
X_train_temp = pca.fit_transform(X_train)

In [None]:
# Fraction of the total variance captured by each principal component.
pca.explained_variance_ratio_

In [None]:
# Running total of the explained-variance fractions across components.
np.cumsum(pca.explained_variance_ratio_)

In [None]:
# Selecting the number of components such that 90% variance is explained.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Entry i of the cumulative sum covers i + 1 components, so the x-axis must
# start at 1 for the "Number of Components" label to be accurate.
component_counts = np.arange(1, len(cumulative_variance) + 1)
# argmax yields the first index where the threshold is met; +1 turns that
# index into a component count. (Assumes the curve reaches 0.9, which holds
# when the PCA above keeps all components.)
n_components_90 = int(np.argmax(cumulative_variance >= 0.9)) + 1

plt.plot(component_counts, cumulative_variance, marker='o')
plt.axhline(y=0.9, color='r', linestyle='--', label='90% threshold')
plt.axvline(x=n_components_90, color='g', linestyle=':',
            label=f'{n_components_90} components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance vs Components')
plt.xticks(range(0,800,50))
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Reduce both splits to 100 principal components. The projection is fitted
# on the training features only and then applied unchanged to the test
# features.
# NOTE(review): 100 is presumably read off the cumulative-variance plot
# against the 90% threshold — confirm.
# random_state pins the result in case scikit-learn selects its randomized
# SVD solver, matching the seed used for the models in this notebook.
pca = PCA(n_components=100, random_state=42)
# NOTE: X_train / X_test are overwritten in place, so this cell must not be
# re-run without re-running the cell that creates them.
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

## Hyperparameter Tuning with Optuna 

In [None]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

def _build_catboost(trial):
    """Sample CatBoost hyperparameters from *trial* and return an unfitted model."""
    return CatBoostClassifier(
        depth=trial.suggest_int("cb_depth", 3, 10),
        learning_rate=trial.suggest_float("cb_learning_rate", 0.01, 0.3, log=True),
        iterations=trial.suggest_int("cb_iterations", 50, 300),
        verbose=0,
        random_seed=42,
    )


def _build_xgboost(trial):
    """Sample XGBoost hyperparameters from *trial* and return an unfitted model."""
    return XGBClassifier(
        n_estimators=trial.suggest_int("xgb_n_estimators", 50, 300),
        learning_rate=trial.suggest_float("xgb_learning_rate", 0.01, 0.3, log=True),
        max_depth=trial.suggest_int("xgb_max_depth", 3, 20),
        subsample=trial.suggest_float("xgb_subsample", 0.5, 1.0),
        colsample_bytree=trial.suggest_float("xgb_colsample_bytree", 0.5, 1.0),
        # The target has more than two classes, so the multiclass log-loss is
        # the applicable metric ("logloss" is the binary variant).
        eval_metric="mlogloss",
        tree_method="auto",
        random_state=42,
        n_jobs=-1,
    )


def objective(trial):
    """Optuna objective: mean 3-fold cross-validated accuracy of a sampled model.

    The trial first chooses between CatBoost and XGBoost, then samples that
    classifier's hyperparameters (names prefixed ``cb_`` / ``xgb_``).

    Args:
        trial: The Optuna trial used to sample the classifier and its
            hyperparameters.

    Returns:
        Mean accuracy over the cross-validation folds, evaluated on the
        notebook-level ``X_train`` / ``y_train``.
    """
    # A dispatch table guarantees a model is always built for every name that
    # can be sampled, so `model` can never be left unbound.
    builders = {"CatBoost": _build_catboost, "XGBoost": _build_xgboost}
    classifier_name = trial.suggest_categorical('classifier', list(builders))
    model = builders[classifier_name](trial)

    # Perform cross-validation and return the mean accuracy
    return cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()

In [None]:
import optuna

# Create a study that maximises the accuracy returned by `objective`.
# No sampler was passed here before, so Optuna's default TPE sampler is what
# actually ran (not CMA-ES). It is now named explicitly and seeded so the
# sequence of sampled hyperparameters is repeatable.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)

In [None]:
# Fetch the highest-scoring trial and report its parameters and accuracy.
best_trial = study.best_trial
for caption, detail in (
    ("Best trial parameters:", best_trial.params),
    ("Best trial accuracy:", best_trial.value),
):
    print(caption, detail)

In [None]:
# Tabulate all trials of the study as a DataFrame.
study.trials_dataframe()

In [None]:
# Count how many trials sampled each classifier.
study.trials_dataframe()['params_classifier'].value_counts()

In [None]:
# Mean objective value (cross-validated accuracy) per classifier across trials.
study.trials_dataframe().groupby('params_classifier')['value'].mean()

In [None]:
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_contour, plot_param_importances

In [None]:
# Show how the objective value evolved over the trials of the study.
plot_optimization_history(study).show()

In [None]:
# Show the objective value against each individual hyperparameter.
plot_slice(study).show()

## Making Predictions

In [None]:
# Pull the winning trial's parameter dict and the classifier it selected;
# the 'classifier' key is the one sampled inside `objective`.
best_params = best_trial.params
clf_name = best_params['classifier']

In [None]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Rebuild an unfitted model of the winning classifier type using the
# hyperparameters found by the study.
if clf_name == 'XGBoost':
    best_model = XGBClassifier(
        n_estimators=best_params['xgb_n_estimators'],
        learning_rate=best_params['xgb_learning_rate'],
        max_depth=best_params['xgb_max_depth'],
        subsample=best_params['xgb_subsample'],
        colsample_bytree=best_params['xgb_colsample_bytree'],
        # The target has more than two classes, so the multiclass log-loss is
        # the applicable metric ("logloss" is the binary variant).
        eval_metric='mlogloss',
        tree_method="auto",
        random_state=42,
        n_jobs=-1,
    )
elif clf_name == 'CatBoost':
    best_model = CatBoostClassifier(
        iterations=best_params['cb_iterations'],
        depth=best_params['cb_depth'],
        learning_rate=best_params['cb_learning_rate'],
        random_seed=42,
        verbose=0,
    )
else:
    # Fail here with a clear message instead of a NameError on `best_model`
    # in a later cell.
    raise ValueError(f"Unsupported classifier in best trial: {clf_name!r}")


In [None]:
# Train the selected model on the full (PCA-reduced) training set.
best_model.fit(X_train, y_train)

In [None]:
# Predict a label for every row of the (PCA-reduced) test set.
y_pred = best_model.predict(X_test)

In [None]:
# Collapse the predictions to 1-D so they can serve as a single DataFrame column.
# NOTE(review): presumably needed because CatBoost returns an (n, 1) array
# for multiclass predictions — confirm.
y_pred = y_pred.flatten()

In [None]:
id = np.array(range(1,len(y_pred)+1))
id

In [None]:
results = pd.DataFrame({'ImageId':id,'Label':y_pred})
print(results)
results.to_csv('submission.csv', index=False)