# Ensemble Learning
You should build a machine learning pipeline using an ensemble learning model. In particular, you should do the following:
- Load the `mnist` dataset using [Pandas](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html). You can find this dataset in the datasets folder.
- Split the dataset into training and test sets using [Scikit-Learn](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html). 
- Conduct data exploration, data preprocessing, and feature engineering if necessary. 
- Train and test an ensemble learning model, such as [random forest](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) or [gradient boosting](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html).
- Check the documentation to identify the most important hyperparameters, attributes, and methods of the model. Use them in practice.

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split




In [None]:
# Absolute location of the MNIST CSV on the author's machine.
# NOTE(review): this path is machine-specific; adjust it when running elsewhere.
file_path = '/Users/pahomovskij/PycharmProjects/ml-gisma/datasets/mnist.csv'

# Read the whole CSV into a DataFrame.
mnist = pd.read_csv(filepath_or_buffer=file_path)


In [None]:


# Features are every column except the label ('class') and the row identifier ('id').
X = mnist.drop(columns=['class', 'id'])
y = mnist['class']

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Keep the 10 features with the highest ANOVA F-score.
# The selector is fitted on the training split only, so the test split
# does not influence which features are chosen.
selector = SelectKBest(score_func=f_classif, k=10)
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

print(f"Training set size: {len(X_train_selected)} samples")
print(f"Test set size: {len(X_test_selected)} samples")


In [None]:

# Candidate ensemble models; each is seeded so repeated runs give the same result.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

# Hyperparameter values the randomized search samples from, keyed by model name
# (the keys must match those of `models`).
param_distributions = {
    "Random Forest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 10, 20, 30],
    },
    "Gradient Boosting": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 7],
    },
}

# Shuffled 5-fold cross-validation, shared by both searches so that their
# scores are computed on the same folds.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

results = []
for name, model in models.items():
    # n_iter=12 covers the whole 3 x 4 Random Forest grid and samples 12 of the
    # 27 Gradient Boosting combinations.
    randomized_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_distributions[name],
        n_iter=12,
        cv=cv,
        n_jobs=-1,
        random_state=42,
    )

    # Fit on the SelectKBest-reduced features. Previously the search was fitted
    # on the raw X_train, which left X_train_selected / X_test_selected unused.
    randomized_search.fit(X_train_selected, y_train)

    results.append(
        {
            "model": name,
            "best_params": randomized_search.best_params_,
            "best_score": randomized_search.best_score_,
            # With the default refit=True, score() evaluates the best estimator
            # (refitted on the full training split) on the held-out test data.
            "test_score": randomized_search.score(X_test_selected, y_test),
        }
    )


# Report cross-validated and held-out performance for each model.
for result in results:
    print(f"Model: {result['model']}")
    print(f"Best Params: {result['best_params']}")
    print(f"Best CV Score: {result['best_score']}")
    print(f"Test Score: {result['test_score']}\n")