# Day 09. Exercise 04
# Pipelines and OOP

## 0. Imports

In [1]:
import pandas as pd
from tqdm import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.tree import  DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## 1. Preprocessing pipeline

Create three custom transformers, the first two out of which will be used within a [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html).

1. `FeatureExtractor()` class:
 - Takes a dataframe with `uid`, `labname`, `numTrials`, `timestamp` from the file [`checker_submits.csv`](https://drive.google.com/file/d/14voc4fNJZiLEFaZyd8nEG-lQt5JjatYw/view?usp=sharing).
 - Extracts `hour` from `timestamp`.
 - Extracts `weekday` from `timestamp` (numbers).
 - Drops the `timestamp` column.
 - Returns the new dataframe.


2. `MyOneHotEncoder()` class:
 - Takes the dataframe from the result of the previous transformation and the name of the target column.
 - Identifies all the categorical features and transforms them with `OneHotEncoder()`. If the target column is categorical too, then the transformation should not apply to it.
 - Drops the initial categorical features.
 - Returns the dataframe with the features and the series with the target column.


3. `TrainValidationTest()` class:
 - Takes `X` and `y`.
 - Returns `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` (`test_size=0.2`, `random_state=21`, `stratified`).


In [2]:
class FeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self) -> None:
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):

        X = X.copy()
        
        X['timestamp'] = pd.to_datetime(X['timestamp'])
        
        X['hour'] = X['timestamp'].dt.hour
        X['dayofweek'] = X['timestamp'].dt.weekday
        
        X = X.drop('timestamp', axis=1)
        
        return X

In [3]:
class MyOneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, target_column) -> None:
        self.target = target_column
        self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.categorical_features = None
        self.columns_to_encode = []

    def fit(self, X, y=None):
        return self

    def transform(self, X: pd.DataFrame):
        df = X.copy()
        X_df = df.drop(columns=self.target)
        y = df[self.target]

        # Определяем категориальные признаки
        if self.categorical_features is None:
            # Автоматическое определение: object или category dtype
            self.columns_to_encode = X_df.select_dtypes(include=['object', 'category']).columns.tolist()
        else:
            self.columns_to_encode = self.categorical_features
        
        if len(self.columns_to_encode) > 0:
            # Обучаем OneHotEncoder на этих признаках
            self.encoder.fit(X_df[self.columns_to_encode])

        if len(self.columns_to_encode) == 0:
            return X_df, y
        
        # Применяем OneHotEncoder
        encoded_data = self.encoder.transform(X_df[self.columns_to_encode])
        
        # Создаем имена для новых колонок
        feature_names = []
        for i, col in enumerate(self.columns_to_encode):
            for cat in self.encoder.categories_[i]:
                feature_names.append(f"{col}_{cat}")
        
        # Создаем DataFrame с закодированными признаками
        encoded_df = pd.DataFrame(encoded_data, columns=feature_names, index=X_df.index)
        
        # Удаляем исходные категориальные колонки
        X_df = X_df.drop(columns=self.columns_to_encode)
        
        # Объединяем с закодированными
        X_df = pd.concat([X_df, encoded_df], axis=1)
        
        return X_df, y


In [4]:
class TrainValidationTest:
    def __init__(self) -> None:
        pass

    def fit(self, X, y):
        self.X = X
        self.y = y
        
        return self

    def transform(self):
        X_train1, X_test, y_train1, y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=21, stratify=self.y)
        X_train, X_valid, y_train, y_valid = train_test_split(X_train1, y_train1, test_size=0.2, random_state=21, stratify=y_train1)

        return X_train, X_valid, X_test, y_train, y_valid, y_test
    
    def fit_transform(self, X, y):
        self.fit(X, y)

        X_train, X_valid, X_test, y_train, y_valid, y_test = self.transform()
        return X_train, X_valid, X_test, y_train, y_valid, y_test

## 2. Model selection pipeline

`ModelSelection()` class

 - Takes a list of `GridSearchCV` instances and a dict where the keys are the indexes from that list and the values are the names of the models, the example is below in the reverse order (from high-level to low-level perspective):

```
ModelSelection(grids, grid_dict)

grids = [gs_svm, gs_tree, gs_rf]

gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=jobs), where jobs you can specify by yourself

svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma': ['scale', 'auto'], 'class_weight':('balanced', None), 'random_state':[21], 'probability':[True]}]
```

 - Method `choose()` takes `X_train`, `y_train`, `X_valid`, `y_valid` and returns the name of the best classifier among all the models on the validation set
 - Method `best_results()` returns a dataframe with the columns `model`, `params`, `valid_score` where the rows are the best models within each class of models.

```
model	params	valid_score
0	SVM	{'C': 10, 'class_weight': None, 'gamma': 'auto...	0.877778
1	Decision Tree	{'class_weight': 'balanced', 'criterion': 'gin...	0.866667
2	Random Forest	{'class_weight': None, 'criterion': 'entropy',...	0.907407
```

 - When you iterate through the parameters of a model class, print the name of that class and show the progress using `tqdm.notebook`, in the end of the cycle print the best model of that class.

```
Estimator: SVM
100%
125/125 [01:32<00:00, 1.36it/s]
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.773
Validation set accuracy score for best params: 0.878 

Estimator: Decision Tree
100%
57/57 [01:07<00:00, 1.22it/s]
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Best training accuracy: 0.801
Validation set accuracy score for best params: 0.867 

Estimator: Random Forest
100%
284/284 [06:47<00:00, 1.13s/it]
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 22, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.855
Validation set accuracy score for best params: 0.907 

Classifier with best validation set accuracy: Random Forest
```

In [5]:
class ModelSelection:
    def __init__(self, grids, grid_dict) -> None:
        self.grids = grids
        self.grid_dict = grid_dict
        self.best = pd.DataFrame()

    def choose(self, X_train, y_train, X_valid, y_valid):
        i = 0
        res = pd.DataFrame(columns=['model', 'params', 'valid_score'])
        for gs in self.grids:
            print(f'{i + 1}/{len(self.grid_dict)} | {gs.estimator}', end='')
            gs.fit(X_train, y_train)
            results_df = pd.DataFrame(gs.cv_results_)
            results_df = results_df.sort_values('rank_test_score')
            results_df = results_df.reset_index()
            best_params = results_df['params'].loc[0]
            model = gs.estimator
            model.set_params(**best_params)
            model.fit(X_train, y_train)
            pred = model.predict(X_valid)
            acc = accuracy_score(y_valid, pred)

            print(' | Done')
            print(f'Best params: {best_params}')
            print(f'Best training accuracy: {results_df['mean_test_score'].loc[0]}')
            print(f'Validation set accuracy score for best params: {acc}') 
            print()

            res.loc[len(res)] = [self.grid_dict[i], best_params, acc]
            i += 1
        
        self.best = res
    
    def best_results(self):
        if len(self.best) > 0:
            return self.best
        else:
            return 'No models checked'

## 3. Finalization

`Finalize()` class
 - Takes an estimator.
 - Method `final_score()` takes `X_train`, `y_train`, `X_test`, `y_test` and returns the accuracy of the model as in the example below:
```
final.final_score(X_train, y_train, X_test, y_test)
Accuracy of the final model is 0.908284023668639
```
 - Method `save_model()` takes a path, saves the model to this path and prints that the model was successfully saved.

In [6]:
import joblib 

class Finalize:
    def __init__(self, estimator) -> None:
        self.model = estimator

    def final_score(self, X_train, y_train, X_test, y_test):
        self.model.fit(X_train, y_train)
        pred = self.model.predict(X_test)
        accuracy = accuracy_score(y_test, pred)

        return f'Accuracy of the final model is {accuracy}'

    def save_model(self, path):
        joblib.dump(self.model, path)

## 4. Main program

1. Load the data from the file (****name of file****).
2. Create the preprocessing pipeline that consists of two custom transformers: `FeatureExtractor()` and `MyOneHotEncoder()`:
```
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])
```
3. Use that pipeline and its method `fit_transform()` on the initial dataset.
```
data = preprocessing.fit_transform(df)
```
4. Get `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` using `TrainValidationTest()` and the result of the pipeline.
5. Create an instance of `ModelSelection()`, use the method `choose()` applying it to the models that you want and parameters that you want, get the dataframe of the best results.
6. create an instance of `Finalize()` with your best model, use method `final_score()` and save the model in the format: `name_of_the_model_{accuracy on test dataset}.sav`.

That is it, congrats!

In [7]:
df = pd.read_csv('../data/checker_submits.csv')

In [8]:
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])

In [9]:
data = preprocessing.fit_transform(df)

In [10]:
tvt = TrainValidationTest()
X_train, X_valid, X_test, y_train, y_valid, y_test = tvt.fit_transform(data[0], data[1])

In [11]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1078 entries, 862 to 687
Data columns (total 43 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   numTrials         1078 non-null   int64  
 1   hour              1078 non-null   int32  
 2   uid_user_0        1078 non-null   float64
 3   uid_user_1        1078 non-null   float64
 4   uid_user_10       1078 non-null   float64
 5   uid_user_11       1078 non-null   float64
 6   uid_user_12       1078 non-null   float64
 7   uid_user_13       1078 non-null   float64
 8   uid_user_14       1078 non-null   float64
 9   uid_user_15       1078 non-null   float64
 10  uid_user_16       1078 non-null   float64
 11  uid_user_17       1078 non-null   float64
 12  uid_user_18       1078 non-null   float64
 13  uid_user_19       1078 non-null   float64
 14  uid_user_2        1078 non-null   float64
 15  uid_user_20       1078 non-null   float64
 16  uid_user_21       1078 non-null   float64
 17 

In [12]:
y_train.info()

<class 'pandas.core.series.Series'>
Index: 1078 entries, 862 to 687
Series name: dayofweek
Non-Null Count  Dtype
--------------  -----
1078 non-null   int32
dtypes: int32(1)
memory usage: 12.6 KB


In [13]:
X_valid.info()

<class 'pandas.core.frame.DataFrame'>
Index: 270 entries, 1053 to 744
Data columns (total 43 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   numTrials         270 non-null    int64  
 1   hour              270 non-null    int32  
 2   uid_user_0        270 non-null    float64
 3   uid_user_1        270 non-null    float64
 4   uid_user_10       270 non-null    float64
 5   uid_user_11       270 non-null    float64
 6   uid_user_12       270 non-null    float64
 7   uid_user_13       270 non-null    float64
 8   uid_user_14       270 non-null    float64
 9   uid_user_15       270 non-null    float64
 10  uid_user_16       270 non-null    float64
 11  uid_user_17       270 non-null    float64
 12  uid_user_18       270 non-null    float64
 13  uid_user_19       270 non-null    float64
 14  uid_user_2        270 non-null    float64
 15  uid_user_20       270 non-null    float64
 16  uid_user_21       270 non-null    float64
 17 

In [14]:
y_valid.info()

<class 'pandas.core.series.Series'>
Index: 270 entries, 1053 to 744
Series name: dayofweek
Non-Null Count  Dtype
--------------  -----
270 non-null    int32
dtypes: int32(1)
memory usage: 3.2 KB


In [15]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 338 entries, 1087 to 1243
Data columns (total 43 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   numTrials         338 non-null    int64  
 1   hour              338 non-null    int32  
 2   uid_user_0        338 non-null    float64
 3   uid_user_1        338 non-null    float64
 4   uid_user_10       338 non-null    float64
 5   uid_user_11       338 non-null    float64
 6   uid_user_12       338 non-null    float64
 7   uid_user_13       338 non-null    float64
 8   uid_user_14       338 non-null    float64
 9   uid_user_15       338 non-null    float64
 10  uid_user_16       338 non-null    float64
 11  uid_user_17       338 non-null    float64
 12  uid_user_18       338 non-null    float64
 13  uid_user_19       338 non-null    float64
 14  uid_user_2        338 non-null    float64
 15  uid_user_20       338 non-null    float64
 16  uid_user_21       338 non-null    float64
 17

In [16]:
y_test.info()

<class 'pandas.core.series.Series'>
Index: 338 entries, 1087 to 1243
Series name: dayofweek
Non-Null Count  Dtype
--------------  -----
338 non-null    int32
dtypes: int32(1)
memory usage: 4.0 KB


In [17]:
y_train.value_counts()

dayofweek
3    253
6    228
1    175
5    174
2     95
0     87
4     66
Name: count, dtype: int64

In [18]:
y_valid.value_counts()

dayofweek
3    63
6    57
1    44
5    43
2    24
0    22
4    17
Name: count, dtype: int64

In [19]:
y_test.value_counts()

dayofweek
3    80
6    71
1    55
5    54
2    30
0    27
4    21
Name: count, dtype: int64

In [20]:
svm = SVC()
dtree = DecisionTreeClassifier()
rfor = RandomForestClassifier()

svm_params = [
    {
        'kernel':['linear', 'rbf', 'sigmoid'], 
        'C':[0.01, 0.1, 1, 1.5, 5, 10], 
        'gamma': ['scale', 'auto'], 
        'class_weight':['balanced', None], 
        'random_state':[21], 
        'probability':[True]
    }
]
gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=4) 

dtree_params = [
    {
        'criterion': ['gini', 'entropy', 'log_loss'],
        'splitter': ['best', 'random'],
        'max_depth': [None, 10, 20, 30, 50],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': [None, 'sqrt', 'log2'],
        'class_weight': [None, 'balanced'],
        'random_state': [21]
    }
]
gs_tree = GridSearchCV(estimator=dtree, param_grid=dtree_params, scoring='accuracy', cv=2, n_jobs=4) 

rfor_params = [
    {
        'n_estimators': [50, 100, 120],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None],
        'bootstrap': [True, False],
        'class_weight': [None, 'balanced'],
        'random_state': [21],
    }
]
gs_rf = GridSearchCV(estimator=rfor, param_grid=rfor_params, scoring='accuracy', cv=2, n_jobs=4) 

grids = [gs_svm, gs_tree, gs_rf]

grid_dict = {
    0: 'SVM',
    1: 'Desicion Tree',
    2: 'Random Forest'
}

ms = ModelSelection(grids, grid_dict)

In [21]:
ms.choose(X_train, y_train, X_valid, y_valid)

1/3 | SVC() | Done
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.7727272727272727
Validation set accuracy score for best params: 0.8777777777777778

2/3 | DecisionTreeClassifier() | Done
Best params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 20, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 21, 'splitter': 'random'}
Best training accuracy: 0.8311688311688312
Validation set accuracy score for best params: 0.8814814814814815

3/3 | RandomForestClassifier() | Done
Best params: {'bootstrap': False, 'class_weight': None, 'criterion': 'gini', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 21}
Best training accuracy: 0.8682745825602969
Validation set accuracy score for best params: 0.8888888888888888



In [22]:
ms.best_results()

Unnamed: 0,model,params,valid_score
0,SVM,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.877778
1,Desicion Tree,"{'class_weight': None, 'criterion': 'gini', 'm...",0.881481
2,Random Forest,"{'bootstrap': False, 'class_weight': None, 'cr...",0.888889


In [23]:
final = Finalize(RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini', max_depth=20, max_features='sqrt', min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=21))

In [24]:
final.final_score(X_train, y_train, X_test, y_test)

'Accuracy of the final model is 0.9053254437869822'

In [25]:
final.save_model('random_forest_0_9053.sav')