# Day 09. Exercise 04
# Pipelines and OOP

## 0. Imports

In [1]:
import pandas as pd
import tqdm
import joblib
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

## 1. Preprocessing pipeline

Create three custom transformers, the first two out of which will be used within a [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html).

1. `FeatureExtractor()` class:
 - Takes a dataframe with `uid`, `labname`, `numTrials`, `timestamp` from the file [`checker_submits.csv`](https://drive.google.com/file/d/14voc4fNJZiLEFaZyd8nEG-lQt5JjatYw/view?usp=sharing).
 - Extracts `hour` from `timestamp`.
 - Extracts `weekday` from `timestamp` (numbers).
 - Drops the `timestamp` column.
 - Returns the new dataframe.


2. `MyOneHotEncoder()` class:
 - Takes the dataframe from the result of the previous transformation and the name of the target column.
 - Identifies all the categorical features and transforms them with `OneHotEncoder()`. If the target column is categorical too, then the transformation should not apply to it.
 - Drops the initial categorical features.
 - Returns the dataframe with the features and the series with the target column.


3. `TrainValidationTest()` class:
 - Takes `X` and `y`.
 - Returns `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` (`test_size=0.2`, `random_state=21`, `stratified`).


In [2]:
class FeatureExtractor():
    def fit(self, X, y=None):
        return self
    
    def transform(self, df): 
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df['hour'] = df['timestamp'].dt.hour
        df['weekday'] = df['timestamp'].dt.dayofweek
        df = df.drop(['timestamp'],axis=1)
        return df

In [3]:
class MyOneHotEncoder():
    def __init__(self, target):
        self.target = target
        self.encoder = OneHotEncoder(sparse_output=False)

    def fit(self, X, y=None):
        return self

    def transform(self, df):
        y_target = df[self.target]
        
        categorial_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
        encoded_features = self.encoder.fit_transform(df[categorial_features])
        encoded_df = pd.DataFrame(encoded_features, columns=self.encoder.get_feature_names_out(categorial_features))

        df = df.drop(columns=categorial_features + [self.target])
        df = pd.concat([df.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

        return df, y_target

In [4]:
class TrainValidationTest():
    def my_train_test_split(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=21, test_size=0.2, stratify=y)
        X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, random_state=21, test_size=0.2, stratify=y_train)
        
        return X_train, X_valid, X_test, y_train, y_valid, y_test

## 2. Model selection pipeline

`ModelSelection()` class

 - Takes a list of `GridSearchCV` instances and a dict where the keys are the indexes from that list and the values are the names of the models, the example is below in the reverse order (from high-level to low-level perspective):

```
ModelSelection(grids, grid_dict)

grids = [gs_svm, gs_tree, gs_rf]

gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=jobs), where jobs you can specify by yourself

svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma': ['scale', 'auto'], 'class_weight':('balanced', None), 'random_state':[21], 'probability':[True]}]
```

 - Method `choose()` takes `X_train`, `y_train`, `X_valid`, `y_valid` and returns the name of the best classifier among all the models on the validation set
 - Method `best_results()` returns a dataframe with the columns `model`, `params`, `valid_score` where the rows are the best models within each class of models.

```
model	params	valid_score
0	SVM	{'C': 10, 'class_weight': None, 'gamma': 'auto...	0.772727
1	Decision Tree	{'class_weight': 'balanced', 'criterion': 'gin...	0.801484
2	Random Forest	{'class_weight': None, 'criterion': 'entropy',...	0.855288
```

 - When you iterate through the parameters of a model class, print the name of that class and show the progress using `tqdm.notebook`, in the end of the cycle print the best model of that class.

```
Estimator: SVM
100%
125/125 [01:32<00:00, 1.36it/s]
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.773
Validation set accuracy score for best params: 0.878 

Estimator: Decision Tree
100%
57/57 [01:07<00:00, 1.22it/s]
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Best training accuracy: 0.801
Validation set accuracy score for best params: 0.867 

Estimator: Random Forest
100%
284/284 [06:47<00:00, 1.13s/it]
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 22, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.855
Validation set accuracy score for best params: 0.907 

Classifier with best validation set accuracy: Random Forest
```

In [5]:
class ModelSelection():
    def __init__(self, grids, grid_dict):
        self.grids = grids
        self.grid_dict = grid_dict

    def choose(self, X_train, y_train, X_valid, y_valid):
        best_accuracy = 0
        best_model_name = ''
        
        for grid_index, model_name in self.grid_dict.items():
            grid_model = self.grids[grid_index]
            grid_model.fit(X_train, y_train)

            best_model = grid_model.best_estimator_
            y_pred = best_model.predict(X_valid)

            current_accuracy = accuracy_score(y_valid, y_pred)

            if current_accuracy > best_accuracy:
                best_accuracy = current_accuracy
                best_model_name = model_name

        return best_model_name

    def best_results(self, X_train, y_train, X_valid, y_valid):
        models_df = pd.DataFrame(columns=['model', 'params', 'valid_score'])

        for grid_index, model_name in self.grid_dict.items():
            print(f'Estimator: {model_name}')
            
            grid_model = self.grids[grid_index]

            with tqdm(total=len(grid_model.param_grid)) as pbar:
                grid_model.fit(X_train, y_train) 
                for _ in grid_model.cv_results_['params']:
                    pbar.update(1)  
            
            best_model = grid_model.best_estimator_

            y_train_pred = best_model.predict(X_train)
            y_valid_pred = best_model.predict(X_valid)
            
            best_params = grid_model.best_params_
            best_train_accuracy = accuracy_score(y_train, y_train_pred)
            best_valid_accuracy = accuracy_score(y_valid, y_valid_pred)

            print(f'Best params: {best_params}')
            print(f'Best training accuracy: {best_train_accuracy:.3f}')
            print(f'Validation set accuracy score for best params: {best_valid_accuracy:.3f}')
            
            new_row = {'model': model_name, 'params': best_params, 'valid_score': best_valid_accuracy}
            
            if new_row and any(new_row.values()):
                models_df = pd.concat([models_df, pd.DataFrame([new_row])], ignore_index=True)


        max_accuracy_ind = models_df['valid_score'].idxmax()
        best_accuracy_model = models_df.loc[max_accuracy_ind, 'model']

        print(f'Classifier with best validation set accuracy: {best_accuracy_model}')

        return models_df

## 3. Finalization

`Finalize()` class
 - Takes an estimator.
 - Method `final_score()` takes `X_train`, `y_train`, `X_test`, `y_test` and returns the accuracy of the model as in the example below:
```
final.final_score(X_train, y_train, X_test, y_test)
Accuracy of the final model is 0.908284023668639
```
 - Method `save_model()` takes a path, saves the model to this path and prints that the model was successfully saved.

In [22]:
class Finalize():
    def __init__(self, estimator):
        self.estimator = estimator
        self.accuracy = 0

    def final_score(self, X_train, y_train, X_test, y_test):
        self.estimator.fit(X_train, y_train)
        y_pred = self.estimator.predict(X_test)

        self.accuracy = accuracy_score(y_test, y_pred)
        print(f'Accuracy of the final model is {self.accuracy}')

    def save_model(self, path):
        try:
            joblib.dump(self.estimator, f'{path}.sav')
            print('Model saved successfully')      
        except Exception as e:
            print(f'Error : {e}')

## 4. Main program

1. Load the data from the file (****name of file****).
2. Create the preprocessing pipeline that consists of two custom transformers: `FeatureExtractor()` and `MyOneHotEncoder()`:
```
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])
```
3. Use that pipeline and its method `fit_transform()` on the initial dataset.
```
data = preprocessing.fit_transform(df)
```
4. Get `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` using `TrainValidationTest()` and the result of the pipeline.
5. Create an instance of `ModelSelection()`, use the method `choose()` applying it to the models that you want and parameters that you want, get the dataframe of the best results.
6. create an instance of `Finalize()` with your best model, use method `final_score()` and save the model in the format: `name_of_the_model_{accuracy on test dataset}.sav`.

That is it, congrats!

In [7]:
df = pd.read_csv('../data/checker_submits.csv')
df.head()

Unnamed: 0,uid,labname,numTrials,timestamp
0,user_4,project1,1,2020-04-17 05:19:02.744528
1,user_4,project1,2,2020-04-17 05:22:45.549397
2,user_4,project1,3,2020-04-17 05:34:24.422370
3,user_4,project1,4,2020-04-17 05:43:27.773992
4,user_4,project1,5,2020-04-17 05:46:32.275104


In [8]:
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('weekday'))])

In [9]:
data, y = preprocessing.fit_transform(df)
data.head()

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
X_train, X_valid, X_test, y_train, y_valid, y_test = TrainValidationTest().my_train_test_split(data, y)

In [13]:
svc = SVC(probability=True, random_state=21)
tree = DecisionTreeClassifier(random_state=21)
forest = RandomForestClassifier(random_state=21)

svm_params = {
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
}

tree_params = {
    'max_depth': [1, 5, 10, 20, 40], 
    'class_weight': [None, 'balanced'],
    'criterion': ['gini', 'entropy']
}

forest_params = {
    'n_estimators': [5, 10, 50, 100], 
    'max_depth': list(range(1, 50)),  
    'class_weight': [None, 'balanced'],
    'criterion': ['gini', 'entropy']
}

grid_frc = GridSearchCV(forest, param_grid=forest_params, scoring='accuracy', cv=10, n_jobs=-1)
grid_dtc = GridSearchCV(tree, param_grid=tree_params, scoring='accuracy', cv=10)
grid_svm = GridSearchCV(svc, param_grid=svm_params, scoring='accuracy', cv=5, n_jobs=-1)

grids = [grid_frc, grid_dtc, grid_svm]
grid_dict = {
    0: 'SVC',
    1: 'DecisionTreeClassifier',
    2: 'RandomForestClassifier'
}

In [None]:
model_selection = ModelSelection(grids, grid_dict)
model_selection.best_results(X_train, y_train, X_valid, y_valid)

Estimator: SVC


  0%|          | 0/4 [00:00<?, ?it/s]

Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 25, 'n_estimators': 100}
Best training accuracy: 1.000
Validation set accuracy score for best params: 0.904
Estimator: DecisionTreeClassifier


  models_df = pd.concat([models_df, pd.DataFrame([new_row])], ignore_index=True)


  0%|          | 0/3 [00:00<?, ?it/s]

Best params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 40}
Best training accuracy: 1.000
Validation set accuracy score for best params: 0.867
Estimator: RandomForestClassifier


  0%|          | 0/1 [00:00<?, ?it/s]

Best params: {'C': 10}
Best training accuracy: 0.431
Validation set accuracy score for best params: 0.389
Classifier with best validation set accuracy: SVC


Unnamed: 0,model,params,valid_score
0,SVC,"{'class_weight': 'balanced', 'criterion': 'gin...",0.903704
1,DecisionTreeClassifier,"{'class_weight': None, 'criterion': 'gini', 'm...",0.866667
2,RandomForestClassifier,{'C': 10},0.388889


In [23]:
best_model = RandomForestClassifier(random_state=21, class_weight='balanced', criterion='gini', max_depth=25, n_estimators=100)

final = Finalize(best_model)
final.final_score(X_train, y_train, X_test, y_test)

Accuracy of the final model is 0.9142011834319527


In [24]:
final.save_model('best_my_model')

Model saved successfully
