# IAU projekt

> Rovnakým podielom práce vypracovali: <br>
> Roman Bitarovský, Emma Macháčová

In [1]:
# NOTE(review): `pause` is not defined anywhere, so this cell raises NameError
# (see the recorded output) and stops a "Run All" at this point - presumably
# intentional; remove the cell before sharing the notebook.
pause

NameError: name 'pause' is not defined

## Table of contents <a name="obsah"></a>
* [Zadanie](#zadanie)
    * [Slovníček](#slovnicek)
   
* [Data init (Fáza 1)](#dataInit)
* [Fáza 2](#faza2)
    * [2.1. Integrácia a čistenie dát](#2.1.)
        * [2.1.1. Replacing NaNs](#2.1.1.)
            * [2.1.1.1. Replacing NaNs - Method 1: Drop nans](#2.1.1.a)
            * [2.1.1.2. Replacing NaNs - Method 2: Replace with Mean](#2.1.1.b)
            * [2.1.1.3. Replacing NaNs - Method 3: Replace with Median](#2.1.1.c)
            * [2.1.1.4. Replacing NaNs - Method 4: Replace with kNN](#2.1.1.d)
        * [2.1.2. Deleting Outliers Values](#2.1.2.)
    * [2.2. Realizácia predspracovania dát](#2.2.)  
        * [2.2.1. Transformovanie a škálovanie dát](#2.2.1.)
        * [2.2.2. Rozdelenie dát](#2.2.2.)
        * [2.2.3. Zhodnotenie ](#2.2.3.)
    * [2.3. Výber atribútov pre strojové učenie](#2.3.)  
        * [2.3.1. Variance Threshold ](#2.3.1.)
        * [2.3.2. SelectKBest](#2.3.2.)
        * [2.3.3 SelectPercentile](#2.3.3.)
        * [2.3.4. Záver výberov](#2.3.4.)
    * [2.4. Replikovateľnosť predspracovania](#2.4.)  
        * [2.4.1. Code improvements](#2.4.1.)
        * [2.4.2. Pipeline](#2.4.2.)

# Zadanie <a name="zadanie"></a>
Znečistenie ovzdušia spôsobuje vážne dýchacie a srdcové ochorenia, ktoré môžu byť smrteľné. Najčastejšie sú postihnuté deti, čo vedie k zápalu pľúc a problémom s dýchaním vrátane astmy. Kyslé dažde, ničenie ozónovej vrstvy a globálne otepľovanie sú niektoré z nepriaznivých dôsledkov. Dátová sada pre Vás (World's Air Pollution: Real-time Air Quality Index https://waqi.info/) predstavuje záznamy jednotlivých meraní kvality ovzdušia ako kombináciu mnohých faktorov bez časovej následnosti. V záznamoch je závislá premenná s menom “warning” indikujúca alarmujúci stav kvality ovzdušia. Vo veľkých mestách ako napr. Peking (angl. Beijing, hlavné mesto Číny s viac ako 21 miliónmi ľudí) sa pri varovaní spustí opatrenie ako obmedzenie pohybu áut a ľudí v meste alebo umelý dážď, až kým sa kvalita vzduchu nevráti do normy.

* Úlohou je predikovať závislé hodnoty premennej “warning” pomocou metód strojového učenia.
* Pritom sa treba vysporiadať s viacerými problémami, ktoré sa v dátach nachádzajú ako formáty dát, chýbajúce, vychýlené hodnoty a pod.

## Slovníček  <a name="slovnicek"></a>
<details>
    <summary>Zobraziť</summary>
    
    PM2.5 - Particulate Matter (µg/m3) 
    PM10 - Particulate Matter (µg/m3) 
    NOx - Nitrogen Oxides (µg/m3)
    NO2 - Nitrogen Dioxide (µg/m3)
    SO2 - Sulfur Dioxide  (µg/m3)
    CO - Carbon Monoxide emissions  (µg/m3)
    CO2 - Carbon Dioxide  (µg/m3)
    PAHs - Polycyclic Aromatic Hydrocarbons  (µg/m3)
    NH3 - Ammonia trace  (µg/m3)
    Pb - Lead  (µg/m3)
    TEMP - Temperature (degree Celsius)
    DEWP - Dew point temperature (degree Celsius)
    PRES - Pressure (hPa, <100, 1050>)
    RAIN - Rain (mm)
    WSPM - Wind Speed (m/s)
    WD - Wind Direction
    VOC - Volatile Organic Compounds
    CFCs - Chlorofluorocarbons
    C2H3NO5 - Peroxyacetyl nitrate
    H2CO - Plywood emit formaldehyde
    GSTM1 - Glutathione-S transferase M1
    1-OHP - 1-hydroxypyrene
    2-OHF - 2-hydroxyfluorene
    2-OHNa - 2-hydroxynaphthalene
    N2 - Nitrogen
    O2 - Oxygen
    O3 - Ozone
    Ar - Argon
    Ne - Neon
    CH4 - Methane
    He - Helium
    Kr - Krypton
    I2 - Iodine
    H2 - Hydrogen
    Xe - Xenon
</details>

# Data init <a name="dataInit"></a>

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.stats as sm_stats
from scipy.stats import mannwhitneyu
from scipy.stats import f_oneway

import datetime
import re
import category_encoders as ce
from sklearn.impute import SimpleImputer, KNNImputer
from numpy import percentile

from sklearn.preprocessing import PowerTransformer, QuantileTransformer

from sklearn.feature_selection import VarianceThreshold, SelectKBest, SelectPercentile, SelectFromModel
from sklearn.feature_selection import mutual_info_regression, chi2, f_regression, f_classif
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [None]:
# Load the two tab-separated source tables (paths are relative to the notebook).
labor_measurements = pd.read_csv('../081/measurements.csv', sep='\t')
labor_stations = pd.read_csv('../081/stations.csv', sep='\t')

In [None]:
# Normalise the QoS labels and parse the revision date.
# The result is assigned back instead of calling ``inplace=True`` on a column
# selection: under pandas copy-on-write the chained in-place form modifies a
# temporary object and leaves ``labor_stations`` untouched.
labor_stations["QoS"] = labor_stations["QoS"].replace({"acceptable": "accep", "maitennce": "maintenance"})
labor_stations['revision'] = pd.to_datetime(labor_stations['revision'], utc=False)

# Treat empty strings as missing values.
labor_measurements = labor_measurements.replace('', np.nan)
labor_stations = labor_stations.replace('', np.nan)

labor_measurements = labor_measurements.drop_duplicates()
labor_stations = labor_stations.drop_duplicates()

# Merge preprocessing: drop the station columns that are not needed after the
# merge, then de-duplicate again because rows may now coincide.
labor_stations = labor_stations.drop(columns=['revision', 'code', 'QoS'])
labor_stations = labor_stations.drop_duplicates()

# Table merge: attach the station data to each measurement via its coordinates.
df = pd.merge(labor_measurements, labor_stations, how='inner', left_on=['latitude', 'longitude'], right_on=['latitude', 'longitude'])

# The coordinates were only the join key; keep the analysed columns in a fixed order.
df = df.drop(columns=['latitude', 'longitude'])
df = df[['location', 'warning', 'TEMP', 'PRES', 'PM2.5', 'NOx', 'PM10', 'C2H3NO5', 'CH4', 'Pb', 'NH3', 'SO2', 'O3', 'CO', 'PAHs', 'H2CO', 'CFCs']]

df.head()

In [None]:
df.info()

# Fáza 2 Predspracovanie údajov <a name="faza2"></a> 
 
# 2.1 Integrácia a čistenie dát (5b) <a name="2.1."></a>
Transformujte dáta na vhodný formát pre strojové učenie t.j. jedno pozorovanie musí byť opísané jedným riadkom a každý atribút musí byť v numerickom formáte. 
* Pri riešení chýbajúcich hodnôt (missing values) vyskúšajte rôzne stratégie ako napr.
    * odstránenie pozorovaní s chýbajúcimi údajmi
    * nahradenie chýbajúcej hodnoty mediánom, priemerom, pomerom (ku korelovanému atribútu), alebo pomocou lineárnej regresie resp. kNN
* Podobne postupujte aj pri riešení vychýlených hodnôt (outlier detection):
    * odstránenie vychýlených (odľahlých) pozorovaní
    * nahradenie vychýlenej hodnoty hraničnými hodnotami rozdelenia (5% resp. 95%)
  
<b>Go to:</b>  
* [Naspäť na Obsah](#obsah)
* [Replacing NaNs - Method 1: Drop nans](#2.1.1.a)
* [Replacing NaNs - Method 2: Replace with Mean](#2.1.1.b)
* [Replacing NaNs - Method 3: Replace with Median](#2.1.1.c)
* [Replacing NaNs - Method 4: Replace with kNN](#2.1.1.d)
* [Deleting Outliers Values](#2.1.2.)

In [None]:
# Keep an untouched copy of the merged frame; it is the input of the pipeline run at the end.
df_not_changed = df.copy()

## 2.1.1. Replacing NaNs <a name="2.1.1."></a>

In [None]:
def replaceNaN(df_original, strategy):
    """Return a copy of ``df_original`` with missing values handled by ``strategy``.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Frame to clean; it is not modified. For the imputing strategies the
        whole frame is passed to the scikit-learn imputer, so every column
        must be numeric.
    strategy : {'drop_na', 'mean', 'median', 'kNN'}
        'drop_na' removes rows that contain a NaN, 'mean' / 'median' fill each
        column with its own statistic, 'kNN' fills from the 5 nearest rows.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the supported names.

    Side effects: prints the remaining NaN count per column and draws a
    heatmap of how the pairwise correlations changed against the input.
    """
    if strategy not in ('drop_na', 'mean', 'median', 'kNN'):
        raise ValueError(
            "strategy must be one of 'drop_na', 'mean', 'median', 'kNN'; got %r" % (strategy,)
        )

    df = df_original.copy()

    na_cols = df.columns[df.isnull().any()].tolist()

    if strategy == 'drop_na':
        df = df.dropna()

    elif na_cols:
        if strategy == 'kNN':
            imp_strategy = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
        else:
            imp_strategy = SimpleImputer(missing_values=np.nan, strategy=strategy)

        # Fit on the whole frame: kNN needs the other columns to measure the
        # distance between rows. Fitted on a single column it has no feature
        # left to compare and falls back to the column mean.
        imputed = pd.DataFrame(imp_strategy.fit_transform(df), columns=df.columns, index=df.index)

        # Only the columns that had gaps are written back, so complete
        # columns keep their original dtype.
        for col in na_cols:
            df[col] = imputed[col]

    print(df.isnull().sum())

    fig, ax = plt.subplots(figsize=(16,8))
    corr_diff = df.corr() - df_original.corr()
    # Differences below 1e-4 are masked out so only visible changes are annotated.
    sns.heatmap(corr_diff[abs(corr_diff) > 0.000099], ax=ax, annot=True, fmt=".4f")

    return df

In [None]:
# prekodovanie textu locacie n číslo 
# Encode the textual `location` column as integer codes.
ce_ordinal = ce.OrdinalEncoder(cols=['location'])
df_ml = ce_ordinal.fit_transform(df)
df_ml

### Replacing NaNs - Method 1: Drop nans <a name="2.1.1.a"></a>

In [None]:
# Strategy 1: drop every row that contains a missing value.
df_ml_dropedNa = replaceNaN(df_ml, 'drop_na')

### Replacing NaNs - Method 2: Replace with Mean <a name="2.1.1.b"></a>

In [None]:
# Strategy 2: fill missing values with the column mean.
df_ml_na_mean = replaceNaN(df_ml, 'mean')

### Replacing NaNs - Method 3: Replace with Median <a name="2.1.1.c"></a>

In [None]:
# Strategy 3: fill missing values with the column median.
df_ml_na_median = replaceNaN(df_ml, 'median')

### Replacing NaNs - Method 4: Replace with kNN <a name="2.1.1.d"></a>

In [None]:
# Strategy 4: fill missing values with the kNN imputer; this frame is used by the following sections.
df_ml_na_knn = replaceNaN(df_ml, 'kNN')

## 2.1.2. Deleting Outliers Values <a name="2.1.2."></a>

In [None]:
# One box plot per column (independent axes) to inspect outliers visually.
df_ml_na_knn.plot(kind='box', subplots=True, layout=(7, 3), sharex=False, sharey=False, figsize=(20, 60))

In [None]:
# Module-level limits, overwritten by every call to identify_outliers() and
# read afterwards by replace_outliers().
outliers_limit_low = 0
outliers_limit_up = 0


def identify_outliers(df):
    """Count, per column, the values lying outside the outlier limits.

    The limits are derived from the 5 % and 95 % quantiles of each column
    (not the quartiles): ``q05 - 1.5 * (q95 - q05)`` and
    ``q95 + 1.5 * (q95 - q05)``.

    Side effect: stores the limits in the globals ``outliers_limit_low`` and
    ``outliers_limit_up``.

    Returns the per-column number of values outside the limits.
    """
    global outliers_limit_low, outliers_limit_up

    lower_q = df.quantile(0.05)
    upper_q = df.quantile(0.95)
    spread = upper_q - lower_q

    outliers_limit_low = lower_q - 1.5 * spread
    outliers_limit_up = upper_q + 1.5 * spread

    too_low = df < outliers_limit_low
    too_high = df > outliers_limit_up
    return (too_low | too_high).sum()

In [None]:
# Count the outliers per column; the call also refreshes the global limits shown in the next cells.
identify_outliers(df_ml_na_knn)

In [None]:
outliers_limit_low

In [None]:
outliers_limit_up

In [None]:
def replace_outliers(df_original):
    """Return a copy of ``df_original`` with every column capped to the outlier limits.

    The limits are read from the globals ``outliers_limit_low`` and
    ``outliers_limit_up``, so ``identify_outliers`` has to be called first.
    Values below the lower limit are replaced by it, values above the upper
    limit by the upper limit.
    """
    capped = df_original.copy()

    for name in capped.columns:
        floor, ceiling = outliers_limit_low[name], outliers_limit_up[name]

        values = capped[name]
        values = np.where(values < floor, floor, values)
        capped[name] = np.where(values > ceiling, ceiling, values)

    return capped

In [None]:
# Cap the outliers of the kNN-imputed frame using the limits computed by identify_outliers().
df_ml_na_knn_notOutliers = replace_outliers(df_ml_na_knn)

In [None]:
# Re-count the outliers after capping.
# NOTE(review): this call recomputes the global limits from the capped frame.
identify_outliers(df_ml_na_knn_notOutliers)

In [None]:
# Box plots before and after capping. Each DataFrame.plot call opens a new
# figure with 17 subplots; plt.suptitle labels the figure as a whole, whereas
# plt.title would only label the last subplot.
df_ml_na_knn.plot(kind='box', subplots=True, layout=(1, 17), sharex=False, sharey=False, figsize=(20, 5))
plt.suptitle('Pred odstránením outlierov')
df_ml_na_knn_notOutliers.plot(kind='box', subplots=True, layout=(1, 17), sharex=False, sharey=False, figsize=(20, 5))
plt.suptitle('Po odstránení outlierov');

# 2.2. Realizácia predspracovania dát (5b). <a name="2.2."></a>
* Transformované dáta pre strojové učenie si rozdeľte na trénovaciu a testovaciu množinu podľa vami preddefinovaného pomeru. Naďalej pracujte len s trénovacím datasetom.
* Transformujte atribúty dát pre strojové učenie podľa dostupných techník (minimálne 2 techniky) ako scaling, transformers a ďalšie.
* Zdôvodnite Vaše voľby/rozhodnutia pre realizáciu (t.j. zdokumentovanie)

<b>Go to:</b>  
* [Naspäť na Obsah](#obsah)
* [Transformovanie a škálovanie dát](#2.2.1.)
* [Rozdelenie dát](#2.2.2.)
* [Zhodnotenie ](#2.2.3.)

In [None]:
# Continue with the kNN-imputed, outlier-capped frame.
# NOTE(review): this rebinds `df`, which earlier cells use for the merged raw
# table, so re-running those cells afterwards sees different data.
df = df_ml_na_knn_notOutliers.copy()

## 2.2.1. Transformovanie a škálovanie dát <a name="2.2.1."></a>

In [None]:
# Transformers / scalers compared in the histograms below.
power = PowerTransformer(method='yeo-johnson', standardize=True)
quan = QuantileTransformer(n_quantiles=20, random_state=0)
stan_s = StandardScaler()
norm_s = MinMaxScaler()

feature_cols = [
    'TEMP', 'PRES', 'PM2.5', 'NOx', 'PM10', 'C2H3NO5', 'CH4', 'Pb',
    'NH3', 'SO2', 'O3', 'CO', 'PAHs', 'H2CO', 'CFCs',
]

# Kept with its original content (every feature listed twice, once per panel)
# in case other cells still read it.
m_col = [name for name in feature_cols for _ in range(2)]

# One row per feature: transformations on the left, scalings on the right.
# The grid is created once with the right geometry, so no unused axes are
# left behind and every histogram lands on an explicit axes object.
fig, axs = plt.subplots(len(feature_cols), 2, figsize=(10, 120), squeeze=False)

for (ax_transform, ax_scale), col_name in zip(axs, feature_cols):
    column = df[[col_name]]

    ax_transform.hist(column[col_name], bins=25, label='origin')
    ax_transform.hist(power.fit_transform(column).ravel(), bins=25, label='power_t')
    ax_transform.hist(quan.fit_transform(column).ravel(), bins=25, label='quan_t')
    ax_transform.set_title(col_name + ' - Transformacia')
    ax_transform.grid()
    ax_transform.legend()

    ax_scale.hist(column[col_name], bins=25, label='origin')
    ax_scale.hist(stan_s.fit_transform(column).ravel(), bins=25, label='stan_s')
    ax_scale.hist(norm_s.fit_transform(column).ravel(), bins=25, label='norm_s')
    ax_scale.set_title(col_name + ' - Skalovanie')
    ax_scale.grid()
    ax_scale.legend()

Yeo-Johnson sme vybrali preto, lebo Box-Cox nepodporuje transformáciu záporných hodnôt.

In [None]:
# @TODO napísať vyhodnotenie pre tieto obrázky

## 2.2.2. Rozdelenie dát <a name="2.2.2."></a>

Dataset rozdelíme v pomere 1/3 pre testovaciu množinu a 2/3 pre trénovaciu množinu 
pre indikátor (y) a pre všetky ostatné atribúty (X).

In [None]:
# A fixed seed makes the split - and everything derived from it below -
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['warning'], axis=1), df['warning'], test_size=0.33, random_state=0
)

## 2.2.3. Zhodnotenie <a name="2.2.3."></a>

@TODO

# 2.3. Výber atribútov pre strojové učenie (5b) <a name="2.3."></a>
* Zistite, ktoré atribúty (features) vo vašich dátach pre strojové učenie sú informatívne k atribútu “warning”. Zoraďte tieto atribúty v poradí podľa dôležitosti. 
* Zdôvodnite Vaše voľby/rozhodnutia pre realizáciu (t.j. zdokumentovanie)

<b>Go to:</b>  
* [Naspäť na Obsah](#obsah)
* [Variance Threshold ](#2.3.1.)
* [SelectKBest](#2.3.2.)
* [SelectPercentile](#2.3.3.)
* [Záver výberov](#2.3.4.)

## 2.3.1. Variance Threshold <a name="2.3.1."></a>

In [None]:
# Work on copies of the training split only.
X = X_train.copy()
y = y_train.copy()
print(X.columns)
# Threshold 0.8 * (1 - 0.8) = 0.16: columns whose variance is below it are removed.
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
colsVT = sel.fit_transform(X)

In [None]:
# Compare the number of columns before and after VarianceThreshold.
n_original = X.shape[1]
print('Pôvodný počet stĺpcov: ', n_original)

n_kept = colsVT[0].size  # width of the first transformed row = number of surviving columns
if n_original == n_kept:
    print('Všetky dáta sú užitočné')
elif n_kept < n_original:
    print('Máme aj neužitočné dáta')

In [None]:
# -------------------

In [None]:
def orderColumns(X, y, selectionType, selectionTypeCategory):
    """Rank the columns of ``X`` by their univariate score against ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame.
    y : array-like
        Target values.
    selectionType : callable
        Score function handed to the scikit-learn selector
        (e.g. ``f_classif``).
    selectionTypeCategory : {'SelectKBest', 'SelectPercentile'}
        Which selector to use. Both are configured to keep all columns, so
        only the ordering differs from the input.

    Returns
    -------
    list
        Column names sorted from the highest to the lowest score.

    Raises
    ------
    ValueError
        If ``selectionTypeCategory`` is not one of the supported names.
    """
    if selectionTypeCategory == 'SelectKBest':
        selector = SelectKBest(selectionType, k='all')
    elif selectionTypeCategory == 'SelectPercentile':
        selector = SelectPercentile(selectionType, percentile=100)
    else:
        raise ValueError(
            "selectionTypeCategory must be 'SelectKBest' or 'SelectPercentile'; got %r"
            % (selectionTypeCategory,)
        )

    # Only the scores are needed, so fitting is enough; the transformed
    # matrix was never used.
    selector.fit(X, y)

    col_names = X.columns[selector.get_support()]
    ranked = sorted(zip(selector.scores_, col_names), reverse=True)

    return [name for _, name in ranked]

In [None]:
# Collects one ranked list of column names per scoring function; filled by the cells below.
list_of_list = []

## 2.3.2. SelectKBest <a name="2.3.2."></a>

In [None]:
# Scoring functions evaluated with SelectKBest.
# NOTE(review): both are regression scorers while `warning` appears to be an
# indicator (class) variable - confirm whether classification scorers were intended.
orders_funcs_list = [mutual_info_regression, f_regression]

In [None]:
# Rank the features with every SelectKBest scorer and collect the rankings.
# NOTE(review): re-running this cell appends the same rankings to `list_of_list` again.
for i in orders_funcs_list:
    temp = orderColumns(X, y, i, 'SelectKBest')
    list_of_list.append(temp)
    print(temp)

## 2.3.3 SelectPercentile <a name="2.3.3."></a>

In [None]:
# Scoring functions evaluated with SelectPercentile.
# NOTE(review): f_regression is also in the SelectKBest list; since both selectors
# keep every column, its ranking presumably enters `list_of_list` twice - confirm intent.
orders_funcs_list = [f_classif, f_regression]

In [None]:
# Rank the features with every SelectPercentile scorer and collect the rankings.
# NOTE(review): re-running this cell appends the same rankings to `list_of_list` again.
for i in orders_funcs_list:
    temp = orderColumns(X, y, i, 'SelectPercentile')
    list_of_list.append(temp)
    print(temp)

## 2.3.4. Záver výberov <a name="2.3.4."></a>

In [None]:
def orderLists(X, list_of_list):
    """Merge several feature rankings into one overall ranking.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are being ranked.
    list_of_list : list of list
        Rankings of the column names of ``X``, best first; every ranking has
        to contain every column.

    Returns
    -------
    list
        Column names ordered by the sum of their positions across all
        rankings (lowest sum first); ties are broken by column name.
    """
    columns = list(X.columns)

    # Every column starts from zero so that only the rankings - not the
    # column's position in the frame - decide the final order.
    totals = [
        sum(ranking.index(name) for ranking in list_of_list)
        for name in columns
    ]

    return [name for _, name in sorted(zip(totals, columns))]

In [None]:
# Overall feature ranking aggregated from all rankings collected above.
orderLists(X, list_of_list)

#### Záver výberu atribútov pre strojové učenie

text

# 2.4. Replikovateľnosť predspracovania (5b) <a name="2.4."></a>
* Upravte váš kód realizujúci predspracovanie trénovacej množiny tak, aby ho bolo možné bez ďalších úprav znovupoužiť na predspracovanie testovacej množiny (pomocou funkcie/í)
* Očakáva sa aj využitie možnosti sklearn.pipeline

<b>Go to:</b>  
* [Naspäť na Obsah](#obsah)
* [Code improvements](#2.4.1.)
* [Pipeline](#2.4.2.)

## 2.4.1. Code improvements <a name="2.4.1."></a>

### Utils

In [None]:
def count_columns(df):
    """Return the names of the columns of ``df`` that contain at least one missing value."""
    has_missing = df.isna().any(axis=0)
    return df.columns[has_missing].tolist()

In [None]:
def draw(df, df_original=None):
    """Plot how the pairwise correlations of ``df`` differ from ``df_original``.

    Parameters
    ----------
    df : pandas.DataFrame
        Processed frame.
    df_original : pandas.DataFrame
        Reference frame the correlations are compared against. It is a
        keyword with a ``None`` default only to keep the one-argument call
        signature valid; it must be supplied.

    Raises
    ------
    ValueError
        If ``df_original`` is not given.
    """
    if df_original is None:
        raise ValueError("draw() needs the reference frame: pass it as df_original")

    fig, ax = plt.subplots(figsize=(16,8))
    corr_diff = df.corr() - df_original.corr()
    # Differences below 1e-4 are masked out so only visible changes are annotated.
    sns.heatmap(corr_diff[abs(corr_diff) > 0.000099], ax=ax, annot=True, fmt=".4f")

In [None]:
def df_columns(df):
    """Return (and print) the column names of ``df`` except 'location' and 'warning'."""
    excluded = ('location', 'warning')
    new_cols = [name for name in df.columns if name not in excluded]

    print(new_cols)
    return new_cols

### Handle NaNs

In [None]:
class HandleNaNs_drop(TransformerMixin):
    """Pipeline step that removes every row containing a missing value."""

    def __init__(self):
        pass

    def replaceNaN(self, df):
        """Return ``df`` without the rows that contain NaN; prints the remaining NaN counts."""
        df = df.dropna()

        print(df.isnull().sum())

        return df

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is accepted so the step also works when a target is passed."""
        return self

    def transform(self, X):
        """Apply :meth:`replaceNaN` to ``X``."""
        return self.replaceNaN(X)

In [None]:
class HandleNaNs_mean(TransformerMixin):
    """Pipeline step that fills missing values with the mean of their column.

    NOTE(review): the imputer is fitted inside ``transform`` on whatever frame
    is passed in, so the means are re-estimated on every call rather than
    learned once in ``fit``.
    """

    def __init__(self):
        pass

    def replaceNaN(self, df):
        """Return a copy of ``df`` with NaNs replaced by column means; prints the remaining NaN counts."""
        # Work on a copy so the caller's frame is left untouched.
        df = df.copy()
        na_cols = count_columns(df)
        imp_strategy = SimpleImputer(missing_values=np.nan, strategy='mean')

        for col in na_cols:
            df[col] = imp_strategy.fit_transform(df[[col]]).ravel()

        print(df.isnull().sum())

        return df

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is accepted so the step also works when a target is passed."""
        return self

    def transform(self, X):
        """Apply :meth:`replaceNaN` to ``X``."""
        return self.replaceNaN(X)

In [None]:
class HandleNaNs_median(TransformerMixin):
    """Pipeline step that fills missing values with the median of their column.

    NOTE(review): the imputer is fitted inside ``transform`` on whatever frame
    is passed in, so the medians are re-estimated on every call rather than
    learned once in ``fit``.
    """

    def __init__(self):
        pass

    def replaceNaN(self, df):
        """Return a copy of ``df`` with NaNs replaced by column medians; prints the remaining NaN counts."""
        # Work on a copy so the caller's frame is left untouched.
        df = df.copy()
        na_cols = count_columns(df)
        imp_strategy = SimpleImputer(missing_values=np.nan, strategy='median')

        for col in na_cols:
            df[col] = imp_strategy.fit_transform(df[[col]]).ravel()

        print(df.isnull().sum())

        return df

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is accepted so the step also works when a target is passed."""
        return self

    def transform(self, X):
        """Apply :meth:`replaceNaN` to ``X``."""
        return self.replaceNaN(X)

In [None]:
class HandleNaNs_knn(TransformerMixin):
    """Pipeline step that fills missing numeric values from the 5 nearest rows.

    Non-numeric columns (e.g. a not yet encoded ``location``) are neither used
    for the distance nor imputed.

    NOTE(review): the imputer is fitted inside ``transform`` on whatever frame
    is passed in rather than learned once in ``fit``.
    """

    def __init__(self):
        pass

    def replaceNaN(self, df):
        """Return a copy of ``df`` with numeric NaNs imputed by kNN; prints the remaining NaN counts."""
        # Work on a copy so the caller's frame is left untouched.
        df = df.copy()

        numeric = df.select_dtypes(include='number')
        na_cols = [col for col in count_columns(df) if col in numeric.columns]

        if na_cols:
            imp_strategy = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

            # All numeric columns are fitted together: the neighbours are found
            # through the other columns. Fitted on a single column the imputer
            # has nothing to compare and falls back to the column mean.
            imputed = pd.DataFrame(
                imp_strategy.fit_transform(numeric),
                columns=numeric.columns,
                index=numeric.index,
            )

            # Only the columns that had gaps are written back, so complete
            # columns keep their dtype.
            for col in na_cols:
                df[col] = imputed[col]

        print(df.isnull().sum())

        return df

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is accepted so the step also works when a target is passed."""
        return self

    def transform(self, X):
        """Apply :meth:`replaceNaN` to ``X``."""
        return self.replaceNaN(X)

### Handle non-numeric attributes

In [None]:
class HandleLocation(TransformerMixin):
    """Pipeline step that replaces the textual ``location`` column with integer codes.

    NOTE(review): a new encoder is fitted on every ``transform`` call, so the
    code assigned to a location depends on the frame being transformed.
    """

    def __init__(self):
        pass

    def encodeLocation(self, df):
        """Return ``df`` with ``location`` ordinal-encoded."""
        ce_ordinal = ce.OrdinalEncoder(cols=['location'])
        return ce_ordinal.fit_transform(df)

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is accepted so the step also works when a target is passed."""
        return self

    def transform(self, X):
        """Apply :meth:`encodeLocation` to ``X``."""
        return self.encodeLocation(X)

### Handle Outliers

In [None]:
class HandleOutliers_drop(TransformerMixin):
    """Pipeline step that drops rows holding an outlier in any feature column.

    A value is an outlier when it lies outside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` of its column. The ``location`` and
    ``warning`` columns are not checked.
    """

    # Unused by the methods below; kept so existing attribute access keeps working.
    outliers_limit_low = 0
    outliers_limit_up = 0

    def __init__(self):
        pass

    def handleOutliers(self, df):
        """Return the rows of ``df`` whose feature values are all within the limits."""
        keep = pd.Series(True, index=df.index)

        for col in df_columns(df):

            q_low = percentile(df[col], 25)
            q_up = percentile(df[col], 75)

            offset = (q_up - q_low) * 1.5

            limit_low = q_low - offset
            limit_up = q_up + offset

            # Accumulate the condition for every column; filtering once after
            # the loop with only the last column's limits would ignore the rest.
            keep &= (df[col] >= limit_low) & (df[col] <= limit_up)

        return df[keep]

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is accepted so the step also works when a target is passed."""
        return self

    def transform(self, X):
        """Apply :meth:`handleOutliers` to ``X``."""
        return self.handleOutliers(X)

In [None]:
class HandleOutliers_replace(TransformerMixin):
    """Pipeline step that caps every feature column at its 5th and 95th percentile.

    The ``location`` and ``warning`` columns are left as they are.
    """

    # Unused by the methods below; kept so existing attribute access keeps working.
    outliers_limit_low = 0
    outliers_limit_up = 0

    def __init__(self):
        pass

    def handleOutliers(self, df):
        """Return a copy of ``df`` with the feature values clipped to ``[q05, q95]``."""
        # Work on a copy so the caller's frame is left untouched.
        df = df.copy()

        for col in df_columns(df):

            q05 = percentile(df[col], 5)
            q95 = percentile(df[col], 95)

            df[col] = np.where(df[col] < q05, q05, df[col])
            df[col] = np.where(df[col] > q95, q95, df[col])

        return df

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is accepted so the step also works when a target is passed."""
        return self

    def transform(self, X):
        """Apply :meth:`handleOutliers` to ``X``."""
        return self.handleOutliers(X)

### Handle Transformations

In [None]:
class HandleTransformations_power(TransformerMixin):
    """Pipeline step applying a standardised Yeo-Johnson power transform to the feature columns.

    ``location`` and ``warning`` are passed through unchanged.

    NOTE(review): the transformer is fitted inside ``transform`` on whatever
    frame is passed in rather than learned once in ``fit``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is accepted so the step also works when a target is passed."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the feature columns power-transformed."""
        # The copy keeps X's index, so the untouched columns stay aligned with
        # the transformed ones even after a row-dropping step.
        new_df = X.copy()
        feature_cols = [col for col in X.columns if col not in ('location', 'warning')]

        if feature_cols:
            power = PowerTransformer(method='yeo-johnson', standardize=True)
            values = power.fit_transform(X[feature_cols])
            for position, col in enumerate(feature_cols):
                new_df[col] = values[:, position]

        return new_df

In [None]:
class HandleTransformations_quant(TransformerMixin):
    """Pipeline step applying a 10-quantile transform to the feature columns.

    ``location`` and ``warning`` are passed through unchanged.

    NOTE(review): the transformer is fitted inside ``transform`` on whatever
    frame is passed in rather than learned once in ``fit``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is accepted so the step also works when a target is passed."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the feature columns quantile-transformed."""
        # The copy keeps X's index, so the untouched columns stay aligned with
        # the transformed ones even after a row-dropping step.
        new_df = X.copy()
        feature_cols = [col for col in X.columns if col not in ('location', 'warning')]

        if feature_cols:
            quan = QuantileTransformer(n_quantiles=10, random_state=0)
            values = quan.fit_transform(X[feature_cols])
            for position, col in enumerate(feature_cols):
                new_df[col] = values[:, position]

        return new_df

In [None]:
class HandleTransformations_scaleMM(TransformerMixin):
    """Pipeline step applying min-max scaling to the feature columns.

    ``location`` and ``warning`` are passed through unchanged.

    NOTE(review): the scaler is fitted inside ``transform`` on whatever frame
    is passed in rather than learned once in ``fit``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is accepted so the step also works when a target is passed."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the feature columns min-max scaled."""
        # The copy keeps X's index, so the untouched columns stay aligned with
        # the scaled ones even after a row-dropping step.
        new_df = X.copy()
        feature_cols = [col for col in X.columns if col not in ('location', 'warning')]

        if feature_cols:
            norm_s = MinMaxScaler()
            values = norm_s.fit_transform(X[feature_cols])
            for position, col in enumerate(feature_cols):
                new_df[col] = values[:, position]

        return new_df

In [None]:
class HandleTransformations_scaleS(TransformerMixin):
    """Pipeline step applying standard (zero-mean, unit-variance) scaling to the feature columns.

    ``location`` and ``warning`` are passed through unchanged.

    NOTE(review): the scaler is fitted inside ``transform`` on whatever frame
    is passed in rather than learned once in ``fit``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is accepted so the step also works when a target is passed."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the feature columns standard-scaled."""
        # The copy keeps X's index, so the untouched columns stay aligned with
        # the scaled ones even after a row-dropping step.
        new_df = X.copy()
        feature_cols = [col for col in X.columns if col not in ('location', 'warning')]

        if feature_cols:
            stan_s = StandardScaler()
            values = stan_s.fit_transform(X[feature_cols])
            for position, col in enumerate(feature_cols):
                new_df[col] = values[:, position]

        return new_df

### Split train and test

In [None]:
class Split(TransformerMixin):
    """Pipeline step that splits a frame into train/test features and targets.

    Parameters
    ----------
    test_size : float, default 0.33
        Fraction of rows placed in the test set.
    random_state : int or None, default 0
        Seed handed to ``train_test_split`` so repeated runs produce the same
        split; pass ``None`` for a different split on every call.
    """

    def __init__(self, test_size=0.33, random_state=0):
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is accepted so the step also works when a target is passed."""
        return self

    def transform(self, X):
        """Return ``(X_train, X_test, y_train, y_test)`` with ``warning`` as the target column."""
        df = X
        X_train, X_test, y_train, y_test = train_test_split(
            df.drop(['warning'], axis=1),
            df['warning'],
            test_size=self.test_size,
            random_state=self.random_state,
        )
        return X_train, X_test, y_train, y_test

### Handle Selection

In [None]:
class VarianceThreshold_do(TransformerMixin):
    """Pipeline step that reports whether any column falls below the variance threshold.

    The step is purely informative: it prints the finding and returns its
    input unchanged.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is accepted so the step also works when a target is passed."""
        return self

    def transform(self, X):
        """Print the variance-threshold finding for ``X`` and return ``X`` itself."""
        df = X.copy()

        # Threshold 0.8 * (1 - 0.8) = 0.16.
        sel = VarianceThreshold(.8 * (1 - .8))
        colsVT = sel.fit_transform(df)

        n_kept = colsVT.shape[1]

        if df.shape[1] == n_kept:
            print('VarianceThreshold: Všetky dáta sú užitočné')

        elif n_kept < df.shape[1]:
            print('VarianceThreshold: Máme aj neužitočné dáta')

        return X


In [None]:
class Selection_KBest_mutual_info_regression(TransformerMixin):
    """Pipeline step that orders the feature columns by their SelectKBest / mutual_info_regression score.

    Works on the ``(X_train, X_test, y_train, y_test)`` tuple; the scores are
    computed on the training part only.
    """

    def __init__(self):
        pass

    def orderColumns(self, tuple_of_df):
        """Return the tuple with the columns of both feature frames sorted from best to worst score."""
        X_train, X_test, y_train, y_test = tuple_of_df[0], tuple_of_df[1], tuple_of_df[2], tuple_of_df[3]

        selector = SelectKBest(mutual_info_regression, k='all')
        selector.fit(X_train, y_train)

        col_names = X_train.columns[selector.get_support()]
        ranked = sorted(zip(selector.scores_, col_names), reverse=True)
        ordered = [name for _, name in ranked]

        # Select the columns in ranked order. Assigning the ranked names to
        # ``X_train.columns`` would only relabel the columns and attach the
        # wrong name to the data. The test frame gets the same order so both
        # frames stay compatible.
        return X_train[ordered], X_test[ordered], y_train, y_test

    def fit(self, tuple_of_df, y=None):
        """Nothing is learned; ``y`` is accepted so the step also works when a target is passed."""
        return self

    def transform(self, tuple_of_df):
        """Apply :meth:`orderColumns` to ``tuple_of_df``."""
        return self.orderColumns(tuple_of_df)


In [None]:
class Selection_KBest_f_regression(TransformerMixin):
    """Pipeline step that orders the feature columns by their SelectKBest / f_regression score.

    Works on the ``(X_train, X_test, y_train, y_test)`` tuple; the scores are
    computed on the training part only.
    """

    def __init__(self):
        pass

    def orderColumns(self, tuple_of_df):
        """Return the tuple with the columns of both feature frames sorted from best to worst score."""
        X_train, X_test, y_train, y_test = tuple_of_df[0], tuple_of_df[1], tuple_of_df[2], tuple_of_df[3]

        selector = SelectKBest(f_regression, k='all')
        selector.fit(X_train, y_train)

        col_names = X_train.columns[selector.get_support()]
        ranked = sorted(zip(selector.scores_, col_names), reverse=True)
        ordered = [name for _, name in ranked]

        # Select the columns in ranked order. Assigning the ranked names to
        # ``X_train.columns`` would only relabel the columns and attach the
        # wrong name to the data. The test frame gets the same order so both
        # frames stay compatible.
        return X_train[ordered], X_test[ordered], y_train, y_test

    def fit(self, tuple_of_df, y=None):
        """Nothing is learned; ``y`` is accepted so the step also works when a target is passed."""
        return self

    def transform(self, tuple_of_df):
        """Apply :meth:`orderColumns` to ``tuple_of_df``."""
        return self.orderColumns(tuple_of_df)


In [None]:
class Selection_Percentile_f_classif(TransformerMixin):
    """Pipeline step that orders the feature columns by their SelectPercentile / f_classif score.

    Works on the ``(X_train, X_test, y_train, y_test)`` tuple; the scores are
    computed on the training part only.
    """

    def __init__(self):
        pass

    def orderColumns(self, tuple_of_df):
        """Return the tuple with the columns of both feature frames sorted from best to worst score."""
        X_train, X_test, y_train, y_test = tuple_of_df[0], tuple_of_df[1], tuple_of_df[2], tuple_of_df[3]

        selector = SelectPercentile(f_classif, percentile=100)
        selector.fit(X_train, y_train)

        col_names = X_train.columns[selector.get_support()]
        ranked = sorted(zip(selector.scores_, col_names), reverse=True)
        ordered = [name for _, name in ranked]

        # Select the columns in ranked order. Assigning the ranked names to
        # ``X_train.columns`` would only relabel the columns and attach the
        # wrong name to the data. The test frame gets the same order so both
        # frames stay compatible.
        return X_train[ordered], X_test[ordered], y_train, y_test

    def fit(self, tuple_of_df, y=None):
        """Nothing is learned; ``y`` is accepted so the step also works when a target is passed."""
        return self

    def transform(self, tuple_of_df):
        """Apply :meth:`orderColumns` to ``tuple_of_df``."""
        return self.orderColumns(tuple_of_df)


In [None]:
class Selection_Percentile_f_regression(TransformerMixin):
    """Pipeline step that orders the feature columns by their SelectPercentile / f_regression score.

    Works on the ``(X_train, X_test, y_train, y_test)`` tuple; the scores are
    computed on the training part only.
    """

    def __init__(self):
        pass

    def orderColumns(self, tuple_of_df):
        """Return the tuple with the columns of both feature frames sorted from best to worst score."""
        X_train, X_test, y_train, y_test = tuple_of_df[0], tuple_of_df[1], tuple_of_df[2], tuple_of_df[3]

        selector = SelectPercentile(f_regression, percentile=100)
        selector.fit(X_train, y_train)

        col_names = X_train.columns[selector.get_support()]
        ranked = sorted(zip(selector.scores_, col_names), reverse=True)
        ordered = [name for _, name in ranked]

        # Select the columns in ranked order. Assigning the ranked names to
        # ``X_train.columns`` would only relabel the columns and attach the
        # wrong name to the data. The test frame gets the same order so both
        # frames stay compatible.
        return X_train[ordered], X_test[ordered], y_train, y_test

    def fit(self, tuple_of_df, y=None):
        """Nothing is learned; ``y`` is accepted so the step also works when a target is passed."""
        return self

    def transform(self, tuple_of_df):
        """Apply :meth:`orderColumns` to ``tuple_of_df``."""
        return self.orderColumns(tuple_of_df)


## 2.4.2. Pipeline <a name="2.4.2."></a>

In [None]:
def pipelineGenerator():
    """Build the preprocessing pipeline.

    Steps, in order: kNN imputation, location encoding, outlier capping,
    power transform, variance report, train/test split and feature ordering
    by SelectPercentile / f_regression.
    """
    steps = [
        ('HandleNaNs', HandleNaNs_knn()),
        ('HandleLocation', HandleLocation()),
        ('HandleOutliers', HandleOutliers_replace()),
        ('HandleTransformations', HandleTransformations_power()),
        ('HandleSelection', VarianceThreshold_do()),
        ('Split', Split()),
        ('handleSelection2', Selection_Percentile_f_regression()),
    ]
    return Pipeline(steps)

In [None]:
# Run the whole preprocessing pipeline on the untouched merged frame; the last
# step returns the train/test split.
pipeline1 = pipelineGenerator()
X_train, X_test, y_train, y_test = pipeline1.fit_transform(df_not_changed)

In [None]:
X_train

In [None]:
X_test

In [None]:
y_train

In [None]:
y_test