# IAU projekt

> Rovnakým podielom práce vypracovali: <br>
> Roman Bitarovský, Emma Macháčová

# Fáza 2 Predspracovanie údajov <a name="faza2"></a>
## 2.1 Integrácia a čistenie dát (5b) <a name=""></a>
Transformujte dáta na vhodný formát pre strojové učenie t.j. jedno pozorovanie musí byť opísané jedným riadkom a každý atribút musí byť v numerickom formáte. 
* Pri riešení chýbajúcich hodnôt (missing values) vyskúšajte rôzne stratégie ako napr.
    * odstránenie pozorovaní s chýbajúcimi údajmi
    * nahradenie chýbajúcej hodnoty mediánom, priemerom, pomerom (ku korelovanému atribútu), alebo pomocou lineárnej regresie resp. kNN
* Podobne postupujte aj pri riešení vychýlených hodnôt (outlier detection):
    * odstránenie vychýlených (odľahlých) pozorovaní
    * nahradenie vychýlenej hodnoty hraničnými hodnotami rozdelenia (5% resp. 95%)

In [1]:
# NOTE(review): `pause` is not defined anywhere in the visible notebook, so
# evaluating it raises NameError (see the cell output below). Presumably a
# deliberate stop so that "Run All" halts here -- confirm before removing.
pause

NameError: name 'pause' is not defined

In [None]:
# čo sa myslí tou transformáciou dát na jeden riadok ? 

# Data init

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.stats as sm_stats
from scipy.stats import mannwhitneyu
from scipy.stats import f_oneway

import datetime
import re
import category_encoders as ce
from sklearn.impute import SimpleImputer, KNNImputer
from numpy import percentile

from sklearn.preprocessing import PowerTransformer, QuantileTransformer

from sklearn.feature_selection import VarianceThreshold, SelectKBest, SelectPercentile, SelectFromModel
from sklearn.feature_selection import mutual_info_regression, chi2, f_regression, f_classif
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [None]:
# Load the two tab-separated source tables: measurements and stations.
labor_measurements = pd.read_table('../081/measurements.csv', sep='\t')
labor_stations = pd.read_table('../081/stations.csv', sep='\t')

In [None]:
# Normalise the QoS labels. The result is assigned back to the column instead
# of calling replace(..., inplace=True) on the selected column: an in-place
# method on a column selection is chained assignment, which pandas warns about
# and which does not update the parent frame under Copy-on-Write.
# NOTE(review): the mapping shortens "acceptable" to "accep" -- confirm that
# the short form is the intended canonical label.
labor_stations["QoS"] = labor_stations["QoS"].replace({"acceptable": "accep", "maitennce": "maintenance"})
labor_stations['revision'] = pd.to_datetime(labor_stations['revision'], utc=False)

# Treat empty strings as missing values in both tables.
labor_measurements.replace('', np.nan, inplace=True)
labor_stations.replace('', np.nan, inplace=True)

# Remove exact duplicate rows.
labor_measurements = labor_measurements.drop_duplicates()
labor_stations = labor_stations.drop_duplicates()

# Merge preprocessing: drop the station columns that are not carried into the
# merged frame (this also discards the QoS / revision clean-up done above),
# then de-duplicate again because rows that differed only in the dropped
# columns are now identical.
labor_stations = labor_stations.drop(columns=['revision', 'code', 'QoS'])
labor_stations = labor_stations.drop_duplicates()

# Table merge: attach station attributes to each measurement by coordinates.
df = pd.merge(labor_measurements, labor_stations, how='inner', on=['latitude', 'longitude'])

# The coordinates were only needed as the join key; keep the remaining
# columns in a fixed order.
df = df.drop(columns=['latitude', 'longitude'])
df = df[['location', 'warning', 'TEMP', 'PRES', 'PM2.5', 'NOx', 'PM10', 'C2H3NO5', 'CH4', 'Pb', 'NH3', 'SO2', 'O3', 'CO', 'PAHs', 'H2CO', 'CFCs']]

df.head()

In [None]:
# Print a summary of the merged frame (columns, non-null counts, dtypes).
df.info()

## Replacing NaNs

In [None]:
def replaceNaN(df_original, strategy):
    """Return a copy of ``df_original`` with missing values handled.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Input frame; it is not modified.
    strategy : str
        One of:

        * ``'drop_na'`` -- drop every row that contains a missing value,
        * ``'mean'`` / ``'median'`` -- fill each column that has missing
          values with that column's mean / median (``SimpleImputer``),
        * ``'kNN'`` -- fill missing values with ``KNNImputer`` (5 neighbours,
          uniform weights) fitted on all columns of the frame together.

    Returns
    -------
    pandas.DataFrame
        The processed copy.

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the supported names.

    Side effects
    ------------
    Prints the remaining per-column missing-value counts and draws a heatmap
    of the correlation-matrix change (processed minus original), showing only
    cells whose absolute change exceeds 0.000099.
    """
    supported = ('kNN', 'mean', 'median', 'drop_na')
    if strategy not in supported:
        # Fail early with a clear message instead of hitting an unbound
        # imputer variable further down.
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected one of {supported}"
        )

    df = df_original.copy()

    na_cols = df.columns[df.isnull().any()].tolist()

    if strategy == 'drop_na':
        df = df.dropna()
    elif na_cols:
        if strategy == 'kNN':
            imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
            # Fit on every column: neighbours are found through the features
            # that are present in a row. Fitting on a single column leaves no
            # feature to measure distance on, so the imputer would fall back
            # to the plain column mean.
            # NOTE(review): assumes all columns are numeric at this point
            # (e.g. after ordinal encoding) -- confirm for new callers.
            fit_cols = df.columns.tolist()
        else:
            imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
            fit_cols = na_cols

        imputed = pd.DataFrame(
            imputer.fit_transform(df[fit_cols]),
            columns=fit_cols,
            index=df.index,
        )
        # Write back only the columns that actually had gaps, so complete
        # columns keep their original dtype.
        df[na_cols] = imputed[na_cols]

    print(df.isnull().sum())

    # Visualise how much the chosen strategy shifted pairwise correlations.
    _, ax = plt.subplots(figsize=(16,8))
    corr_diff = df.corr() - df_original.corr()
    sns.heatmap(corr_diff[abs(corr_diff) > 0.000099], ax=ax, annot=True, fmt=".4f")

    return df

In [None]:
# Re-encode the textual `location` column as numbers with an ordinal encoder,
# so the frame used for the following steps (df_ml) has no text columns left
# in `location`.
ce_ordinal = ce.OrdinalEncoder(cols=['location'])
df_ml = ce_ordinal.fit_transform(df)

### Drop nans

In [None]:
# Variant 1: drop every row that contains a missing value.
df_ml_dropedNa = replaceNaN(df_ml, 'drop_na')

### Mean

In [None]:
# Variant 2: fill missing values using the 'mean' strategy.
df_ml_na_mean = replaceNaN(df_ml, 'mean')

### Median

In [None]:
# Variant 3: fill missing values using the 'median' strategy.
df_ml_na_median = replaceNaN(df_ml, 'median')

### kNN

In [None]:
# Variant 4: fill missing values using the 'kNN' strategy; this result is the
# one used by the outlier cells that follow.
df_ml_na_knn = replaceNaN(df_ml, 'kNN')

## Deleting outlier values

In [None]:
# One box plot per column (independent axes, 7x3 grid) to eyeball outliers
# in the kNN-imputed frame.
df_ml_na_knn.plot(kind='box', subplots=True, layout=(7, 3), sharex=False, sharey=False, figsize=(20, 60))


In [None]:
# Per-column outlier limits; overwritten on every identify_outliers() call.
outliers_limit_low = 0
outliers_limit_up = 0


def identify_outliers(df):
    """Count, per column, the values lying outside the outlier limits.

    The limits are derived from the 5th and 95th percentiles of each column:
    the lower limit is the 5th percentile minus 1.5 times the spread between
    the two percentiles, the upper limit is the 95th percentile plus the same
    margin. (This is the 5-95 % spread, not the interquartile range.)

    Side effect: the computed limits are stored in the module-level
    ``outliers_limit_low`` and ``outliers_limit_up`` variables.

    Returns the number of out-of-limit values for each column.
    """
    global outliers_limit_low, outliers_limit_up

    lower_q = df.quantile(0.05)
    upper_q = df.quantile(0.95)
    margin = 1.5 * (upper_q - lower_q)

    outliers_limit_low = lower_q - margin
    outliers_limit_up = upper_q + margin

    too_low = df < outliers_limit_low
    too_high = df > outliers_limit_up
    return (too_low | too_high).sum()

In [None]:
# Count outliers per column in the kNN-imputed frame; this call also sets the
# module-level outliers_limit_low / outliers_limit_up used by the next cells.
identify_outliers(df_ml_na_knn)

In [None]:
# Display the lower limits stored by the last identify_outliers() call.
outliers_limit_low

In [None]:
# Display the upper limits stored by the last identify_outliers() call.
outliers_limit_up

In [None]:
def replace_outliers(df_original):
    """Return a copy of ``df_original`` with out-of-limit values capped.

    For every column, values below the column's entry in the module-level
    ``outliers_limit_low`` are replaced by that lower limit, and values above
    the column's entry in ``outliers_limit_up`` are replaced by that upper
    limit. Both limit objects are read from module scope and are indexed by
    column name, so they must have been populated beforehand.

    The input frame is left untouched.
    """
    capped = df_original.copy()

    for name in capped.columns:
        floor = outliers_limit_low[name]
        ceiling = outliers_limit_up[name]

        # Two passes, in this order: raise low values first, then lower the
        # high ones.
        values = np.where(capped[name] < floor, floor, capped[name])
        capped[name] = np.where(values > ceiling, ceiling, values)

    return capped

In [None]:
# Cap the kNN-imputed frame at the limits stored by identify_outliers().
df_ml_na_knn_notOutliers = replace_outliers(df_ml_na_knn)

In [None]:
# Re-count outliers on the capped frame.
# NOTE: this call recomputes and overwrites the module-level limits from the
# capped data.
identify_outliers(df_ml_na_knn_notOutliers)