In [25]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Initial inspection & data cleaning**

In [26]:
import matplotlib.pyplot as plt

In [27]:
# Load the exchange-rate CSV, telling pandas which placeholder strings
# should be parsed as missing values (NaN).
missing_values = ['NaN', '--', 'na', 'N/A', 'ND']
df = pd.read_csv(
    '/kaggle/input/forex-rates/Foreign_Exchange_Rates.csv',
    na_values=missing_values,
).iloc[:, 1:]  # drop the first column only; every other column is kept
df

In [28]:
# Show the dtype pandas assigned to each column of df.
df.dtypes

In [29]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

In [30]:
# Line plot of every column against 'Time Serie'; the legend is anchored at the
# right-hand edge of the axes (vertically centred) so it does not cover the lines.
ax = df.set_index('Time Serie').plot()
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

In [31]:
# Replace the NaN values in every column after the first with that column's mean.
index_names = list(df)[1:]
for index in index_names:
    # Assign the filled Series back to the frame. Calling
    # `df[index].fillna(..., inplace=True)` is chained assignment through an
    # inplace method: it operates on an intermediate Series, emits a
    # FutureWarning on pandas 2.x, and leaves `df` unchanged under Copy-on-Write.
    df[index] = df[index].fillna(df[index].mean())

In [32]:
# Display df again to inspect it after the mean-imputation cell.
df

**Outlier detection & data visualisation**

In [33]:
import seaborn as sns

In [37]:
# Per-column outlier check: one boxplot per column plus the percentage of rows
# lying outside the 1.5 * IQR fences.
index_names = list(df)[1:]
rates_by_date = df.set_index('Time Serie')  # loop-invariant, so build it once
for index in index_names:
    # Give every column its own figure. Without an explicit `ax`, pandas draws on
    # the current axes, so successive iterations would paint all boxes on top of
    # each other in a single unreadable plot.
    fig, ax = plt.subplots()
    rates_by_date.boxplot(column=[index], ax=ax).set(xticklabels=[], title=index)
    plt.show()

    # inter-quartile range calculation
    q1 = df[index].quantile(0.25)
    q3 = df[index].quantile(0.75)
    iqr = q3 - q1

    # percentage of rows below the lower fence or above the upper fence
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    p_out = (((df[index] < lower_fence) | (df[index] > upper_fence)).sum() / len(df)) * 100
    print('Percentage of outliers in '+ index + ' FOREX data : ', p_out, ' %')

**Normalization**

In [39]:
# Overlay the kernel density estimate of every raw column on shared axes,
# with the x-axis limited to [0, 25].
index_names = df.columns[1:].tolist()
for index in index_names:
    density_ax = sns.kdeplot(df[index])
    density_ax.set_xlim([0, 25])

In [36]:
# Min-max normalization: rescale each column to [0, 1] and overlay the KDEs.
index_names = df.columns[1:].tolist()
for index in index_names:
    value = df[index]
    low, high = value.min(), value.max()
    normal = (value - low) / (high - low)
    density_ax = sns.kdeplot(normal)
    density_ax.set_xlim([0, 1.3])

In [41]:
# Standardise each column to z-scores (subtract the mean, divide by the standard
# deviation) and overlay the resulting KDEs.
index_names = list(df)[1:]
for index in index_names:
    value = df[index]
    zscore = (value - value.mean())/value.std()
    # z-scores are centred on 0 by construction, so use a symmetric window;
    # a [0, 5] range would crop every below-average observation out of the plot.
    sns.kdeplot(zscore).set_xlim([-5, 5])