# Importing modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import cluster, datasets
from tabulate import tabulate
from prettytable import PrettyTable
import random
from scipy import stats
%matplotlib inline

## Simple scatterplot

Construindo dados

In [None]:
# Seed NumPy's global generator so the three noise pools (and every plot
# derived from them) are identical on each Restart & Run All.
np.random.seed(42)

# Three pools of 100 Gaussian draws each: standard deviation 10, means 18, 33 and 50.
random_n1 = list(np.random.normal(18, 10, 100))
random_n2 = list(np.random.normal(33, 10, 100))
random_n3 = list(np.random.normal(50, 10, 100))

In [None]:
# Seed the stdlib generator so the sampled cloud is reproducible.
random.seed(42)

n_samples = 1000

# Offsets 0-99 are split into five bands of 20. Each band draws its noise
# from one pool, arranged symmetrically: n1, n2, n3, n2, n1.
band_pools = [random_n1, random_n2, random_n3, random_n2, random_n1]

x = []
y = []
for n in range(100):
    pool = band_pools[n // 20]
    for _ in range(n_samples):
        # x and y get independent draws from the same pool, shifted by the offset n.
        x.append(n + random.choice(pool))
        y.append(n + random.choice(pool))

### Plot simples

In [None]:
# Coordinates are passed by keyword: seaborn >= 0.12 no longer accepts x/y
# vectors positionally. The original call placed `y` on the horizontal axis,
# so that orientation is preserved.
f, ax = plt.subplots(figsize = (8,8))
sns.scatterplot(x = y, y = x, alpha = 1, linewidth = 0.1, ax = ax)

### Há algum padrão?

In [None]:
# Keyword x/y (positional vectors are rejected by seaborn >= 0.12); `y` stays
# on the horizontal axis as in the original call.
sns.jointplot(x = y, y = x, alpha = 1)

### Há três picos no histograma

In [None]:
# Same plot with almost fully transparent markers, so that only dense regions
# remain visible. Keyword x/y because seaborn >= 0.12 rejects positional vectors.
sns.jointplot(x = y, y = x, alpha = 0.002)

### Uma mudança sutil na visualização permite identificar padrões antes escondidos

## Como identificar padrões?

In [None]:
# Two concentric noisy rings (inner ring at half the outer radius).
# random_state pins the draw so downstream numbers are reproducible.
n_samples = 10000
X, y = datasets.make_circles(n_samples=n_samples, factor=.5,
                             noise=.05, random_state=42)

In [None]:
# Tidy frame: the two coordinates of X plus the generated label.
data = pd.DataFrame({'x': X[:, 0], 'y': X[:, 1], 'target': y})

In [None]:
# Plain scatter of the coordinates on an explicit 8x8 inch axes.
f, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=data, x='x', y='y', alpha=1, ax=ax)

In [None]:
# Joint view of x and y with small (s = 10), semi-transparent markers.
sns.jointplot(x = 'x', y = 'y',data = data, s = 10, alpha = 0.3)

### Como separar os dados em dois grupos?

### Quais métodos?

### Às vezes não precisamos utilizar métodos complexos e custosos computacionalmente

#### Compreender o problema é o primeiro passo para uma boa análise.

In [None]:
# distance

In [None]:
# Euclidean distance of every point from the origin: sqrt(x^2 + y^2).
data['dist'] = np.sqrt(np.square(data[['x', 'y']]).sum(axis=1))

In [None]:
# Density estimate of the distance column on a wide explicit axes.
f, ax = plt.subplots(figsize=(15, 8))
sns.kdeplot(data['dist'], ax=ax)

In [None]:
# label

In [None]:
# Colour by distance: points at or below the mean distance are blue, the rest red.
data['color'] = np.where(data['dist'] <= data['dist'].mean(), 'blue', 'red')

In [None]:
# Scatter coloured by the distance-based group; the legend is suppressed.
f, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(
    data=data,
    x='x',
    y='y',
    hue='color',
    s=10,
    alpha=0.5,
    legend=False,
    ax=ax,
)

In [None]:
# Predict 1 for points strictly closer to the origin than the mean distance, else 0.
# Vectorised: the original recomputed the mean for every row and passed a stray
# `1` as Series.map's `na_action`, which recent pandas rejects with ValueError.
data['yhat'] = (data['dist'] < data['dist'].mean()).astype(int)

In [None]:
def acc(target, yhat):
    """Return the percentage (0-100) of positions where `target` equals `yhat`.

    Parameters
    ----------
    target, yhat : one-dimensional sequences (list, NumPy array, pandas Series)
        Compared element by element, by position.

    Returns
    -------
    float
        Share of matching positions, multiplied by 100.

    Raises
    ------
    ValueError
        If the inputs have different lengths or are empty.
    """
    # np.asarray drops any pandas index, so the comparison is strictly
    # positional even when a Series carries a non-default index.
    target_values = np.asarray(target)
    yhat_values = np.asarray(yhat)

    if len(target_values) != len(yhat_values):
        raise ValueError(
            f'target and yhat must have the same length '
            f'({len(target_values)} != {len(yhat_values)})'
        )
    if len(target_values) == 0:
        raise ValueError('cannot compute accuracy of empty inputs')

    hits = int(np.count_nonzero(target_values == yhat_values))
    return (hits / len(target_values)) * 100

In [None]:
# Arguments follow the signature acc(target, yhat); the original call had them swapped.
acc(data['target'], data['yhat'])

### Com uma distância euclidiana conseguimos 100% de acurácia!

## Iris dataset

### Load data

In [None]:
# Read the CSV and discard the `Id` column in one chained expression.
iris = pd.read_csv('./Iris.csv').drop(columns='Id')

In [None]:
# Preview the first rows of the iris frame.
iris.head()

In [None]:
# Report the frame dimensions: rows are printed as samples, then the column count.
n_rows, n_cols = iris.shape
print(f'''
Número de amostras: {n_rows}
Número de colunas : {n_cols}''')

### Que tipo de dados?

In [None]:
# Column names: list every column of the iris frame next to its position.
table = PrettyTable(['Index', 'Column Name'])
for position, column_name in enumerate(iris.columns):
    table.add_row([position, column_name])
print(table)

In [None]:
# Two-column table: column name and its pandas dtype.
dtype = (
    iris.dtypes
    .rename_axis('Column Name')
    .reset_index(name='data type')
)
print(tabulate(dtype, headers='keys', tablefmt='psql'))

### O que é species?

In [None]:
# Species names: one row per distinct value of the `Species` column.
table = PrettyTable(['Index', 'Specie'])
for position, species_name in enumerate(iris['Species'].unique()):
    table.add_row([position, species_name])
print(table)

**Três espécies diferentes**

### Quantidade de amostras por espécies

In [None]:
# Number of rows per species, named directly while resetting the index.
qtd_species = iris.groupby('Species').size().reset_index(name='quantidade')
print(tabulate(qtd_species, headers='keys', tablefmt='psql'))

**Dataset bem distribuído entre as espécies**

In [None]:
# Summary statistics of the numeric columns, transposed so each column becomes a row.
iris.describe().T

### Gráfico de barras

In [None]:
# One bar panel per measurement (the first four columns), sharing the species axis.
fig, ax = plt.subplots(1, 4, figsize=(17, 7), sharey=True)
for panel, col in zip(ax, iris.columns[:4]):
    sns.barplot(data=iris, x=col, y='Species', ax=panel)



### Problemas com o gráfico de barras

In [None]:
# Seed NumPy so the comparison below (plots and p-values) is reproducible.
np.random.seed(42)

# Group 1: a single Gaussian centred at 100.
random_bar1 = list(np.random.normal(100, 10, 100))
# Group 2: a bimodal mix, half centred at 50 and half at 150.
random_bar2 = list(np.random.normal(50, 15, 50)) + list(np.random.normal(150, 15, 50))


In [None]:
# Long format: one row per observation, with `variable` (1 or 2) and `value`.
random_df = pd.melt(pd.DataFrame({1: random_bar1, 2: random_bar2}))

In [None]:
# Bar plot of `value` per group; seaborn's barplot draws one aggregated bar per category.
sns.barplot(x = 'variable', y = 'value', data = random_df)

#### Aparentemente as distribuições são semelhantes

In [None]:
# Same data as a box plot, one box per group.
sns.boxplot(x = 'variable', y = 'value', data = random_df)

In [None]:
# Same data as a violin plot, one violin per group.
sns.violinplot(x = 'variable', y = 'value', data = random_df)

In [None]:
# Overlay the density estimate of each group on the same axes.
for group in (1, 2):
    sns.kdeplot(data = random_df.loc[random_df.variable == group, 'value'], label = f'Value {group}')
# Recent seaborn releases do not draw a legend for `label=` on their own;
# ask matplotlib for it explicitly so the two curves can be told apart.
plt.legend()

### A importância de visualizar se os dados são normais ou não

Se assumirmos que a distribuição é normal, o teste mais óbvio seria o teste t.

Porém, uma pergunta melhor seria se as distribuições são semelhantes ou não, assumindo que elas não são normais.

Aplicando dessa forma o teste de Kolmogorov–Smirnov

In [None]:
# p-values of the two-sample t-test and the two-sample Kolmogorov–Smirnov test.
ttest_pvalue = stats.ttest_ind(random_bar1, random_bar2).pvalue
ks_pvalue = stats.ks_2samp(random_bar1, random_bar2).pvalue

print(f't-test                 : {ttest_pvalue:.2f}')
print(f'Kolmogorov–Smirnov test: {ks_pvalue:.2e}')

### Histogramas

In [None]:
# Density estimate of sepal length over all species, on an explicit wide axes.
fig, ax = plt.subplots(figsize=(17, 5))
sns.kdeplot(iris['SepalLengthCm'], ax=ax)

In [None]:
# One panel per measurement; inside each panel, one density curve per species.
fig, ax = plt.subplots(1,4, figsize = (17,5))
species_names = iris.Species.unique()  # computed once instead of on every call
for n, col in enumerate(list(iris.columns)[0:4]):
    for species in species_names:
        sns.kdeplot(data = iris.loc[iris.Species == species, col], ax = ax[n], label = species)
    ax[n].set_title(col)
    # Recent seaborn releases do not show `label=` unless a legend is requested.
    ax[n].legend()

### Boxplots, violinplots e swarmplots

In [None]:
# Box plot per measurement and species; outlier markers are hidden.
fig, ax = plt.subplots(1, 4, figsize=(17, 7), sharey=True)
for panel, col in zip(ax, iris.columns[:4]):
    sns.boxplot(data=iris, x=col, y='Species', ax=panel, showfliers=False)



In [None]:
# Violin plot per measurement and species.
# `showfliers` is a box-plot option copied over from the previous cell; violin
# plots have no fliers, and newer seaborn forwards unknown keywords to
# matplotlib, where it is not a valid argument. It is therefore dropped.
fig, ax = plt.subplots(1,4, figsize = (17,7), sharey = True)
for n, col in enumerate(list(iris.columns)[0:4]):
    sns.violinplot( x = col, y = 'Species', data = iris, ax = ax[n])



In [None]:
# Swarm plot per measurement and species: every observation is drawn as a point.
fig, ax = plt.subplots(1, 4, figsize=(17, 7), sharey=True)
for panel, col in zip(ax, iris.columns[:4]):
    sns.swarmplot(data=iris, x=col, y='Species', ax=panel)



In [None]:
# Clustered heatmap of the measurements (species label removed); only rows are
# clustered because col_cluster is False.
a = sns.clustermap(iris.drop(columns = 'Species'), col_cluster = False)

In [None]:
# Clustered heatmap of the raw measurements with a colour strip per row that
# identifies the species (r, b, g in order of first appearance).
species_names = iris['Species'].unique()
lut = {name: colour for name, colour in zip(species_names, "rbg")}
row_colors = iris['Species'].map(lut)
g = sns.clustermap(iris.drop(columns = 'Species'), row_colors = row_colors, col_cluster = False)

# Zero-width, zero-height bars draw nothing; they only register one legend
# entry per species on the (otherwise unused) column-dendrogram axes.
legend_ax = g.ax_col_dendrogram
for label in species_names:
    legend_ax.bar(0, 0, color = lut[label], label = label, linewidth = 0)
legend_ax.legend(loc = "center", ncol = 6, fontsize = 14)

### Normalizando os dados por z-score

**z = (x – μ) / σ**

In [None]:
# Work on a copy so the original iris frame is left untouched.
iris_norm = iris.copy()

In [None]:
# Add a `<column>_z` z-score column for each of the first four columns.
# The slice is taken before the loop, so the columns added here are not revisited.
for col in iris_norm.columns[:4]:
    iris_norm[f'{col}_z'] = stats.zscore(iris_norm[col])


In [None]:
# Show the column index of the normalised frame.
iris_norm.columns

In [None]:
# Summary statistics of the normalised frame, one row per column.
iris_norm.describe().T

In [None]:
# Row 0: raw measurements; row 1: their z-scored versions.
# The row index is called `row` rather than `y`, so the label array `y`
# created earlier in the notebook is not overwritten, and the columns are
# named explicitly instead of relying on the position of the last four columns.
feature_cols = list(iris.columns[:4])
panels = [(iris, ''), (iris_norm, '_z')]

fig, ax = plt.subplots(2,4, figsize = (17,7), sharey = True)
for row, (frame, suffix) in enumerate(panels):
    for n, col in enumerate(feature_cols):
        sns.boxplot( x = col + suffix, y = 'Species', data = frame, ax = ax[row][n],showfliers=False)

In [None]:
# Row 0: raw measurements; row 1: their z-scored versions.
# The row index is called `row` rather than `y`, so the label array `y`
# created earlier in the notebook is not overwritten, and the columns are
# named explicitly instead of relying on the position of the last four columns.
feature_cols = list(iris.columns[:4])
panels = [(iris, ''), (iris_norm, '_z')]

fig, ax = plt.subplots(2,4, figsize = (17,7), sharey = True)
for row, (frame, suffix) in enumerate(panels):
    for n, col in enumerate(feature_cols):
        sns.swarmplot( x = col + suffix, y = 'Species', data = frame, ax = ax[row][n])

In [None]:
# Clustered heatmap of the z-scored columns only, with a species colour strip.
raw_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
               'Species']
lut = {species: colour for species, colour in zip(iris['Species'].unique(), "rbg")}
row_colors = iris_norm['Species'].map(lut)
g = sns.clustermap(iris_norm.drop(columns = raw_columns), row_colors = row_colors, col_cluster = False)

# Zero-width, zero-height bars draw nothing; they only register one legend
# entry per species on the column-dendrogram axes.
legend_ax = g.ax_col_dendrogram
for label in iris_norm['Species'].unique():
    legend_ax.bar(0, 0, color = lut[label], label = label, linewidth = 0)
legend_ax.legend(loc = "center", ncol = 6, fontsize = 14)

### Normalizando de forma diferente

In [None]:
# Second normalisation: divide each of the first four columns by its own sum
# and store the result as `<column>_norm` on a fresh copy of iris.
iris_norm2 = iris.copy()
for col in iris_norm2.columns[:4]:
    iris_norm2[f'{col}_norm'] = iris_norm2[col].div(iris_norm2[col].sum())


In [None]:
# Preview the first rows, including the new `_norm` columns.
iris_norm2.head()

In [None]:
# Summary statistics of the sum-normalised frame, one row per column.
iris_norm2.describe().T

In [None]:
# Row 0: raw measurements; row 1: z-scores; row 2: sum-normalised values.
# The row index is called `row` rather than `y`, so the label array `y`
# created earlier in the notebook is not overwritten, and the columns are
# named explicitly instead of being derived from the last four columns of iris_norm.
feature_cols = list(iris.columns[:4])
panels = [(iris, ''), (iris_norm, '_z'), (iris_norm2, '_norm')]

fig, ax = plt.subplots(3,4, figsize = (17,10), sharey = True)
for row, (frame, suffix) in enumerate(panels):
    for n, col in enumerate(feature_cols):
        sns.boxplot( x = col + suffix, y = 'Species', data = frame, ax = ax[row][n],showfliers=False)

In [None]:
# Row 0: raw measurements; row 1: z-scores; row 2: sum-normalised values.
# The row index is called `row` rather than `y`, so the label array `y`
# created earlier in the notebook is not overwritten, and the columns are
# named explicitly instead of being derived from the last four columns of iris_norm.
feature_cols = list(iris.columns[:4])
panels = [(iris, ''), (iris_norm, '_z'), (iris_norm2, '_norm')]

fig, ax = plt.subplots(3,4, figsize = (17,10), sharey = True)
for row, (frame, suffix) in enumerate(panels):
    for n, col in enumerate(feature_cols):
        sns.swarmplot( x = col + suffix, y = 'Species', data = frame, ax = ax[row][n])

In [None]:
# Clustered heatmap of the sum-normalised columns with a species colour strip.
# Row colours and legend labels now come from iris_norm2 itself, the frame
# being plotted; the original read them from iris_norm, tying this cell to an
# unrelated frame.
lut = dict(zip(iris_norm2['Species'].unique(), "rbg"))
row_colors = iris_norm2['Species'].map(lut)
g = sns.clustermap(iris_norm2.drop(columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species']), col_cluster = False, row_colors= row_colors)

# Zero-width, zero-height bars draw nothing; they only register one legend
# entry per species on the column-dendrogram axes.
for label in iris_norm2['Species'].unique():
    g.ax_col_dendrogram.bar(0, 0, color=lut[label],
                            label=label, linewidth=0)
g.ax_col_dendrogram.legend(loc="center", ncol=6, fontsize = 14)

### O início de uma análise multivariada...

In [None]:
# Pairwise relationships between the iris columns, coloured by species.
sns.pairplot(iris, hue="Species")

By default, this function will create a grid of Axes such that each variable in data will be shared in the y-axis across a single row and in the x-axis across a single column. The diagonal Axes are treated differently, drawing a plot to show the univariate distribution of the data for the variable in that column.