# Table of Contents 


- [The IRIS dataset](#Data-Exploration-(numerical-data):-The-IRIS-dataset):
    - [Load the dataset](#Load-the-dataset)
    - [Explore the dataset: Descriptive statistics](#Explore-the-dataset:-descriptive-statistics)
    - [Explore the dataset: Visualization](#Explore-the-dataset:-Visualization)
- [The BREAST-CANCER dataset](#Data-Exploration-(categorical-data):-the-BREAST-CANCER-dataset):
    - [Load the dataset](#Load-the-Dataset)
    - [Explore the dataset: Descriptive statistics](#Explore-the-dataset:-Descriptive-statistics)
    - [Explore the dataset: Visualization](#Explore-the-dataset:-Visualization)
    


In [None]:
import os
import pandas as pd

# Data Exploration (numerical data): The IRIS dataset

This is perhaps the best known database to be found in the pattern recognition literature. 

- The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant.
- There are four numeric attributes and the class attribute:
    1. sepal length in cm   
    2. sepal width in cm   
    3. petal length in cm   
    4. petal width in cm   
    5. class: {Iris Setosa, Iris Versicolour, Iris Virginica}
    

![irisdataset](https://miro.medium.com/v2/resize:fit:1400/format:webp/1*3svxt5IDoVHkGLWvf4IAqg.png)

## Load the dataset

Common pitfalls in `pd.read_csv`:
- what is the "sep" character
- is there any header?
- is there any index column?
- how are missing/unknown values denoted?

In [None]:
# Assemble the CSV location with os.path.join so the separator matches the
# host OS, then parse it with read_csv's default options.
iris_csv_path = os.path.join('dataset', 'iris.csv')
iris_df = pd.read_csv(iris_csv_path)

## Explore the dataset: descriptive statistics


In [None]:
iris_df

In [None]:
iris_df.shape

In [None]:
# Raise the row display limit to 150, then show the DataFrame again.
pd.options.display.max_rows = 150
iris_df

In [None]:
iris_df.head(10)

In [None]:
iris_df.head(10).T

Check if there is any missing value

In [None]:
iris_df.isna().sum()

In [None]:
iris_df.describe()

In [None]:
iris_df.drop('class', axis = 1).describe()

In [None]:
iris_df.info()

In [None]:
iris_df['class'].value_counts()

## Explore the dataset: Visualization

In [None]:
from matplotlib import pyplot as plt

### Histogram and boxplots with matplotlib


In [None]:
# Histogram of one column with matplotlib's default binning.
column = 'sepallength'
plt.figure()
plt.hist(iris_df[column])
plt.xlabel(column)
plt.ylabel('occurrences')
plt.title('histogram of sepal length attribute')

In [None]:
# Two histograms of the same column on a shared y axis: the left panel keeps
# matplotlib's default bin count (bins=None), the right one uses 20 bins.
f, axes = plt.subplots(1, 2, sharey=True, figsize=(10, 4))
panels = [
    (None, 'histogram of sepal length attribute (default # bins)'),
    (20, 'histogram of sepal length attribute (20 bins)'),
]
for panel, (n_bins, panel_title) in zip(axes, panels):
    panel.hist(iris_df['sepallength'], bins=n_bins)
    panel.set_xlabel('sepal length')
    panel.set_title(panel_title)
# The y axis is shared, so only the left panel carries the label.
axes[0].set_ylabel('occurrences')
plt.show()

### Histogram and boxplots with pandas
[Check the user guide](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html)


In [None]:
# Histogram of every column except 'class', laid out on a single row.
numeric_attributes = iris_df.drop(columns='class')
numeric_attributes.hist(bins=7, layout=(1, 4), figsize=(20, 4))
plt.show()

In [None]:
# Histogram of a single column through the Series.plot accessor.
sepal_length = iris_df['sepallength']
sepal_length.plot.hist(bins=7,
                       figsize=(10, 6),
                       title='iris hist plot: sepallength')
plt.show()

In [None]:
# One sepal-length histogram per value of 'class', side by side.
per_class_hist_options = dict(column='sepallength',
                              by='class',
                              bins=7,
                              layout=(1, 3),
                              figsize=(15, 4))
iris_df.hist(**per_class_hist_options)
plt.show()

In [None]:
# Box plot of sepal length, grouped by 'class'.
iris_df.boxplot('sepallength', by='class', figsize=(10, 6))
plt.show()

In [None]:
# One box plot per column (excluding 'class'), each panel with its own y scale.
numeric_attributes = iris_df.drop(columns='class')
numeric_attributes.plot.box(subplots=True,
                            layout=(1, 4),
                            figsize=(20, 4),
                            sharey=False)
plt.show()

In [None]:
# Same per-column box plots, this time with a y axis shared across panels.
numeric_attributes = iris_df.drop(columns='class')
numeric_attributes.plot.box(subplots=True,
                            layout=(1, 4),
                            figsize=(16, 4),
                            sharey=True)
plt.show()

In [None]:
# All columns except 'class' as box plots on a single set of axes.
numeric_attributes = iris_df.drop(columns='class')
numeric_attributes.plot.box()
plt.show()

### Scatter plot with matplotlib

In [None]:
# Map the class codes 1, 2, 3 to the species names used as legend labels.
dict_names = dict(enumerate(('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'),
                            start=1))
dict_names.values()

In [None]:
x_index = 0
y_index = 2
# One scatter call per class code, so each class gets its own colour and
# its own legend label.
for curr_class, color in zip(range(1, 4), 'rgb'):
    class_rows = iris_df[iris_df["class"] == curr_class]
    scatterplot = plt.scatter(class_rows.iloc[:, x_index],
                              class_rows.iloc[:, y_index],
                              c=color,
                              # marker size or style can be customized here
                              label=dict_names[curr_class])
# Axis labels come from the column names at the chosen positions.
plt.xlabel(iris_df.columns[x_index])
plt.ylabel(iris_df.columns[y_index])
plt.legend()
plt.show()

In [None]:
# Alternative: a single scatter call coloured by the 'class' column. Plotting
# is simpler, but the legend handles have to be extracted explicitly.
x_index, y_index = 0, 2

scatterplot = plt.scatter(iris_df.iloc[:, x_index],
                          iris_df.iloc[:, y_index],
                          c=iris_df['class'])
legend_handles = scatterplot.legend_elements()[0]
plt.xlabel(iris_df.columns[x_index])
plt.ylabel(iris_df.columns[y_index])
plt.legend(handles=legend_handles, labels=dict_names.values())
plt.show()

In [None]:
# 4x4 grid of pairwise scatter plots: the panel at [ix, iy] puts column ix on
# the x axis and column iy on the y axis, coloured by the 'class' column.
fig, axes = plt.subplots(4, 4, figsize=(20, 20))
class_values = iris_df['class']
for ix in range(4):
    x_values = iris_df.iloc[:, ix]
    x_name = iris_df.columns[ix]
    for iy in range(4):
        panel = axes[ix, iy]
        scatterplot = panel.scatter(x_values,
                                    iris_df.iloc[:, iy],
                                    c=class_values)
        panel.set_xlabel(x_name)
        panel.set_ylabel(iris_df.columns[iy])
        panel.legend(handles=scatterplot.legend_elements()[0],
                     labels=dict_names.values())
        

### Scatter plot with pandas

In [None]:
# Scatter plot through the DataFrame.plot accessor, coloured by 'class'
# with the viridis colormap and no colorbar.
iris_df.plot.scatter(x='sepallength',
                     y='petallength',
                     c='class',
                     colormap='viridis',
                     colorbar=False,
                     figsize=(6, 6),
                     title='iris scatter plot')

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of every column except 'class', with histograms on
# the diagonal; points are coloured by the 'class' column.
numeric_attributes = iris_df.drop(columns='class')
class_values = iris_df['class']
scatter_matrix(numeric_attributes,
               alpha=1,
               figsize=(16, 16),
               diagonal='hist',
               c=class_values)
plt.show()

### Plotting with Seaborn

In [None]:
import seaborn as sns

In [None]:
# Left panel: sepal length vs petal length coloured by class.
# Right panel: histogram over the 'class' column itself.
f, axes = plt.subplots(1, 2, figsize=(8, 4))
scatter_panel, count_panel = axes

sns.scatterplot(data=iris_df, x="sepallength", y="petallength",
                hue="class", ax=scatter_panel)

sns.histplot(data=iris_df, x="class", hue="class",
             legend=False, ax=count_panel)
f.tight_layout()

In [None]:
# Joint plot of sepal length against petal length.
joint_axes = dict(x="sepallength", y="petallength")
sns.jointplot(data=iris_df, **joint_axes)

Assigning a hue variable will add conditional colors to the scatterplot and draw separate density curves on the marginal axes:

Internally, `jointplot` then uses `kdeplot()`, which plots univariate or bivariate distributions using kernel density estimation.
- A **kernel density estimate** (KDE) plot is a method for visualizing the distribution of observations in a dataset, analogous to a histogram. KDE represents the data using a continuous probability density curve in one or more dimensions.


In [None]:
# Same joint plot, with 'class' passed as the hue variable.
joint_axes = dict(x="sepallength", y="petallength")
sns.jointplot(data=iris_df, hue='class', **joint_axes)


In [None]:
# Pairwise relationships between the columns, with 'class' as the hue variable.
sns.pairplot(iris_df, hue="class")


In [None]:
# Switch seaborn to the whitegrid theme, then draw vertical box plots for
# every column except the last one.
sns.set_theme(style="whitegrid")
all_but_last_column = iris_df.iloc[:, :-1]
ax = sns.boxplot(data=all_but_last_column, orient="v")

### Correlation Analysis

`SciPy` is a collection of mathematical algorithms and convenience functions built on the NumPy extension of Python. It adds significant power to the interactive Python session by providing the user with high-level commands and classes for manipulating and visualizing data.

SciPy features include, but are not limited to:
- statistics
- linear algebra
- fourier transform
- optimization algorithm
- ...


In [None]:
from scipy.stats import pearsonr

# Pearson correlation between the sepal length and sepal width columns.
pearsonr(iris_df['sepallength'], iris_df['sepalwidth'])

The `pearsonr` function returns:
- Pearson product-moment correlation coefficient.
- The p-value associated with the chosen alternative: it roughly indicates the probability of an uncorrelated system producing datasets that have a Pearson correlation at least as extreme as the one computed from these datasets.

Pearson correlation coefficient can also be obtained with `pandas.DataFrame.corr()`

In [None]:
iris_df.drop('class', axis = 1).corr()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns except
# 'class'. The colour scale is pinned to [-1, 1].
correlations = iris_df.drop(columns='class').corr()
f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlations,
            annot=True,
            fmt='.2f',
            linewidths=.5,
            vmin=-1,
            vmax=1,
            cmap="coolwarm",
            ax=ax)
plt.show()

# Data Exploration (categorical data): the BREAST-CANCER dataset



available at [UCI database](https://archive.ics.uci.edu/ml/datasets/breast+cancer).


This is one of three domains provided by the Oncology Institute that has repeatedly appeared in the machine learning literature. (See also lymphography and primary-tumor.)
 
This data set includes 286 instances (201 of one class, 85 of another class).  The instances are described by 9 attributes, some of which are ordinal and some are nominal.
 
Attribute information

| column | values |
| --- | --- |
| Class | no-recurrence-events, recurrence-events |
| age | 10-19, 20-29, 30-39, 40-49, 50-59, 60-69, 70-79, 80-89, 90-99|
| menopause | lt40, ge40, premeno|
| tumor-size | 0-4, 5-9, 10-14, 15-19, 20-24, 25-29, 30-34, 35-39, 40-44, 45-49, 50-54, 55-59|
| inv-nodes | 0-2, 3-5, 6-8, 9-11, 12-14, 15-17, 18-20, 21-23, 24-26, 27-29, 30-32, 33-35, 36-39|
| node-caps | yes, no|
| deg-malig | 1, 2, 3|
| breast | left, right|
| breast-quad | left-up, left-low, right-up, right-low, central|
| irradiat | yes, no|
 
There are 9 Missing Attribute Values (denoted by "?") 


## Load the Dataset

In [None]:
# First attempt: load the CSV with read_csv's default options.
breast_cancer_path = os.path.join('dataset', 'breast-cancer.csv')
df = pd.read_csv(breast_cancer_path)

In [None]:
df

In [None]:
df.info()

**Issue #1**: values are *quoted* using character " **'** "

In [None]:
# Reload, declaring the single quote as the quoting character.
breast_cancer_path = os.path.join('dataset', 'breast-cancer.csv')
df = pd.read_csv(breast_cancer_path, quotechar="'")

In [None]:
df

In [None]:
df.head(3).T

**Issue #2**: are we handling missing values as such?

In [None]:
df["node-caps"].value_counts()

In [None]:
# Reload once more: single-quote quoting, and '?' parsed as a missing value.
read_options = dict(quotechar="'", na_values='?')
df = pd.read_csv(os.path.join('dataset', 'breast-cancer.csv'), **read_options)


In [None]:
df["node-caps"].value_counts()

In [None]:
df["node-caps"].value_counts(dropna = False)

## Explore the dataset: Descriptive statistics


In [None]:
df

In [None]:
df.head(10)

In [None]:
df.head(2).T

In [None]:
df.shape

Check if there is any missing value

In [None]:
df.isna().sum()

In [None]:
df[df.isna().any(axis=1)]

In [None]:
df.age.unique()

In [None]:
df.info()

Most columns are recognized as generic "object" (text or mixed numeric and non-numeric values).
Actually, we know from the documentation that we are dealing with categorical variables.

`Pandas` has a **categorical** data type, which may be useful in the following cases:
- A string variable consisting of only a few different values. Converting such a string variable to a categorical variable will save some memory.
- The lexical order of a variable is not the same as the logical order (“one”, “two”, “three”). By converting to a categorical and specifying an order on the categories, sorting and min/max will use the logical order instead of the lexical order (see the [pandas user guide on categorical data](https://pandas.pydata.org/docs/user_guide/categorical.html)).
- As a signal to other Python libraries that this column should be treated as a categorical variable (e.g. to use suitable statistical methods or plot types).

`Categoricals` are a pandas data type corresponding to categorical variables in statistics. A categorical variable takes on a limited, and usually fixed, number of possible values (categories). Examples are gender, social class, blood type, country affiliation, observation time or rating via Likert scales.

In contrast to statistical categorical variables, categorical data might have an order (e.g. 'strongly agree' vs 'agree' or 'first observation' vs. 'second observation'), but numerical operations (additions, divisions, …) are not possible.



In [None]:
df.describe()

By default, describe considers only numeric data

In [None]:
df.describe(include = 'object')

In [None]:
df.describe(include = 'all') # mixed data type: description still supported

We can cast our dataframe to categorical, as we know that this is the case.

In [None]:
# Cast the whole DataFrame (astype is called on df, not on a single column)
# to pandas' 'category' dtype and rebind df to the result.
df = df.astype('category')

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Remove the row display limit, then show the last 80 rows after sorting by 'inv-nodes'.
pd.options.display.max_rows = None
df.sort_values('inv-nodes').tail(80)

**Another issue**: we are not using the proper order, as the lexical order is not the same as the logical one: for example, logically (0-2 < 3-5 < 12-14) but lexically ("0-2" < "12-14" < "3-5")

In [None]:
from pandas.api.types import CategoricalDtype

# Inv-nodes: the number (range 0 - 39) of axillary lymph nodes
# that contain metastatic breast cancer visible on histological examination.
# The levels below are copied from the dataset description ("attribute
# information") at the beginning of this notebook, listed in logical order.
inv_nodes_levels = [
    "0-2", "3-5", "6-8", "9-11",
    "12-14", "15-17", "18-20",
    "21-23", "24-26", "27-29",
    "30-32", "33-35", "36-39",
]
categories = CategoricalDtype(categories=inv_nodes_levels, ordered=True)
df['inv-nodes'] = df['inv-nodes'].astype(categories)


In [None]:
df.info()

In [None]:
df.sort_values(by = 'inv-nodes').tail(80)

## Explore the dataset: Visualization

*Barplot* and *pie-chart* in pandas (shown just for an attribute, but applies to all)

In [None]:
# Frequency of each 'breast-quad' value (missing values counted too), as a bar chart.
value_count = df['breast-quad'].value_counts(dropna=False)
value_count.plot.bar()
plt.title('breast-quad')
plt.show()


In [None]:
# The same counts as a pie chart.
value_count.plot.pie()
plt.show()

*Barplot* in seaborn.

`seaborn` axes-level functions for [plotting categorical data](https://seaborn.pydata.org/tutorial/categorical.html):
- categorical scatter plots
    - `stripplot()`
    - `swarmplot()`
- distribution plots
    - `boxplot()`
    - `violinplot()`
    - `boxenplot()`
- estimate plots
    - `pointplot()`
    - `barplot()`
    - `countplot()`

In [None]:
# Count of rows per 'age' value; the same column drives the hue, so the
# legend is switched off.
attribute = "age"
sns.countplot(data=df,
              x=attribute,
              hue=attribute,
              palette="pastel",
              legend=False)
plt.show()

In [None]:
# Count of rows per 'age' value, split by 'Class'.
sns.countplot(data=df, x="age", hue="Class", palette="pastel")
plt.show()

`seaborn` also provides a figure-level interface, `catplot()`, that gives unified higher-level access to the axes-level functions.

In [None]:
# Figure-level count plot; passing the variable as x gives vertical bars.
count_style = dict(kind="count", palette="pastel", edgecolor=".5")
sns.catplot(data=df, x="age", hue="Class", **count_style)

In [None]:
# Figure-level count plot; passing the variable as y gives horizontal bars.
count_style = dict(kind="count", palette="pastel", edgecolor=".5")
sns.catplot(data=df, y="node-caps", hue="Class", **count_style)

In [None]:
# Horizontal count plot of 'inv-nodes', split by 'Class'.
count_style = dict(kind="count", palette="pastel", edgecolor=".5")
sns.catplot(data=df, y="inv-nodes", hue="Class", **count_style)

Notice the effect of customizing the order and domain of the categorical variable.
- restore the original dataset
- plot with the same catplot statement

In [None]:
# Re-read the CSV (single-quote quoting, '?' as missing), which replaces the
# DataFrame that was cast to categorical dtypes.
original_csv = os.path.join('dataset', 'breast-cancer.csv')
df = pd.read_csv(original_csv, na_values='?', quotechar="'")

In [None]:
# Same horizontal count plot of 'inv-nodes', drawn from the current df.
inv_nodes_plot_options = dict(y="inv-nodes",
                              hue="Class",
                              kind="count",
                              palette="pastel",
                              edgecolor=".5")
sns.catplot(data=df, **inv_nodes_plot_options)

## Aside: Anscombe Quartet

In [None]:
# source: https://matplotlib.org/stable/gallery/specialty_plots/anscombe.html
import matplotlib.pyplot as plt
import numpy as np

# x values: datasets I-III share the same list, dataset IV has its own.
x = [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]
x4 = [8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8]

# y values, one list per dataset.
y1 = [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]
y2 = [9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74]
y3 = [7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73]
y4 = [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89]

# Roman-numeral label -> (x values, y values), in insertion order I..IV.
datasets = dict(zip(('I', 'II', 'III', 'IV'),
                    ((x, y1), (x, y2), (x, y3), (x4, y4))))

In [None]:
# Draw the four datasets as scatter plots in a 2x2 grid sharing both axes.
fig, axs = plt.subplots(2, 2,
                        sharex = True,
                        sharey = True,
                        figsize = (6, 6),
                        gridspec_kw = {'wspace': 0.08, 'hspace': 0.08})
# Axes are shared, so setting limits and ticks on one panel applies to all.
axs[0, 0].set(xlim = (0, 20), ylim = (2, 14))
axs[0, 0].set(xticks = (0, 10, 20), yticks = (4, 8, 12))

# The loop targets are named xs/ys (not x/y) so that iterating does not
# rebind the notebook-level `x` list defined together with `datasets`.
for ax, (label, (xs, ys)) in zip(axs.flat, datasets.items()):
    ax.grid()
    # Dataset label in the top-left corner, positioned in axes coordinates.
    ax.text(0.1, 0.9, label, fontsize = 20, transform = ax.transAxes, va = 'top')
    ax.tick_params(direction = 'in', top = True, right = True)
    ax.plot(xs, ys, 'o')

plt.show()

What’s special in these scatter plots?

In [None]:
# Same 2x2 grid as before, now with a fitted regression line and a small
# statistics box (mean, standard deviation, correlation) on every panel.
fig, axs = plt.subplots(2, 2,
                        sharex = True,
                        sharey = True,
                        figsize = (6, 6),
                        gridspec_kw = {'wspace': 0.08, 'hspace': 0.08})
# Axes are shared, so setting limits and ticks on one panel applies to all.
axs[0, 0].set(xlim = (0, 20), ylim = (2, 14))
axs[0, 0].set(xticks = (0, 10, 20), yticks = (4, 8, 12))

# The loop targets are named xs/ys (not x/y) so that iterating does not
# rebind the notebook-level `x` list defined together with `datasets`.
for ax, (label, (xs, ys)) in zip(axs.flat, datasets.items()):
    ax.grid()
    ax.text(0.1, 0.9, label, fontsize = 20, transform = ax.transAxes, va = 'top')
    ax.tick_params(direction = 'in', top = True, right = True)
    ax.plot(xs, ys, 'o')

    # ---- new part (still inside the loop): regression line + statistics ----
    # Degree-1 least-squares fit; polyfit returns the highest power first.
    p1, p0 = np.polyfit(xs, ys, deg = 1)  # slope, intercept
    ax.axline(xy1 = (0, p0), slope = p1, color = 'r', lw = 2)

    # Text box with the mean and standard deviation of y and the x-y
    # correlation coefficient (off-diagonal entry of the 2x2 matrix).
    stats = (f'$\\mu$ = {np.mean(ys):.2f}\n'
             f'$\\sigma$ = {np.std(ys):.2f}\n'
             f'$r$ = {np.corrcoef(xs, ys)[0, 1]:.2f}')
    bbox = dict(boxstyle = 'round', fc = 'blanchedalmond', ec = 'orange', alpha = 0.5)
    ax.text(0.95, 0.07, stats, fontsize = 9, bbox = bbox,
            transform = ax.transAxes, horizontalalignment = 'right')
    # ---- end of new part ----

plt.show()

What’s special in these scatter plots?
- four data sets, each with eleven points
- nearly identical simple descriptive statistics
- very different when graphed

**Lessons learned:**
- importance of graphing data
- effect of outliers on statistical properties
- statistical properties are important but may not be sufficient