# Examples - Visualisation with Pandas

In [None]:
# imports

import pandas as P

## Loading and preprocessing data

In [None]:
# Readable column labels, listed in the order in which they are handed
# to the reader below.
column_names = [
    'age',
    'sex',
    'pain',
    'blood_pressure',
    'cholesterol',
    'blood_sugar',
    'resting_ECG',
    'max_heart_rate',
    'induced_angina',
    'induced_ST_depression',
    'slope',
    'coloured_vessels',
    'thal',
    'disease',
]

# Load the 'Heart Disease' data set from a space-separated text file,
# supplying the labels above as the column names.
heart = P.read_csv('heart.dat', sep=" ", names=column_names)

In [None]:
# Readable labels for every categorical variable.  The labels of a column
# are applied by position to that column's categories, so each tuple must
# contain exactly one label per category.
categorical = dict(
    sex=('female', 'male'),
    pain=('typical', 'atypical', 'non-anginal', 'asymptomatic'),
    blood_sugar=('no', 'yes'),
    resting_ECG=('normal', 'abnormality', 'hypertrophy'),
    induced_angina=('no', 'yes'),
    thal=('normal', 'fixed', 'reversable'),
    slope=('upsloping', 'flat', 'downsloping'),
    disease=('no', 'yes'),
)

# Turn each of these columns into a categorical one and replace its
# categories with the readable labels.
for column, labels in categorical.items():
    as_category = heart[column].astype('category')
    heart[column] = as_category.cat.rename_categories(labels)

# 'coloured vessels' holds counts, so keep it as integers
heart['coloured_vessels'] = heart['coloured_vessels'].astype('int')

# preview the first few observations
heart.head()

## Bar plots

In [None]:
# Bar chart of how often each value of 'pain' (a nominal variable) occurs.
heart['pain'].value_counts().plot(kind='bar', title="Pain type",
                                  ylabel="Frequency")

In [None]:
# a bar plot of 'coloured vessels' (discrete quantitative)
# value_counts() orders its result by frequency, not by value; sorting by
# the index puts the bars in increasing order of the vessel count, which
# is the natural x-axis order for a quantitative variable.
counts = heart.coloured_vessels.value_counts().sort_index()
counts.plot.bar(title="Coloured vessel count", ylabel="Frequency")

In [None]:
# a table of the frequencies of 'sex'-'pain' combinations
# (rows: values of 'sex', columns: values of 'pain')
# Both keys are categorical.  observed=False is passed explicitly so that
# every combination of categories gets a cell (with count 0 if it never
# occurs) regardless of the pandas version's default for this parameter.
counts = heart.groupby(['sex', 'pain'], observed=False).size().unstack()
counts

In [None]:
# Grouped bar chart of the table: the bars belonging to one row are drawn
# next to each other.
counts.plot(kind='bar', title="Pain type", ylabel="Frequency")

In [None]:
# Stacked bar chart of the same table: the bars belonging to one row are
# drawn on top of each other.
counts.plot(kind='bar', stacked=True, title="Pain type", ylabel="Frequency")

In [None]:
# since the groups are of different sizes, the plot of
# relative frequencies may be more informative

# totals by value of 'sex' (i.e. each row summed across its columns)
totals = counts.sum(axis=1)
# relative frequencies per value of 'sex'
# (i.e. divide each element by the total that corresponds to its row name)
relative_counts = counts.div(totals, axis=0)
# the plotted values are proportions within each row, so the axis is
# labelled accordingly rather than as a plain frequency
relative_counts.plot.bar(title="Pain type", ylabel="Relative frequency")

## Histograms

In [None]:
# Histogram of 'age'.  No `bins` argument is passed, so the number of
# intervals is whatever pandas uses by default.
heart['age'].plot(kind='hist', title="Age", xlabel="Age")

In [None]:
# The same histogram with only 5 intervals (a coarser view).
heart['age'].plot(kind='hist', bins=5, title="Age", xlabel="Age")

In [None]:
# The same histogram with 35 intervals (a finer view).
heart['age'].plot(kind='hist', bins=35, title="Age", xlabel="Age")

# Note how the level of detail (and noise) changes with the number
# of intervals.
# Note also how the finer histogram reveals the gaps in the
# observed ages.

In [None]:
# a histogram of 'age' for each value of 'sex'
# The histograms are semi-transparent (alpha) and share the same bins
# (11 intervals over the range 25-80), so they can be compared directly.
# 'sex' is categorical, so `observed` is given explicitly instead of
# relying on the pandas version's default; the key is passed as a plain
# string, as in the grouped box plot.  Title and x-axis label follow the
# other histograms of 'age'.
heart.groupby('sex', observed=False).age.plot.hist(
    bins=11, range=(25, 80), alpha=0.5, legend=True,
    title="Age by sex", xlabel="Age")

## Box plots

In [None]:
# Box plot of 'age' computed over all observations.
heart['age'].plot(kind='box', title="Age", xlabel="Age")

In [None]:
# a box plot of 'age' for each value of 'sex'
# 'sex' is categorical, so `observed` is given explicitly instead of
# relying on the pandas version's default for categorical group keys.
heart.groupby('sex', observed=False).boxplot(column='age', grid=False)

## Scatter plots

In [None]:
# Scatter plot with 'blood_pressure' on the x-axis and 'cholesterol'
# on the y-axis, drawn on an 8x8 figure.
heart.plot(
    kind='scatter',
    x='blood_pressure',
    y='cholesterol',
    title="Blood pressure vs. serum cholesterol",
    xlabel="Blood pressure",
    ylabel="Serum cholesterol (mg/dl)",
    figsize=(8, 8),
)

In [None]:
# Matrix of scatter plots for all pairs of quantitative variables.
# The result is bound to a throwaway name only so that the long return
# value is not echoed as output; the plot itself is still drawn.
_scatter_axes = P.plotting.scatter_matrix(frame=heart, figsize=(12, 12))

# Age seems to be slightly (linearly) correlated with maximum
# heart rate. This is difficult to see, though, because there
# is a lot of variation.