# Lecture 17 - Visualization

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Let's load up some of the data from last lecture and investigate how to visualize it.

In [None]:
# Name the input file once, then read it into the DataFrame used by every cell below.
DATA_PATH = "heart.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Bare expression as the last line of the cell: the notebook renders the loaded table.
df

## Matplotlib

`matplotlib` is the most commonly used plotting tool in the Python ecosystem. It has extensive documentation, and you are encouraged to look through the [Tutorials](https://matplotlib.org/3.3.3/tutorials/index.html).

But before we dive into the technical aspects of making plots, let's first think about what types of plots may be useful in a data science context.


## Histograms

As discussed throughout the course, a dataset typically consists of data points, each of which is a set of observations of the same set of random variables. In other words, a dataset is a table in which each row corresponds to a data point and each column to a random variable. A histogram shows how the values in one such column are distributed.


In [None]:
# No bins argument, so matplotlib picks the binning. The call is the cell's last
# expression, so its return value is echoed along with the figure.
plt.hist(x=df["chol"])

In [None]:
# Same column, now with an explicit request for 50 bins.
plt.hist(x=df["chol"], bins=50)

In [None]:
# Assigning the return value to a throwaway name keeps it out of the cell output.
_ = plt.hist(x=df["chol"], bins=50)

In [None]:
# Boolean row masks that split the table on whether the "cp" column equals 0.
cp_0_selection = df["cp"].eq(0)
cp_not_0_selection = df["cp"].ne(0)

In [None]:
# Two hist calls in one cell draw onto the same axes, overlaying the two groups.
_ = plt.hist(df.loc[cp_0_selection, "chol"], bins=50)
_ = plt.hist(df.loc[cp_not_0_selection, "chol"], bins=50)

In [None]:
# Give each group a label so that the legend can tell them apart.
_ = plt.hist(df.loc[cp_0_selection, "chol"], label="CP 0", bins=50)
_ = plt.hist(df.loc[cp_not_0_selection, "chol"], label="CP not 0", bins=50)
_ = plt.legend()

In [None]:
# Semi-transparent bars in fixed colours, so the overlapping region stays visible.
_ = plt.hist(df.loc[cp_0_selection, "chol"], label="CP 0", color='b', alpha=.5, bins=50)
_ = plt.hist(df.loc[cp_not_0_selection, "chol"], label="CP not 0", color='r', alpha=.5, bins=50)
_ = plt.legend()

In [None]:
# Number of rows in each group (True counts as 1 when summed), one count per line.
print(np.sum(cp_0_selection), np.sum(cp_not_0_selection), sep="\n")

In [None]:
# density=1 normalises each histogram, so groups of different size can be compared.
_ = plt.hist(df.loc[cp_0_selection, "chol"], label="CP 0", color='b', alpha=.5, density=1, bins=50)
_ = plt.hist(df.loc[cp_not_0_selection, "chol"], label="CP not 0", color='r', alpha=.5, density=1, bins=50)
_ = plt.legend()

In [None]:
def compare_distributions(df,column_name,selections,**kwargs):
    """Overlay one histogram of ``column_name`` per entry of ``selections``.

    Parameters
    ----------
    df
        Table indexed as ``df[selection][column_name]``.
    column_name
        Column whose values are histogrammed.
    selections
        Mapping of legend label -> row selector applied to ``df``.
    **kwargs
        Forwarded to ``plt.hist``. ``label`` is supplied by this function.

    Notes
    -----
    When ``bins`` is a bin count (int) or a named rule (str) and the data are
    numeric, the bin edges are computed once from the pooled values of all
    selections and shared by every histogram. Otherwise each group would be
    binned over its own range, and the overlaid bars would not line up.
    An explicit sequence of edges, a missing ``bins``, or non-numeric data is
    passed through to ``plt.hist`` untouched.
    """
    samples = {label: df[selection][column_name]
               for label, selection in selections.items()}

    hist_kwargs = dict(kwargs)
    bins = kwargs.get("bins")
    if samples and isinstance(bins, (int, np.integer, str)):
        pooled = np.concatenate([np.asarray(values).ravel()
                                 for values in samples.values()])
        if np.issubdtype(pooled.dtype, np.number):
            # Drop NaN/inf so the pooled range is well defined.
            pooled = pooled[np.isfinite(pooled)]
            if pooled.size:
                hist_kwargs["bins"] = np.histogram_bin_edges(
                    pooled, bins=bins, range=kwargs.get("range"))

    for label, values in samples.items():
        _=plt.hist(values,label=label,**hist_kwargs)

    _=plt.legend()


In [None]:
# Same comparison as the hand-written cells above, now through the helper.
compare_distributions(
    df,
    "chol",
    {"CP 0": df["cp"].eq(0.), "CP Not 0": df["cp"].ne(0)},
    alpha=0.5,
    density=1,
    bins=50,
)

In [None]:
selection_dict={"CP 0":df["cp"]==0.,"CP Not 0":df["cp"]!=0}

# One separate figure per column of the table.
for column_name in df.columns:

    compare_distributions(df,column_name,
                     selection_dict,
                     alpha=0.5,
                     density=1,
                     bins=50,
                     )
    # Label the axis: otherwise nothing in the output says which column a figure shows.
    plt.xlabel(column_name)
    plt.show()

In [None]:
# Number of columns in the table, i.e. how many panels a grid needs.
df.shape[1]

In [None]:
selection_dict={"CP 0":df["cp"]==0.,"CP Not 0":df["cp"]!=0}

# Derive the grid from the number of columns instead of hard-coding 5x3:
# a fixed grid raises as soon as the table has more than 15 columns.
n_plot_cols = 3
n_plot_rows = max(1, -(-len(df.columns) // n_plot_cols))  # ceiling division

# 3 inches of height per row: (15, 15) for a 5-row grid, as before.
plt.figure(figsize=(15, 3 * n_plot_rows))

for i,column_name in enumerate(df.columns):
    plt.subplot(n_plot_rows,n_plot_cols,i+1)
    # NOTE: each group is drawn by its own hist call inside compare_distributions,
    # so stacked=True does not stack the groups on top of each other.
    compare_distributions(df,column_name,
                     selection_dict,
                     alpha=0.5,
                     density=1,
                     bins=50,
                     stacked=True
                     )
    plt.xlabel(column_name)
plt.show()

## 1-D Plots

In [None]:
# math is still needed for math.pi here and in the following cell.
import math

# 100 evenly spaced points from -2*pi to 2*pi.
x = np.linspace(-2.*math.pi,2.*math.pi,100)
# np.sin evaluates the whole array at once; no Python-level map over math.sin is needed.
y = np.sin(x)

_=plt.plot(x,y)

In [None]:
# 100 evenly spaced points from -2*pi to 2*pi.
x = np.linspace(-2.*math.pi,2.*math.pi,100)
# Vectorised NumPy functions instead of mapping math.sin / math.cos element by element.
y_1 = np.sin(x)
y_2 = np.cos(x)

# Two plot calls in one cell share the same axes; labels feed the legend.
_=plt.plot(x,y_1,label="sin")
_=plt.plot(x,y_2,label="cos")

_=plt.legend()

In [None]:
# Show the column labels available for plotting.
df.columns

In [None]:
# Line plot of one column against another: points are joined in row order.
_ = plt.plot(df.loc[:, "thalach"], df.loc[:, "chol"])

## 2-D Plots

In [None]:
# Same two columns as unconnected markers.
_ = plt.scatter(x=df["thalach"], y=df["chol"])

In [None]:
selection_dict = {
    "CP 0": df["cp"].eq(0.),
    "CP Not 0": df["cp"].ne(0),
}

# One scatter call per group; all calls draw onto the same axes.
for label, selection in selection_dict.items():
    _ = plt.scatter(
        df.loc[selection, "thalach"],
        df.loc[selection, "chol"],
        label=label,
    )

_ = plt.xlabel("thalach")
_ = plt.ylabel("chol")

_ = plt.legend()


In [None]:
# Legend label -> boolean row mask, splitting the table on whether "cp" equals 0.
selection_dict = {
    "CP 0": df["cp"].eq(0.),
    "CP Not 0": df["cp"].ne(0),
}

def compare_scatter(df, x_var_name, y_var_name, selections, make_legend=True):
    """Scatter ``y_var_name`` against ``x_var_name``, one series per selection.

    Parameters
    ----------
    df
        Table indexed as ``df[selection][column]``.
    x_var_name, y_var_name
        Columns plotted on the x and y axes; also used as the axis labels.
    selections
        Mapping of legend label -> row selector applied to ``df``.
    make_legend
        When true (the default), a legend is drawn after the points.
    """
    for label in selections:
        # Filter the rows once per group and read both columns from the result.
        subset = df[selections[label]]
        plt.scatter(subset[x_var_name], subset[y_var_name], label=label)

    plt.xlabel(x_var_name)
    plt.ylabel(y_var_name)

    if not make_legend:
        return
    plt.legend()



In [None]:
# Same figure as the hand-written scatter cell, now through the helper.
compare_scatter(df, x_var_name="thalach", y_var_name="chol", selections=selection_dict)

In [None]:
# Every ordered pair of columns gets its own figure, including each column
# plotted against itself.
for x_var_name in df.columns:
    for y_var_name in df.columns:
        compare_scatter(
            df,
            x_var_name=x_var_name,
            y_var_name=y_var_name,
            selections=selection_dict,
        )
        plt.show()


In [None]:
# Restrict the grid to the five columns at positions 1..5.
columns = df.columns[1:6]
n_columns = len(columns)
plt.figure(figsize=(15, 15))

plot_i = 0
for i, x_var_name in enumerate(columns):
    for j, y_var_name in enumerate(columns):
        # Subplot numbers are 1-based and run row by row.
        plot_i = i * n_columns + j + 1
        plt.subplot(n_columns, n_columns, plot_i)
        # Only the first panel carries a legend.
        make_legend = (i == 0 and j == 0)
        compare_scatter(
            df,
            x_var_name,
            y_var_name,
            selection_dict,
            make_legend=make_legend,
        )



In [None]:
# Pair plot: histograms on the diagonal, scatter plots everywhere else.

columns = df.columns[1:6]
n_columns=len(columns)
plt.figure(figsize=(15,15))

plot_i=0
for i,x_var_name in enumerate(columns):
    for j,y_var_name in enumerate(columns):
        plot_i+=1
        plt.subplot(n_columns,n_columns,plot_i)
        if i==j:
            compare_distributions(df,x_var_name,
                     selection_dict,
                     alpha=0.5,
                     density=1,
                     bins=50,
                     )
            # Label the diagonal panels too; compare_scatter labels the others.
            plt.xlabel(x_var_name)
        else:
            # Panel 1 is always on the diagonal, so the old `plot_i==1` legend flag
            # was never true for a scatter panel. compare_distributions already
            # draws a legend on every diagonal panel.
            compare_scatter(df,x_var_name,y_var_name,selection_dict,make_legend=False)

In [None]:
import seaborn as sns


In [None]:
# seaborn builds the whole pair-plot grid in one call, coloured by the "cp" column.
sns.pairplot(data=df, hue="cp")