# Dispersion Measures

This notebook is intended to provide a brief introduction to the dispersion measures (range, standard deviation and variance).

In statistics, measures of dispersion help to interpret the variability of data, i.e. to know how homogeneous or heterogeneous the data is. In simple terms, they show how squeezed or scattered the variable is.

## Range

### Definition
The difference between the highest and lowest values. It gives you a rough idea of how widely the values of the data set are spread before you look at the data in detail.

### Formula / Procedure to find it

The equation for the range is simple: $\displaystyle R = x_{max} - x_{min}$ <br>
It is the subtraction of the minimum value of the dataset from the maximum value.

### Visualization

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

### Creating the data set

In [None]:
# Sample data shared by every section of this notebook: 100 integer
# observations, each between 0 and 10 (inclusive).
data_set = [5,  2,  6, 10,  9,  3,  1,  9,  6,  2,  1,  4,  6,  5,  0,  4,  6,
            5,  4,  6,  3,  0,  2,  8,  6,  7,  8,  0,  4,  7,  0,  1,  0,  8,
            2,  0, 10,  2,  6,  6,  0,  5,  2,  0, 10,  3,  9,  8,  4,  7,  6,
            1, 10,  7, 10,  3,  0,  6,  5,  8,  4,  3,  7,  3,  1,  5,  3,  0,
            3,  3,  1,  2,  1,  5,  0,  5,  8,  1, 10,  7,  8,  6,  9,  3,  3,
            7,  3,  4, 10,  8,  2,  0,  0,  2,  9,  0,  5,  5,  5,  6]

The range can be easily visualized with two points. A 1D or 2D plot should be enough to visualize the space in which the dataset exists. Also a boxplot would be useful as it shows the limits and some quartiles of the data.

In [None]:
# Plot every observation against its position in the list.
raw_trace = go.Scatter(y=data_set, mode='markers+lines')
fig = go.Figure(data=raw_trace)
fig.update_layout(title_text="Distribution of data set", width=1000, height=600)
fig.show()

In [None]:
# Show the range two ways: as horizontal min/max guides over the sequence plot
# (row 1) and as the two extreme points on a 1-D strip (row 2).
lowest = np.min(data_set)
highest = np.max(data_set)
# The guides span the whole sequence whatever its length, instead of a fixed 0-99.
x_span = [0, len(data_set) - 1]

fig = make_subplots(rows=2, cols=1)

# Row 1: the observations in order, with the minimum and maximum as guides.
fig.add_trace(go.Scatter(y=data_set, name="2D data_set"), row=1, col=1)
fig.add_trace(go.Scatter(x=x_span, y=[lowest, lowest], name="2D min"), row=1, col=1)
fig.add_trace(go.Scatter(x=x_span, y=[highest, highest], name="2D max"), row=1, col=1)

# Row 2: the same values projected onto a single axis (y fixed at 0).
fig.add_trace(
    go.Scatter(x=data_set, y=np.zeros(len(data_set)), mode='markers', name="1D data_set"),
    row=2, col=1
)
fig.add_trace(go.Scatter(x=[lowest], y=[0], mode='markers', name="1D min"), row=2, col=1)
fig.add_trace(go.Scatter(x=[highest], y=[0], mode='markers', name="1D max"), row=2, col=1)

# This figure illustrates the range, so the title reports the range and its
# two endpoints rather than the mean.
fig.update_layout(
    height=600, width=1000,
    title_text=f"Distribution of data set - Range: {highest - lowest} (Min: {lowest}, Max: {highest})"
)
fig.show()

In [None]:
# Horizontal boxplot of the data with its minimum and maximum marked in red.
lowest = np.min(data_set)
highest = np.max(data_set)

plt.figure(figsize=(16, 4))
# The second positional parameter of boxplot is `notch`, so the former 'h'
# argument acted as a truthy notch flag; it is now spelled out explicitly.
plt.boxplot(data_set, notch=True, vert=False)
for extreme in (lowest, highest):
    plt.vlines(x=extreme, ymin=0.9, ymax=1.1, colors='r')
plt.ylim([0.9, 1.1])
plt.title(f"Boxplot of data set - Min: {lowest} - Max: {highest}")
plt.show()

Note: Good representations of the RANGE are 1D plots, 2D plots and boxplots.

## Standard Deviation

### Definition
The standard deviation is a measure of the amount of variation or dispersion of a set of values. A low standard deviation indicates that the values tend to be close to the mean (also called the expected value) of the set, while a high standard deviation indicates that the values are spread out over a wider range.

One common usage of the standard deviation is the concept of 1, 2 and 3 sigma ($\sigma$), which represents the amount of control in a certain process: it is the range of data that lies within 1, 2 or 3 standard deviations of the mean (mean $\pm\, k\sigma$).

### Formula / Procedure to find it

To find the standard deviation, use the following equation: $\displaystyle \sigma = \sqrt{\frac{\sum\mid x - \bar{x}\mid^2}{n}}$

### Creating the data set

We will be using the same dataset (previously created).

In [None]:
# Re-draw the raw observations (value against position) as a reference
# for the standard-deviation plots.
layout_options = dict(height=600, width=1000, title_text="Distribution of data set")
fig = go.Figure(data=go.Scatter(y=data_set, mode='markers+lines'))
fig.update_layout(**layout_options)
fig.show()

In [None]:
# Overlay the band mean ± 1 standard deviation on the sequence plot.
mean = np.mean(data_set)
std = np.std(data_set)  # computed once and reused for both guides and the title
# The guides span the whole sequence whatever its length, instead of a fixed 0-99.
x_span = [0, len(data_set) - 1]

fig = make_subplots(rows=1, cols=1)

fig.add_trace(go.Scatter(y=data_set, name="data_set"), row=1, col=1)

# Lower guide first, then upper guide.
for bound in (mean - std, mean + std):
    fig.add_trace(
        go.Scatter(x=x_span, y=[bound, bound], name="1 Standard Deviation"),
        row=1, col=1
    )

fig.update_layout(height=600, width=1000,
                  title_text=f"Distribution of data set - Standard Deviation: {std}")
fig.show()

In [None]:
# Mark the 1-, 2- and 3-sigma bands around the mean on a density plot (top)
# and on a horizontal boxplot (bottom).
mean = np.mean(data_set)
std = np.std(data_set)
# (multiple of sigma, line colour) for each band.
sigma_bands = [(1, 'r'), (2, 'g'), (3, 'b')]

plt.figure(figsize=(16, 8))

plt.subplot(2, 1, 1)
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the plot stays the same on the seaborn version in use.
sns.distplot(data_set, label="Distribution")
for multiple, colour in sigma_bands:
    # Each line carries its own label. A positional label list passed to
    # plt.legend() is matched to artists in creation order, which would hand
    # the first labels to the distplot artists instead of these lines.
    plt.vlines(x=mean - multiple * std, ymin=0, ymax=0.12, colors=colour,
               label=f"-{multiple} Std")
    plt.vlines(x=mean + multiple * std, ymin=0, ymax=0.12, colors=colour,
               label=f"+{multiple} Std")
plt.ylim([0, 0.12])
plt.legend()
plt.title(f"Distribution of data set - Standard Deviation: {std}")

plt.subplot(2, 1, 2)
# The second positional parameter of boxplot is `notch`, so the former 'h'
# argument acted as a truthy notch flag; it is now spelled out explicitly.
plt.boxplot(data_set, notch=True, vert=False)
for multiple, colour in sigma_bands:
    for bound in (mean - multiple * std, mean + multiple * std):
        plt.vlines(x=bound, ymin=0.9, ymax=1.1, colors=colour)
plt.ylim([0.9, 1.1])
plt.title(f"Boxplot of data set - Standard Deviation: {std}")
plt.show()

Note: Distplots and boxplots are a good way to observe the STANDARD DEVIATION.

## Variance

### Definition
Variance is the expectation of the squared deviation of a random variable from its mean. Informally, it measures how far a set of numbers is spread out from their average value. Variance is important to get the normal distribution of a dataset. It is hard to use variance in most applications, so we use standard deviation. But variance has some advantages in the field of math theory. Also for the correlation of variables it is fundamental.

### Formula / Procedure to find it

To find the variance, use the following equation: $\displaystyle \sigma^2 = \frac{\sum\mid x - \bar{x}\mid^2}{n}$

We will be using the first data set (the one used in RANGE and STANDARD DEVIATION section).<br>

In [None]:
# Raw observations once more (value against position), this time as a
# reference for the variance plots.
fig = go.Figure()
fig.add_trace(go.Scatter(y=data_set, mode='markers+lines'))
fig.update_layout(height=600, width=1000, title_text="Distribution of data set")
fig.show()

In [None]:
# Overlay the band mean ± variance on the sequence plot.
mean = np.mean(data_set)
variance = np.var(data_set)  # computed once and reused for both guides and the title
# The guides span the whole sequence whatever its length, instead of a fixed 0-99.
x_span = [0, len(data_set) - 1]

fig = make_subplots(rows=1, cols=1)

fig.add_trace(go.Scatter(y=data_set, name="data_set"), row=1, col=1)

# Lower guide first, then upper guide.
for bound in (mean - variance, mean + variance):
    fig.add_trace(
        go.Scatter(x=x_span, y=[bound, bound], name="Variance"),
        row=1, col=1
    )

fig.update_layout(height=600, width=1000,
                  title_text=f"Distribution of data set - Variance: {variance}")
fig.show()

In [None]:
# Mark mean ± variance on a density plot (top) and on a horizontal boxplot
# (bottom). The lines are drawn at the variance, so the legend and titles
# report the variance (they used to say "Std" / "Standard Deviation").
mean = np.mean(data_set)
variance = np.var(data_set)

plt.figure(figsize=(16, 8))

plt.subplot(2, 1, 1)
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the plot stays the same on the seaborn version in use.
sns.distplot(data_set, label="Distribution")
# Labels are attached to the artists themselves so the legend cannot pair a
# label with the wrong artist.
plt.vlines(x=mean - variance, ymin=0, ymax=0.12, colors='r', label="Mean - Variance")
plt.vlines(x=mean + variance, ymin=0, ymax=0.12, colors='r', label="Mean + Variance")
plt.ylim([0, 0.12])
plt.legend()
plt.title(f"Distribution of data set - Variance: {variance}")

plt.subplot(2, 1, 2)
# The second positional parameter of boxplot is `notch`, so the former 'h'
# argument acted as a truthy notch flag; it is now spelled out explicitly.
plt.boxplot(data_set, notch=True, vert=False)
for bound in (mean - variance, mean + variance):
    plt.vlines(x=bound, ymin=0.9, ymax=1.1, colors='r')
plt.ylim([0.9, 1.1])
plt.title(f"Boxplot of data set - Variance: {variance}")
plt.show()

You can observe that the variance characterises the width of a normal distribution, as it theoretically determines how spread out the values can be for a given distribution.

## Exercise

You can test what you have learned in this introduction to dispersion measures next:

In [None]:
class test:
    """Interactive numeric quiz.

    Holds parallel lists of questions and expected answers, asks the
    questions on standard input, and reports the result as a percentage.
    """

    def __init__(self):
        # questions[i] is answered by answers[i]; the two lists always have
        # the same length.
        self.questions = []
        self.answers = []
        # Number of correct answers in the most recent run of test().
        self.correct_answers = 0
        # Percentage (0-100) obtained in the most recent run of test().
        self.score = 0

    def add_element(self, q, a):
        """Append question ``q`` and its expected answer ``a`` to the quiz."""
        self.questions.append(q)
        self.answers.append(a)

    def remove_element(self, index):
        """Remove the question/answer pair at 0-based position ``index``.

        Raises:
            IndexError: if ``index`` is out of range.
        """
        self.questions.pop(index)
        self.answers.pop(index)

    def show_answer(self, index):
        """Print question number ``index`` (1-based) together with its answer.

        Raises:
            IndexError: if ``index`` is not between 1 and the number of
                questions.
        """
        # Without this check, index 0 (or a negative one) would wrap around
        # and silently print a question from the end of the list.
        if not 1 <= index <= len(self.questions):
            raise IndexError(
                f"question number must be between 1 and {len(self.questions)}"
            )
        print(f"Q{index}: {self.questions[index-1]} - Ans_{index}: {self.answers[index-1]}")

    def show_answers(self):
        """Print every question with its answer, numbered from 1."""
        for number, (q, a) in enumerate(zip(self.questions, self.answers), start=1):
            print(f"Q{number}: {q} - Ans_{number}: {a}")

    def build_from_csv(self, filename):
        """Append the question/answer pairs found in a CSV file.

        Args:
            filename: anything ``pandas.read_csv`` accepts; the data must
                have a ``Questions`` and an ``Answers`` column.
        """
        df = pd.read_csv(filename)
        # Walk the two columns in lockstep, in row order.
        for q, a in zip(df['Questions'], df['Answers']):
            self.add_element(q, a)

    def visualize_score(self):
        """Show a donut chart of the correct vs. incorrect percentage."""
        fig = go.Figure(data=[go.Pie(labels=["Correct", "Incorrect"],
                                     values=[self.score, 100-self.score],
                                     marker_colors=['rgb(10,100,10)', 'rgb(230,70,70)'],
                                     hole=.3)])
        fig.show()

    @staticmethod
    def _ask_number(prompt):
        """Prompt until a number is entered; return it rounded to 2 decimals."""
        while True:
            raw = input(prompt)
            if not raw.strip():
                # Blank input: ask the same question again.
                continue
            try:
                return np.round(float(raw), 2)
            except ValueError:
                # Non-numeric text used to abort the whole quiz; re-ask instead.
                print("Please enter a number.")

    def test(self):
        """Run the quiz on standard input and return the score.

        Each answer is rounded to two decimals and compared with the
        expected answer; "Correct" or "Incorrect" is printed after every
        question. Blank or non-numeric input re-asks the same question.

        Returns:
            The percentage of correct answers (0-100), also stored in
            ``self.score``. An empty quiz scores 0.
        """
        self.correct_answers = 0
        for number, (q, a) in enumerate(zip(self.questions, self.answers), start=1):
            given = self._ask_number(f"Q{number}: " + q)
            if a == given:
                self.correct_answers += 1
                print("Correct")
            else:
                print("Incorrect")

        total = len(self.questions)
        # Guard the division so an empty quiz yields 0 instead of NaN.
        self.score = 100 * self.correct_answers / total if total else 0

        print(f"Your score: {self.score}")
        self.visualize_score()
        return self.score

In [None]:
# Create an empty quiz and fill it from the remote CSV, which must provide
# the "Questions" and "Answers" columns.
QUIZ_CSV_URL = "https://raw.githubusercontent.com/Ricardo-DG/data_analytics_training/main/dispersion_test.csv"
exam = test()
exam.build_from_csv(QUIZ_CSV_URL)

In [None]:
# If you would like to see the answers uncomment and run the following line

# exam.show_answers()

In [None]:
# If you would like to see a specific answer uncomment and run the following line
# (make sure to replace "index" with the number of the question you want to know the answer).

# exam.show_answer(index)

In [None]:
# Run the quiz interactively. test() stores the resulting percentage on the
# instance, so read it from there rather than relying on a return value.
exam.test()
score = exam.score