In [32]:
'''
Import here useful libraries
Run this cell first for convenience
'''
# Import here useful libraries
import numpy as np
from scipy import stats
import scipy
import warnings
from sympy import symbols
from sympy import integrate
warnings.simplefilter('ignore', DeprecationWarning)

# Chapter 2 - Random Variables

## Discrete Random Variable

### Definition of a Random Variable

- Random variable $X$: mapping from sample space $S$ to a real line $R$
- Numerical value $X(w)$ mapped to each outcome $w$ of a particular experiment

### Probability Mass Function

- Probability Mass Function (p.m.f.): set of probability values $p_i$ assigned to each value taken by the discrete random variable $x_i$
- $ 0 \leq p_i \leq 1 \text{ and } \sum_i p_i = 1$
- Probability: $P(X = x_i) = p_i$

### Cumulative Distribution Function

- Cumulative Distribution Function (CDF): $F(x) = P(X \leq x)$

## Continuous Random Variables

### Probability Density Function

- Probability Density Function (pdf): 
\begin{equation}
    f(x) \geq 0
\end{equation}
\begin{equation}
    \int_{-\infty}^{\infty} f(x) dx = 1
\end{equation}


In [3]:
# Verify if provided function is a probability density function
# Parameters
a = 2
b = 1
x_i = 49.5 # Integration from x_i to x_f
x_f = 50.5
from scipy.integrate import quad
def integrand(x, a, b):
    return 1.5 - 6*(x - 50)**2
I = quad(integrand, x_i, x_f, args=(a,b))
print("Result and error: ", I)

Result and error:  (0.9999999999999989, 1.1102230246251553e-14)


### Cumulative Distribution Function


- Cumulative Distribution Function for continuous Random Variables:
    - $F(x) = \int_{-\infty}^x f(y) dy$
    - $f(x) = \frac{dF(x)}{dx}$
    - $P(a < X < b) = F(b) - F(a)$
    - $P(a < X < b) = P(a \leq X \leq b) = P(a \leq X < b) = P(a < X \leq b)$

## Expectation of a Random Variable

### Expectations of Discrete Random Variables

- Expectation of a discrete random variable $X$ with p.m.f. $p$:
\begin{equation}
    E(X) = \sum_i p_i x_i
\end{equation}

In [4]:
from scipy.stats import rv_discrete

x = [10, 20, 30]
p = [0.2, 0.3, 0.5]
distribution = rv_discrete(values=(x, p))
print("Expected value: ", distribution.expect())

Expected value:  23.0


### Expectation of a Continuous Random Variable

- Expectation of a continuous random variable with p.d.f. $f(x)$
\begin{equation}
    E(X) = \int_{- \infty}^{\infty} xf(x)dx
\end{equation}

In [5]:
from scipy.stats import rv_continuous
a = 49.5 # lower bound
b = 50.5 # upper bound
class distribution_gen(rv_continuous):
    def _pdf(self, x):
        return 1.5 - 6*(x - 50)**2
distribution = distribution_gen(name='Continuous distribution')
print("Expected value: ", distribution.expect(lambda x: 1, lb=a, ub=b))

Expected value:  0.9999999999999989


### Symmetric Random Variables

- Symmetric Random Variables: if $p(x)$ is symmetric around a point $\mu$ so that:
\begin{equation}
    f(\mu + x) = f(\mu -x)
\end{equation}
- In this case, $E(X) = \mu$ is the point of symmetry

### Medians of Random Variables

- Median: for a random variable $X$ its median is the value $x$ such that $F(x) = 0.5$

In [6]:
from scipy.stats import rv_continuous
a = 49.5 # lower bound
b = 50.5 # upper bound
class distribution_gen(rv_continuous):
    def _pdf(self, x):
        return 1.5 - 6*(x - 50)**2
distribution = distribution_gen(a=a, b=b)
print("Median: ", distribution.median())

Median:  50.0


## Variance of a Random Variable

### Definition and Interpretation of Variance 

- Variance ($ \sigma ^2$): $Var(X) = E(X - E(X))^2 = E(X^2) - \mu ^2$
- Positive quantity measuring the spread of the distribution about its mean value
- Standard Deviation($\sigma$): $\sqrt{Var(x)}$

In [8]:
from scipy.stats import rv_discrete

x = [10, 20, 30]
p = [0.2, 0.3, 0.5]
distribution = rv_discrete(values=(x, p))
print("Variance: ", distribution.var())
print("Standard Deviation: ", distribution.std())

Variance:  61.0
Standard Deviation:  7.810249675906654


In [44]:
from scipy.stats import rv_continuous
a = 49.5 # lower bound
b = 50.5 # upper bound
class distribution_gen(rv_continuous):
    def _pdf(self, x):
        return 1.5 - 6*(x - 50)**2
distribution = distribution_gen(a=a, b=b)
print("Variance: ", distribution.var())
print("Standard Deviation: ", distribution.std())

Variance:  0.049999999999272404
Standard Deviation:  0.22360679774835202


In [66]:
## Why isn't the result correct?
from scipy.stats import rv_continuous
from math import exp
a = 0 # lower bound
b = 10 # upper bound
class distribution_gen(rv_continuous):
    def _pdf(self, x):
        return (( (exp(10 -(x)) -1)/(exp(10)-11) * x))
distribution = distribution_gen(a=a, b=b)
print("Variance: ", distribution.var())
print("Standard Deviation: ", distribution.std())

Variance:  2.0423852667674502
Standard Deviation:  1.4291204521549086


In [67]:
## Looks like it works well with polynomials
from scipy.stats import rv_continuous
from math import exp
a = 5 # lower bound
b = 6 # upper bound
class distribution_gen(rv_continuous):
    def _pdf(self, x):
        return ( 2/11 * x)
distribution = distribution_gen(a=a, b=b)
print("Variance: ", distribution.var())
print("Standard Deviation: ", distribution.std())

Variance:  0.08310376492194038
Standard Deviation:  0.28827723621878365


### Chebyshev's Inequality

- Chebyshev's Inequality: if $X$ is a random variable with mean $\mu$ and variance $\sigma ^2$ the following holds:
\begin{equation}
    P(\mu -c \sigma \leq X \leq \mu + c \sigma) \geq 1 - \frac{1}{c^2} \text{ for } c \geq 1
\end{equation}

### Quantiles of Random Variables

- Quantiles of Random Variables: $p$-th quantile $x$ of a random variable $X$ is
\begin{equation}
    F(x) = p
\end{equation}
- Upper quartile ($Q_3$): 75th percentile of the distribution
- Lower quartile ($Q_1$): 25th percentile of the distribution
- Interquantile range (IQR): distance between the two quartiles, $Q_3 - Q_1$

In [None]:

'''
from scipy.stats import rv_continuous
a = 49.5 # lower bound
b = 50.5 # upper bound
class distribution_gen(rv_continuous):
    def _pdf(self, x):
        return 1.5 - 6*(x - 50)**2 # Function to return
distribution = distribution_gen(a=a, b=b)
print("Variance: ", distribution.qu())
print("Standard Deviation: ", distribution.std())
'''

## Jointly Distributed Random Variables

### Joint Probability Distributions

- Discrete: 
\begin{equation}   
    P(X = x_i, Y = y_j) = p_{ij} \geq 0 \text{ satisfying } \sum_i \sum_j p_{ij} = 1
\end{equation}

- Continuous: 
\begin{equation}
    f(x,y) \geq 0 \text{ satisfying } \int \int f(x,y) dxdy= 1
\end{equation}



- Joint Cumulative Distribution Function: 
\begin{equation}   
    F(x,y) = P(X \leq x_j, Y \leq y_j)
\end{equation}
    - Discrete:
    \begin{equation}   
        F(x,y) = \sum_{i:x_i \leq x} \sum_{j:y_j \leq y} p_{ij}
    \end{equation}
    - Continuous:
    \begin{equation}   
        F(x,y) = \int_{- \infty}^{ y} \int_{- \infty}^{ x} f(w, z) dwdz
    \end{equation}

### Marginal Probability Distributions

- Marginal probability distribution: obtained by summing or integrating the joint probability distribution over the values of the other random variable
    - Discrete:
    \begin{equation}   
        P(X = x_i) = p_{i+} = \sum_j p_{ij}
    \end{equation}
    - Continuous:
    \begin{equation}   
        f_X(x) = \int_{- \infty}^{\infty} f(x,y)dy
    \end{equation}

### Conditional Probability Distributions

- Probability distribution describing the properties of a random variable $X$ given knowledge of $Y$
    - Discrete:
    \begin{equation}   
        f_{X \mid Y}(x_i \mid y_i) = P(X = x_i \mid Y = y_j) = \frac{p_{ij}}{p_{+j}}
    \end{equation}
    - Continuous:
    \begin{equation}   
        f_{X \mid Y}(x \mid y) = \frac{f(x,y)}{f_Y(y)}
    \end{equation}

### Computation of E(g(X,Y))

- Given $g(x,y)$ function of $x$ and $y$, we have that:
    - Discrete:
    \begin{equation}   
        E(g(X,Y)) = \sum_{x,y} g(x,y)f(x,y)
    \end{equation}
    - Continuous:
    \begin{equation}   
         E(g(X,Y)) = \int_{- \infty}^{\infty} \int_{- \infty}^{\infty} g(x,y)f(x,y)dxdy
    \end{equation}

### Independence and Covariance

- Independence: when two random variables $X$ and $Y$ satisfy:
\begin{equation}
    f(x,y) = f_X(x)f_Y(y) \text{ for all } x \text{ and } y
\end{equation}

- Covariance: $Cov(X,Y) = E(X - E(X))(Y - E(Y)) = E(XY) -E(X)E(Y)$
- May take a positive or negative value
- Independent random variables have a covariance of zero, but the contrary is not always true

- Correlation ($\rho_{XY}$):
    \begin{equation}
        Corr(X,Y) = \frac{Cov(X,Y)}{\sqrt{Var(X)Var(Y)}}
    \end{equation}
- $-1 \leq \rho_{XY} \leq 1$
- The correlation is invariant to linear transformations of $X$ and $Y$

In [30]:
# Calculate the Covariance for discrete random variables

# Input X and Y are in a table with corresponding probabilities
value_x = np.array([1, 2, 3])
value_y = np.array([1, 2, 3, 4])
prob_matrix = np.array([[0.12, 0.08, 0.07, 0.05], [0.08, 0.15, 0.21, 0.13], [0.01, 0.01, 0.02, 0.07]]) # Covariance matrix

# Expectation of x
exp_x =  0
for i in range(len(value_x)):
    exp_x += value_x[i] * np.sum(prob_matrix, axis=1)[i]
print("Expectation of x: ", exp_x)

# Expectation of y
exp_y = 0
for i in range(len(value_y)):
    exp_y += value_y[i] * np.sum(prob_matrix, axis=0)[i]
print("Expectation of y: ", exp_y)
    
# Variance of x
exp_x2 = 0
for i in range(len(value_x)):
    exp_x2 += (value_x[i] ** 2) * np.sum(prob_matrix, axis=1)[i]
var_x = exp_x2 - (exp_x ** 2)
print("Variance of x: ", var_x)

# Variance of y
exp_y2 = 0
for i in range(len(value_y)):
    exp_y2 += (value_y[i] ** 2) * np.sum(prob_matrix, axis=0)[i]
var_y = exp_y2 - (exp_y ** 2)
print("Variance of y: ", var_y)

# Covariance
exp_xy = 0
for i in range(len(value_x)):
    for j in range(len(value_y)):
        exp_xy += value_x[i] * value_y[j] * prob_matrix[i, j]
cov = exp_xy - exp_x * exp_y
print("Covariance: ", cov)

# Correlation
corr = cov / ((var_x * var_y) ** (1/2))
print("Correlation:  ", corr)        

Expectation of x:  1.79
Expectation of y:  2.59
Variance of x:  0.3858999999999999
Variance of y:  1.161900000000001
Covariance:  0.22389999999999954
Correlation:   0.33437386749732556


In [34]:
# Covariance for continuous random variables
x = symbols('x')
y = symbols('y')

# Inputs
func = 8 * x * y - 2 * x * (y ** 2)
domain_x = (0, 1)
domain_y = (1, 2)

# Expectation of x
exp_x = float(integrate(x * integrate(func, (y, domain_y[0], domain_y[1])), (x, domain_x[0], domain_x[1])))
print("Expectation of x: ", exp_x)

# Expectation of y
exp_y = float(integrate(y * integrate(func, (x, domain_x[0], domain_x[1])), (y, domain_y[0], domain_y[1]), (y, domain_y[0], domain_y[1])))
print("Expectation of y: ", exp_y)

# Variance of x
exp_x2 = float(integrate(x * x * integrate(func, (y, domain_y[0], domain_y[1])), (x, domain_x[0], domain_x[1])))
var_x = exp_x2 - exp_x ** 2
print("Variance of x: ", var_x)

# Variance of y
exp_y2 = float(integrate(y * y * integrate(func, (x, domain_x[0], domain_x[1])), (y, domain_y[0], domain_y[1])))
var_y = exp_y2 - exp_y ** 2
print("Variance of y: ", var_y)

# Covariance
exp_xy = float(integrate(x * y * func, (x, domain_x[0], domain_x[1]), (y, domain_y[0], domain_y[1])))
cov = exp_xy - exp_x * exp_y
print("Covariance: ", cov)

# Correlation
corr = cov/(var_x * var_y) ** (1/2)
print("Correlation:  ", corr)        

Expectation of x:  2.4444444444444446
Expectation of y:  5.583333333333333
Variance of x:  -4.141975308641976
Variance of y:  -22.373611111111106
Covariance:  -9.925925925925927
Correlation:   -1.0310963144439693


## Combinations and Functions of Random Variables

### Linear Functions of Random Variables

- Linear function of a random variable: given $X$ random variable and $Y = aX + b$ with $a, b \in \mathbb{R}$ then:
\begin{equation}
    E(Y) = aE(X) + b \text{  and  } Var(Y) = a^2Var(X)
\end{equation}
- Standardization: if $X$ has expectation $\mu$ and variance $\sigma^2$ then:
\begin{equation}
    Y = \frac{X - \mu}{\sigma}
\end{equation}
has an expectation of zero and variance of one

- Sums of Random variables: given two random variables $X_1$ and $X_2$ then
\begin{equation}
    E(X_1 + X_2) = E(X_1) + E(X_2)
\end{equation}
and
\begin{equation}
    Var(X_1 + X_2) = Var(X_1) + Var(X_2) + 2Cov(X_1,X_2)
\end{equation}
- If $X_1$ and $X_2$ are independent, then:
\begin{equation}
    Var(X_1 + X_2) = Var(X_1) + Var(X_2)
\end{equation}

### Linear Combinations of Random Variables

- Linear combinations of random variables: if $X_1, \cdots, X_n$ is a sequence of random variables and $a_1, \cdots, a_n$ and $b$ are constants then:
\begin{equation}
    E(a_1X_1 + \cdots + a_n X_n + b) = a_1 E(X_1) + \cdots +a_n E(X_n) + b
\end{equation}
- If the random variables are independent:
\begin{equation}
    Var(a_1X_1 + \cdots + a_nX_n + b) = a_1^2Var(X_1) + \cdots + a_n^2 Var(X_n)
\end{equation}

- Averaging independent random variables: $X_1, \cdots, X_n$ sequence of random variables with expectation $\mu$ and variance $\sigma^2$ we have:
$\bar{X} = \frac{\sum_{i=1}^n X_i}{n}$ then
    - $E(\bar{X}) = \mu$
    - $Var(\bar{X}) = \frac{\sigma^2}{n}$

### Nonlinear Functions of a Random Variable

- Nonlinear function of a random variable $X$: another random variable $Y=g(X)$ for some nonlinear function g