# Appendix: Statistics (Part 1)

## Descriptive Statistics

### Population vs. Sample (Price Returns for S&P 500 Companies in 2017)

The __S&P 500__, or just the S&P, is a __stock market index__ that measures the stock performance of __500 large companies__ listed on stock exchanges in the United States. It is one of the most commonly followed equity indices, and many consider it to be one of the best representations of the U.S. stock market. <br>
The S&P 500 is a __capitalization-weighted index__, and the performance of the __10 largest companies__ in the index accounts for __21.8% of the performance__ of the index.

__What is the equally weighted return in 2017?__

In [1]:
import numpy as np
np.set_printoptions(precision=2, suppress= True)

__Population: 2017 Price Return for all 500 Companies__ 

In [2]:
pop = np.loadtxt("SP500_pop.csv", delimiter = ",", usecols = 1)

In [11]:
pop

array([ 46.99,  11.44, -41.05,  46.11,  54.44,  17.43,  66.32,  48.58,
        30.7 ,  70.22,  22.6 , -12.2 ,  14.02,  10.93,  41.64,  12.45,
        16.85,  -6.8 ,  26.12, -22.11,  -8.77,  -3.83,   8.59,  21.79,
        -2.46,  48.57, 131.13, -17.15,  41.27,  24.31,  -2.26,  58.41,
        11.01,  -9.35,  49.12,  41.26,  18.94,  52.76,  35.  ,  55.96,
       143.44,  59.57,  56.51,  20.15,  29.42, -33.48,  14.09,  30.65,
        50.27,  17.51,  46.98,  15.83,  75.35,   0.71,  45.33,  63.57,
        26.44,  34.06,  -9.93,  89.43,  33.57,  45.78,   5.74,  60.46,
        29.3 ,   9.47,  52.87, -51.3 ,  12.34,  13.68,  18.53,  35.  ,
         0.85,   4.86,  36.62,  21.62,  14.61,  29.54,   3.38,  25.21,
        -4.75, -14.87,  69.92,  10.6 ,  68.62,  37.54,  -7.26,  27.94,
        27.49,  65.82,  33.4 ,  35.99,  -9.84,  42.26,  35.13,  17.82,
        13.53,  21.61,  16.69,  52.25,  -1.03,  15.3 ,  23.93,  27.46,
        15.99,  26.61, -23.4 ,  29.25,  13.65,  78.55,  15.1 ,  14.14,
      

In [4]:
pop = pop * 100

In [5]:
pop.size

500

__Sample: 2017 Price Return for 50 Companies (randomly selected)__ 

In [6]:
sample = np.loadtxt("sample.csv", delimiter = ",", usecols = 1)

In [7]:
sample = sample * 100

In [8]:
sample

array([ 41.58,  15.94,  48.01,  57.52,  -3.45,  13.84,  17.43,  37.18,
        34.25,   6.7 ,   9.82,  -1.15,  49.12,  10.66,  94.88, -12.2 ,
         4.72,  -7.34,  29.24,  70.64,  42.26,  38.17, -12.32,  24.01,
        19.24,  25.21,  -0.22,  89.43,  -9.65,  20.29,   9.16,  37.54,
        10.4 ,  18.94, -12.75,  14.83,  24.39,  23.06,  -9.35, 132.3 ,
        11.51,  70.22,  35.58,  -0.84,  50.27,  26.26,  12.45,  -9.33,
        -2.46,  69.92])

In [9]:
sample.size

50

In [10]:
# Check every sampled return for membership in the population and print
# the result, one line per sample element.
for sampled_return in sample:
    found_in_pop = sampled_return in pop
    print(found_in_pop)

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [13]:
np.isin(sample, pop)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])

### Visualizing Frequency Distributions with plt.hist()

In [12]:
import numpy as np
import matplotlib.pyplot as plt
np.set_printoptions(precision=2, suppress= True)

In [None]:
pop

In [None]:
sample

In [None]:
# Absolute-frequency histogram of the population returns (75 bins).
tick_positions = np.arange(-100, 401, 25)  # x ticks every 25 units, -100 .. 400

plt.figure(figsize=(12, 8))
plt.hist(pop, bins=75)
plt.xticks(tick_positions)
plt.xlabel("Stock Returns 2017 (in %)", fontsize=15)
plt.ylabel("Absolute Frequency", fontsize=15)
plt.title("Absolute Frequencies - Population", fontsize=20)
plt.show()

In [None]:
# Absolute-frequency histogram of the sample returns (15 bins).
plt.figure(figsize=(12, 8))
plt.hist(sample, bins=15)
plt.xlabel("Stock Returns 2017 (in %)", fontsize=15)
plt.ylabel("Absolute Frequency", fontsize=15)
plt.title("Absolute Frequencies - Sample", fontsize=20)
plt.show()

### Relative and Cumulative Frequencies with plt.hist()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
np.set_printoptions(precision=4, suppress= True)

In [None]:
pop.size

In [None]:
sample.size

In [None]:
(np.ones(len(pop)) / len(pop))

In [None]:
# Relative-frequency histogram: every observation carries the weight 1/n,
# so each bar height is the share of observations in that bin.
equal_weights = np.ones(len(pop)) / len(pop)

plt.figure(figsize=(12, 8))
plt.hist(pop, bins=75, weights=equal_weights)
plt.xlabel("Stock Returns 2017 (in %)", fontsize=15)
plt.ylabel("Relative Frequency", fontsize=15)
plt.title("Relative Frequencies - Population", fontsize=20)
plt.show()

In [None]:
# NOTE: density=True rescales the bars so that their total AREA equals 1
# (height = count / (n * bin_width)). The y-axis therefore shows a
# probability density, not relative frequencies: the bar heights only sum
# to 1 if every bin happens to have width 1. Use weights = 1/n when true
# relative frequencies are wanted.
plt.figure(figsize = (12, 8))
plt.hist(pop, bins = 50, density = True)
plt.title("Probability Density - Population", fontsize = 20)
plt.xlabel("Stock Returns 2017 (in %)", fontsize = 15)
plt.ylabel("Probability Density", fontsize = 15)
plt.show()

In [None]:
# Cumulative absolute frequencies: each bar shows the count of its own bin
# plus all bins to its left (raw counts, no normalisation).
plt.figure(figsize=(12, 8))
plt.hist(pop, bins=50, cumulative=True, density=False)
plt.xlabel("Stock Returns 2017 (in %)", fontsize=15)
plt.ylabel("Cumulative Absolute Frequency", fontsize=15)
plt.title("Cumulative Absolute Frequencies - Population", fontsize=20)
plt.show()

In [None]:
# Cumulative relative frequencies: with density=True and cumulative=True
# the histogram is normalised so that the last bin reaches 1.
plt.figure(figsize=(12, 8))
plt.hist(pop, bins=50, cumulative=True, density=True)
plt.xlabel("Stock Returns 2017 (in %)", fontsize=15)
plt.ylabel("Cumulative Relative Frequency", fontsize=15)
plt.title("Cumulative Relative Frequencies - Population", fontsize=20)
plt.show()

### Measures of Central Tendency - Mean and Median

In [None]:
import numpy as np
np.set_printoptions(precision=4, suppress= True)

__Population Mean__

In [None]:
pop

In [None]:
pop.mean()

In [None]:
np.mean(pop)

__Sample Mean__

In [None]:
sample

In [None]:
sample.mean()

__Median__

In [None]:
np.median(pop)

In [None]:
np.median(sample)

In [None]:
sample.sort()

In [None]:
# Median by hand: average the two middle entries (indices 24 and 25) of the
# 50-element sample, which must already be sorted at this point.
sample[24:26].mean()

### Measures of Central Tendency - Geometric Mean

In [None]:
import numpy as np
np.set_printoptions(precision=4, suppress= True)

In [None]:
# Four consecutive price observations (the name suggests 2015 through 2018).
Price_2015_2018 = np.asarray([100, 107, 102, 110])

In [None]:
# Simple period-over-period returns: each price divided by the price
# before it, minus 1. Yields one return fewer than there are prices.
later_prices = Price_2015_2018[1:]
earlier_prices = Price_2015_2018[:-1]
ret = later_prices / earlier_prices - 1
ret

In [None]:
mean = ret.mean()
mean

In [None]:
100 * (1 + mean)**3

In [None]:
# Geometric mean return: compound the growth factors (1 + r), take the
# n-th root (n = number of returns) and subtract 1.
growth_factors = 1 + ret
n_periods = ret.size
geo_mean = np.prod(growth_factors) ** (1 / n_periods) - 1
geo_mean

In [None]:
100 * (1 + geo_mean)**3

In [None]:
(110 / 100)**(1/ret.size) - 1 

### Range, Minimum and Maximum

In [None]:
import numpy as np
np.set_printoptions(precision=4, suppress= True)

In [None]:
pop = np.loadtxt("SP500_pop.csv", delimiter = ",", usecols = 1)

In [None]:
pop = pop * 100

In [None]:
pop

In [None]:
pop.size

In [None]:
pop.max()

In [None]:
pop.min()

In [None]:
# Range ("peak to peak") = maximum - minimum of the population returns.
# np.ptp() is used because the ndarray.ptp() method was removed in
# NumPy 2.0, and the result is stored as pop_range so that the built-in
# range() is not shadowed for the rest of the session.
pop_range = np.ptp(pop)
pop_range

In [None]:
pop.max() - pop.min()

In [None]:
sample = np.loadtxt("sample.csv", delimiter = ",", usecols = 1)

In [None]:
sample = sample * 100

In [None]:
sample

In [None]:
sample.size

In [None]:
np.max(sample)

In [None]:
np.min(sample)

In [None]:
# Range (maximum - minimum) of the sample. np.ptp() replaces the
# ndarray.ptp() method, which was removed in NumPy 2.0.
np.ptp(sample)

### Variance and Standard Deviation

In [None]:
import numpy as np
np.set_printoptions(precision=4, suppress= True)

In [None]:
pop

In [None]:
sample

__Population Variance__

In [None]:
pop.var()

In [None]:
np.var(pop)

__Sample Variance__

In [None]:
# Default ddof = 0: the squared deviations are divided by n, i.e. the
# population formula applied to the sample data.
np.var(sample)

In [None]:
# ddof = 1: the squared deviations are divided by n - 1 (sample variance).
np.var(sample, ddof = 1)

__Standard Deviation__

In [None]:
np.sqrt(pop.var())

In [None]:
pop.std()

In [None]:
np.sqrt(np.var(sample, ddof = 1))

In [None]:
sample.std(ddof = 1)

### Percentiles

In [None]:
import numpy as np
np.set_printoptions(precision=4, suppress= True)

In [None]:
pop

In [None]:
np.percentile(pop, 50)

In [None]:
np.median(pop)

In [None]:
np.percentile(pop, 5)

In [None]:
np.percentile(pop, 95)

In [None]:
np.percentile(pop, [25, 75])

In [None]:
np.percentile(pop, [5, 95])

In [None]:
np.percentile(pop, [2.5, 97.5])

### How to calculate Skew & Kurtosis with scipy

In [None]:
import numpy as np
import matplotlib.pyplot as plt
np.set_printoptions(precision=4, suppress= True)

In [None]:
pop = np.loadtxt("SP500_pop.csv", delimiter = ",", usecols = 1)

In [None]:
pop = pop * 100

In [None]:
pop

In [None]:
# Absolute-frequency histogram of the population returns (75 bins),
# with x ticks every 25 units from -100 to 400.
x_ticks = np.arange(-100, 401, 25)

plt.figure(figsize=(12, 8))
plt.hist(pop, bins=75)
plt.title("Absolute Frequencies - Population", fontsize=20)
plt.ylabel("Absolute Frequency", fontsize=15)
plt.xlabel("Stock Returns 2017 (in %)", fontsize=15)
plt.xticks(x_ticks)
plt.show()

In [None]:
import scipy.stats as stats

__Skew__

In [None]:
stats.skew(pop)

__Kurtosis__

In [None]:
# fisher = True: Fisher's definition (excess kurtosis), where 3.0 is
# subtracted so that a normal distribution gives 0.
stats.kurtosis(pop, fisher = True)

In [None]:
# fisher = False: Pearson's definition, where a normal distribution gives 3.
stats.kurtosis(pop, fisher = False)