In [1]:
import numpy as np

#### Sample mean

$$\bar{x} = \frac{1}{n} \sum_{i=1}^{n} x_i$$


In [2]:
# Sample data
data = [25, 30, 35, 40, 45]
sample_mean = sum(data)/len(data)
print(sample_mean)

# Calculate the sample mean without NumPy

35.0


In [4]:
import pandas as pd
(pd.Series(data)).mean()

35.0

In [5]:
# Calculate the sample mean using NumPy
np.mean(data)


35.0

### Sample variance

$$
s^2 = \frac{1}{n-1} \sum_{i=1}^{n} (x_i - \bar{x})^2
$$



In [13]:
# Calculate the sample variance without NumPy

S = 0
n = len(data)
for xi in data:
  print(xi)
  S += (xi-sample_mean)**2

sample_variance = S/(n-1)

print(sample_variance)

25
30
35
40
45
62.5


In [7]:
sigma = np.sqrt(sample_variance)
print(sigma)

7.905694150420948


In [8]:
(pd.Series(data)).var()

62.5

In [11]:
sum((pd.Series(data) - sample_mean)**2)/(n-1)

62.5

In [15]:
# Calculate variance using numpy ddof = 1

np.var(data, ddof = 1)

62.5

### Sample mode

Count the frequency of each distinct value in the dataset.
Identify the value(s) with the highest frequency.
The value(s) with the highest frequency is the sample mode.

In [24]:
#Calculate mode for data_2 = [1,1,4,5,8,1,10]

data_2 = [1,1,4,5,8,1,10,]
s = pd.Series(data_2)
s = s.value_counts()
print(s)


1     3
4     1
5     1
8     1
10    1
dtype: int64


In [20]:
mode = s.index[0]
print(mode)

1


In [22]:
from statistics import mode as mode_func
mode_func(data_2)

1

### Median

$$
\text{Median} =
\begin{cases}
\text{middle data point} & \text{if the number of data points is odd} \\
\frac{\text{middle data point} + \text{middle data point + 1}}{2} & \text{if the number of data points is even}
\end{cases}
$$

In [33]:
# Calculate the sample median without NumPy

def calc_median(my_list):
  n = len(my_list)
  is_even = (n%2==0)
  sorted_data = sorted(my_list)

  if is_even:
    print(n//2-1)
    print(n//2)
    left_middle_point = sorted_data[n//2 -1]
    right_middle_point = sorted_data[n//2]
    median_estimate = (left_middle_point+right_middle_point)/2

  else:

    median_estimate = sorted_data[n//2]


  return median_estimate


In [32]:
len(data)//2

2

In [34]:
calc_median(data)

35

In [35]:
calc_median([1,10,9,4])

1
2


6.5

In [None]:
# Calculate the sample median using NumPy

### Correlation of two variables


$$ r = \frac{\sum{(x_i - \bar{x})(y_i - \bar{y})}}{\sqrt{\sum{(x_i - \bar{x})^2} \cdot \sum{(y_i - \bar{y})^2}}} $$

In [39]:
# Sample data for two variables

#Calculate the Pearson correlation coefficient r  without NumPy
variable1 = [1, 2, 3, 4, 5]
variable2 = [-2, -3, -4, -5, -6]

In [40]:
# Calculate the Pearson correlation coefficient without NumPy

x_mean = sum(variable1)/len(variable1)
y_mean = sum(variable2)/len(variable2)


upper_sum = 0
for xi, yi in zip(variable1, variable2):
  upper_sum += (xi-x_mean)*(yi-y_mean)


lower_x_sum = 0
for xi in variable1:
  lower_x_sum += (xi-x_mean)**2



lower_y_sum = 0
for yi in variable2:
  lower_y_sum += (yi-y_mean)**2

r = (upper_sum)/np.sqrt(lower_x_sum*lower_y_sum)
print(r)


-1.0


In [None]:
# Calculate the Pearson correlation coefficient with NumPy

In [41]:
#load iris data set and save it to dataframe

from sklearn import datasets

iris = datasets.load_iris()


In [43]:
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [45]:
iris["feature_names"]

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [46]:
df = pd.DataFrame(iris["data"], columns = iris["feature_names"])
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [49]:
# look in documentation for pandas function that returns correlation matrix
# what is the correlation between sepal lenght and petal length?

df.corr()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
sepal length (cm),1.0,-0.11757,0.871754,0.817941
sepal width (cm),-0.11757,1.0,-0.42844,-0.366126
petal length (cm),0.871754,-0.42844,1.0,0.962865
petal width (cm),0.817941,-0.366126,0.962865,1.0


In [47]:
#Calculate point estimates (mean, variance, median for all columns using padas build in functions)
print(df["sepal length (cm)"].mean())
print(df["sepal length (cm)"].std())
print(df["sepal length (cm)"].median())

5.843333333333334
0.828066127977863
5.8


In [None]:
# Can you think of df fuctnion we used on day 1 to give us this statistical overview?


In [48]:
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5
