In [1]:
import numpy as np

### Sample mean

$$\bar{x} = \frac{1}{n} \sum_{i=1}^{n} x_i$$


In [2]:
# Observations shared by the point-estimate examples
data = [25, 30, 35, 40, 45]

# Sample mean by hand (no NumPy): add up the observations, then divide by the count
observation_total = sum(data)
mean_estimate = observation_total / len(data)
print("Sample Mean (Point Estimate):", mean_estimate)


Sample Mean (Point Estimate): 35.0


In [3]:
# Same point estimate, this time delegated to NumPy's array mean
mean_estimate = np.asarray(data).mean()
print("Sample Mean (Point Estimate):", mean_estimate)


Sample Mean (Point Estimate): 35.0


### Sample variance

$$
s^2 = \frac{1}{n-1} \sum_{i=1}^{n} (x_i - \bar{x})^2
$$



In [4]:
# Sample variance by hand (no NumPy): sum the squared deviations from the
# mean and divide by n - 1 rather than n
mean = sum(data) / len(data)
squared_deviations = [(x - mean) ** 2 for x in data]
degrees_of_freedom = len(data) - 1
variance_estimate = sum(squared_deviations) / degrees_of_freedom
print("Sample Variance (Point Estimate):", variance_estimate)


Sample Variance (Point Estimate): 62.5


In [5]:
# NumPy equivalent: ddof=1 makes the divisor n - 1, matching the sample
# variance formula used in the manual calculation
variance_estimate = np.asarray(data).var(ddof=1)
print("Sample Variance (Point Estimate):", variance_estimate)

Sample Variance (Point Estimate): 62.5


### Sample mode

1. Count the frequency of each distinct value in the dataset.
2. Identify the value(s) with the highest frequency.
3. Those value(s) are the sample mode; a dataset can have more than one mode.

In [6]:
from statistics import mode

import pandas as pd

# Small dataset in which the value 1 occurs most often
data_2 = [1, 1, 4, 5, 8, 1, 10]

# Approach 1: the standard library's statistics.mode
mode_estimate = mode(data_2)
print("Sample Mode (Point Estimate):", mode_estimate)

# Approach 2: pandas. value_counts() lists the distinct values from most to
# least frequent, so the first index label is the mode.
s = pd.Series(data_2)
print(s)
frequency_table = s.value_counts()
mode_estimate = frequency_table.index[0]
print("Sample Mode (Point Estimate):", mode_estimate)


Sample Mode (Point Estimate): 1
0     1
1     1
2     4
3     5
4     8
5     1
6    10
dtype: int64
Sample Mode (Point Estimate): 1


### Median

$$
\text{Median} = 
\begin{cases}
\text{middle value of the sorted data} & \text{if the number of data points is odd} \\
\frac{\text{lower middle value} + \text{upper middle value}}{2} & \text{if the number of data points is even}
\end{cases}
$$

In [7]:
# Sample median by hand (no NumPy): sort the data, then take the middle
# value, or the average of the two middle values when the count is even
sorted_data = sorted(data)
n = len(sorted_data)
middle = n // 2
if n % 2 == 1:
    median_estimate = sorted_data[middle]
else:
    median_estimate = (sorted_data[middle - 1] + sorted_data[middle]) / 2
print("Sample Median (Point Estimate):", median_estimate)


Sample Median (Point Estimate): 35


In [8]:
# NumPy equivalent; np.median takes care of the odd/even cases itself
median_estimate = np.median(np.asarray(data))
print("Sample Median (Point Estimate):", median_estimate)

Sample Median (Point Estimate): 35.0


### Correlation of two variables


$$ r = \frac{\sum{(x_i - \bar{x})(y_i - \bar{y})}}{\sqrt{\sum{(x_i - \bar{x})^2} \cdot \sum{(y_i - \bar{y})^2}}} $$

In [9]:
# Paired sample data for two variables. variable2 is variable1 shifted by +1,
# so the two are perfectly linearly related.
variable1 = list(range(1, 6))
variable2 = [value + 1 for value in variable1]

In [10]:
# Calculate the Pearson correlation coefficient without NumPy
#
#   r = sum((x - x_mean) * (y - y_mean))
#       / sqrt(sum((x - x_mean)^2) * sum((y - y_mean)^2))

# The observations are paired, so both variables must have the same length;
# zip() would otherwise silently drop the unmatched tail.
assert len(variable1) == len(variable2), "variable1 and variable2 must have the same length"

# Calculate the means of both variables
mean_variable1 = sum(variable1) / len(variable1)
mean_variable2 = sum(variable2) / len(variable2)


# --- Version 1: explicit for loop ---

# total_sum: sum of the products of the paired deviations (numerator)
# x_sum / y_sum: sum of squared deviations for each variable (denominator)
total_sum = 0
x_sum = 0
y_sum = 0
for x, y in zip(variable1, variable2):
    total_sum += (x - mean_variable1) * (y - mean_variable2)
    x_sum += (x - mean_variable1) ** 2
    y_sum += (y - mean_variable2) ** 2

# Calculate the Pearson correlation coefficient.
# Floating-point rounding in the two square roots makes the printed value
# 0.9999999999999998 rather than exactly 1 for this perfectly linear data.
correlation_coefficient = total_sum / ((x_sum ** 0.5) * (y_sum ** 0.5))

print("Pearson Correlation Coefficient:", correlation_coefficient)


# --- Version 2: the same sums, shortened with generator expressions ---

# Iterate over the paired values themselves rather than over an index range,
# so the sums always cover exactly these two variables and do not depend on a
# length variable defined in some other cell.
total_sum = sum(
    (x - mean_variable1) * (y - mean_variable2)
    for x, y in zip(variable1, variable2)
)

# Calculate the sum of squared differences for each variable
x_sum = sum((x - mean_variable1) ** 2 for x in variable1)
y_sum = sum((y - mean_variable2) ** 2 for y in variable2)

# Calculate the Pearson correlation coefficient
correlation_coefficient = total_sum / ((x_sum ** 0.5) * (y_sum ** 0.5))

print("Pearson Correlation Coefficient:", correlation_coefficient)


Pearson Correlation Coefficient: 0.9999999999999998
Pearson Correlation Coefficient: 0.9999999999999998


In [11]:
# NumPy equivalent: np.corrcoef returns the full correlation matrix, with each
# input sequence treated as one variable. The off-diagonal entries are r
# between variable1 and variable2.
correlation_matrix = np.corrcoef([variable1, variable2])
print(correlation_matrix)

[[1. 1.]
 [1. 1.]]


In [12]:
from sklearn import datasets

# Load the iris measurements and wrap them in a DataFrame whose columns are
# labelled with the feature names shipped with the dataset
iris_bunch = datasets.load_iris()
iris = pd.DataFrame(iris_bunch.data, columns=iris_bunch.feature_names)
iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [13]:
# Look in the pandas documentation for the function that returns a correlation matrix.
# What is the correlation between sepal length and petal length?
# (Pearson is the default method; it is spelled out here for clarity.)
iris.corr(method="pearson")

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
sepal length (cm),1.0,-0.11757,0.871754,0.817941
sepal width (cm),-0.11757,1.0,-0.42844,-0.366126
petal length (cm),0.871754,-0.42844,1.0,0.962865
petal width (cm),0.817941,-0.366126,0.962865,1.0


In [14]:
# Calculate point estimates for all columns using pandas built-in methods.
# Printed per column, one value per line: name, mean, variance, standard
# deviation, median.
for col, values in iris.items():
    print(col, values.mean(), values.var(), values.std(), values.median(), sep="\n")


sepal length (cm)
5.843333333333334
0.6856935123042507
0.828066127977863
5.8
sepal width (cm)
3.0573333333333337
0.189979418344519
0.4358662849366982
3.0
petal length (cm)
3.7580000000000005
3.116277852348993
1.7652982332594662
4.35
petal width (cm)
1.1993333333333336
0.5810062639821029
0.7622376689603465
1.3


In [15]:
# Can you think of the DataFrame function we used on day 1 that gives this
# statistical overview in one call? (The quartiles requested here are the defaults.)

iris.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5
