In [None]:
import numpy as np

### Sample mean

$$\bar{x} = \frac{1}{n} \sum_{i=1}^{n} x_i$$


In [None]:
# Sample observations; later cells in this notebook reuse `data`
data = [25, 30, 35, 40, 45]

# Sample mean in plain Python: sum of the observations divided by their count
data_total = sum(data)
data_count = len(data)
mean_estimate = data_total / data_count
print("Sample Mean (Point Estimate):", mean_estimate)


In [None]:
# Sample mean via NumPy: convert the list to an array, then take its mean
data_array = np.asarray(data)
mean_estimate = data_array.mean()
print("Sample Mean (Point Estimate):", mean_estimate)


### Sample variance

$$
s^2 = \frac{1}{n-1} \sum_{i=1}^{n} (x_i - \bar{x})^2
$$



In [None]:
# Sample variance in plain Python.
# The sum of squared deviations is divided by n - 1 (not n), matching the s^2 formula.
mean = sum(data) / len(data)
squared_deviations = [(value - mean) ** 2 for value in data]
variance_estimate = sum(squared_deviations) / (len(data) - 1)
print("Sample Variance (Point Estimate):", variance_estimate)


In [None]:
# Sample variance via NumPy.
# ddof=1 makes the divisor n - 1 (NumPy's default ddof=0 would divide by n).
data_array = np.asarray(data)
variance_estimate = data_array.var(ddof=1)
print("Sample Variance (Point Estimate):", variance_estimate)

### Sample mode

1. Count the frequency of each distinct value in the dataset.
2. Identify the value(s) with the highest frequency.
3. The value(s) with the highest frequency are the sample mode(s).

In [None]:
from statistics import mode

import pandas as pd

# Calculate the mode for data_2 = [1, 1, 4, 5, 8, 1, 10]
data_2 = [1, 1, 4, 5, 8, 1, 10]

# Option 1: the standard library's statistics.mode
mode_estimate = mode(data_2)
print("Sample Mode (Point Estimate):", mode_estimate)

# Option 2: pandas. value_counts() orders the distinct values by descending
# frequency, so the first index label is the most frequent value.
s = pd.Series(data_2)
print(s)
frequencies = s.value_counts()
mode_estimate = frequencies.index[0]
print("Sample Mode (Point Estimate):", mode_estimate)


### Median

$$
\text{Median} =
\begin{cases}
x_{\left(\frac{n+1}{2}\right)} & \text{if } n \text{ is odd} \\
\frac{x_{\left(\frac{n}{2}\right)} + x_{\left(\frac{n}{2}+1\right)}}{2} & \text{if } n \text{ is even}
\end{cases}
$$

where $x_{(1)} \le x_{(2)} \le \dots \le x_{(n)}$ are the $n$ data points sorted in ascending order.

In [None]:
# Sample median in plain Python: sort, then pick the middle element
# (odd count) or average the two middle elements (even count).
sorted_data = sorted(data)
n = len(sorted_data)
mid = n // 2
if n % 2:
    # Odd number of points: a single middle element exists
    median_estimate = sorted_data[mid]
else:
    # Even number of points: average the two elements around the centre
    median_estimate = (sorted_data[mid - 1] + sorted_data[mid]) / 2
print("Sample Median (Point Estimate):", median_estimate)


In [None]:
# Calculate the sample median using NumPy
median_estimate = np.median(data)
print("Sample Median (Point Estimate):", median_estimate)

### Correlation of two variables


$$ r = \frac{\sum{(x_i - \bar{x})(y_i - \bar{y})}}{\sqrt{\sum{(x_i - \bar{x})^2} \cdot \sum{(y_i - \bar{y})^2}}} $$

In [None]:
# Sample data for two variables, used below to compute the Pearson
# correlation coefficient r.
# variable2 is variable1 shifted up by one, i.e. a perfectly linear
# relationship, so r should come out as 1 (up to floating-point rounding).
variable1 = list(range(1, 6))            # [1, 2, 3, 4, 5]
variable2 = [x + 1 for x in variable1]   # [2, 3, 4, 5, 6]

In [None]:
# Calculate the Pearson correlation coefficient without NumPy

# Calculate the means of both variables
mean_variable1 = sum(variable1) / len(variable1)
mean_variable2 = sum(variable2) / len(variable2)

# Calculate the sum of the products of the differences (numerator of r)
total_sum = 0
for x, y in zip(variable1, variable2):
    total_sum += (x - mean_variable1) * (y - mean_variable2)

# Calculate the sum of squared differences for each variable
x_sum = 0
y_sum = 0
for x, y in zip(variable1, variable2):
    x_sum += (x - mean_variable1) ** 2
    y_sum += (y - mean_variable2) ** 2

# Calculate the Pearson correlation coefficient
correlation_coefficient = total_sum / ((x_sum ** 0.5) * (y_sum ** 0.5))

print("Pearson Correlation Coefficient:", correlation_coefficient)


# Alternative: replace the explicit for loops with generator expressions.
# The pairs come from zip() rather than from indexing with range(n): `n` is
# not defined in this cell (it is set by the median cell from a different
# list), so relying on it would break on a fresh kernel if that cell is
# skipped, or give wrong results if the lengths differ.
total_sum = sum(
    (a - mean_variable1) * (b - mean_variable2)
    for a, b in zip(variable1, variable2)
)

# Calculate the sum of squared differences for each variable
x_sum = sum((a - mean_variable1) ** 2 for a in variable1)
y_sum = sum((b - mean_variable2) ** 2 for b in variable2)

# Calculate the Pearson correlation coefficient
correlation_coefficient = total_sum / ((x_sum ** 0.5) * (y_sum ** 0.5))

print("Pearson Correlation Coefficient:", correlation_coefficient)


In [None]:
# Pearson correlation via NumPy. np.corrcoef returns the full correlation
# matrix; for two variables the off-diagonal entries hold r.
correlation_matrix = np.corrcoef(x=variable1, y=variable2)
print(correlation_matrix)

In [None]:
from sklearn import datasets

# Load the iris dataset and wrap its feature matrix in a DataFrame whose
# columns are labelled with the dataset's feature names.
iris_bunch = datasets.load_iris()
iris = pd.DataFrame(data=iris_bunch["data"], columns=iris_bunch["feature_names"])
iris.head()

In [None]:
# Look in the documentation for the pandas function that returns the correlation matrix.
# What is the correlation between sepal length and petal length?
iris.corr()

In [None]:
# Calculate point estimates (mean, variance, standard deviation, median)
# for every column using pandas built-in functions.

for col, values in iris.items():
    print(col)
    print(values.mean())
    print(values.var())
    print(values.std())
    print(values.median())


In [None]:
# Can you think of the DataFrame function we used on day 1 that gives us this statistical overview?

iris.describe()