In [1]:
import numpy as np

### Sample mean

$$\bar{x} = \frac{1}{n} \sum_{i=1}^{n} x_i$$


In [4]:
# Sample data
data = [25, 30, 35, 40, 45]

# Sample mean by hand (no NumPy): total of the observations divided by their count
n_obs = len(data)
mean = sum(data) / n_obs
mean

35.0

In [5]:
# Same sample mean, this time delegated to NumPy:
# convert the list to an array and use the array's own mean() method.
np_mean = np.asarray(data).mean()
np_mean

35.0

### Sample variance

$$
s^2 = \frac{1}{n-1} \sum_{i=1}^{n} (x_i - \bar{x})^2
$$



In [13]:
# Show every observation on its own line
print(*data, sep="\n")

25
30
35
40
45


In [16]:
# Calculate the sample variance without NumPy
# s^2 = sum((x_i - x_bar)^2) / (n - 1)
#
# The mean is recomputed here as a float and used as-is. Truncating it with
# int() gives a wrong variance whenever the mean is not a whole number, and
# recomputing keeps this cell correct even if `mean` has been rebound elsewhere.
sample_mean = sum(data) / len(data)

# Despite its name, this list holds the *squared* deviations from the mean.
mean_centered_data = [(i - sample_mean)**2 for i in data]

# Bessel's correction: divide by n - 1 (requires at least two observations).
variance = sum(mean_centered_data)/(len(data)-1)
variance

62.5

In [17]:
# Sample variance with NumPy.
# ddof=1 makes the divisor n - 1, matching the formula for s^2 above.
variance_np = np.asarray(data).var(ddof=1)
variance_np

62.5

### Sample mode

1. Count the frequency of each distinct value in the dataset.
2. Identify the value(s) with the highest frequency.
3. The value(s) with the highest frequency are the sample mode(s); a dataset can have more than one mode.

In [48]:
# Calculate mode for data_2 = [1,1,4,5,8,1,10]
data_2 = [1,1,4,5,8,1,10]

# Tally how often each value occurs in a single pass over the list.
# (Calling data_2.count(i) for every element rescans the whole list each time
# and recounts values that repeat.)
frequency_dict = {}
for value in data_2:
    frequency_dict[value] = frequency_dict.get(value, 0) + 1

# The mode is the key with the largest tally; on ties, max() keeps the value
# that was seen first in data_2.
mode_ = max(frequency_dict, key=frequency_dict.get)
frequency = frequency_dict[mode_]
mode_, frequency


(1, 3)

In [None]:
# Mode using SciPy.
# Import the stats submodule explicitly: a bare `import scipy` is not
# guaranteed to make `scipy.stats` available on every SciPy version.
import scipy.stats
scipy.stats.mode(data_2)

ModeResult(mode=1, count=3)

In [None]:
# Alternative: look only at the distinct values and keep the one that
# list.count ranks highest.
distinct_values = set(data_2)
mode_ = max(distinct_values, key=data_2.count)
print(f"Mode: {mode_}, Frequency: {data_2.count(mode_)}")

### Median

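$$
\text{Median} =
\begin{cases}
x_{\left(\frac{n+1}{2}\right)} & \text{if } n \text{ is odd} \\
\frac{x_{\left(\frac{n}{2}\right)} + x_{\left(\frac{n}{2}+1\right)}}{2} & \text{if } n \text{ is even}
\end{cases}
$$

Here $x_{(k)}$ is the $k$-th smallest value of the sorted data and $n$ is the number of data points: the median is the middle data point when $n$ is odd, and the average of the two middle data points when $n$ is even.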

In [None]:
# Calculate the sample median without NumPy
# sorted() hands back a new, ordered list. (list.sort() would instead sort in
# place and return None, so printing its result shows nothing useful.)
data = sorted([10, 20, 12, 16, 18, 37])

# Middle index, plus the remainder that tells us whether the count is odd
middle_index, remainder = divmod(len(data), 2)

if remainder:
    print('no of data points is odd')
    median = data[middle_index]
else:
    print('no of data points is even')
    # Even count: average the two values on either side of the centre
    median = (data[middle_index] + data[middle_index - 1]) / 2

print(median)
print(middle_index)


no of data points is even
17.0
3


In [99]:
# Sample median with NumPy (np.median does not need the input to be pre-sorted)
median_np = np.median(a=data)
median_np

17.0

### Correlation of two variables


$$ r = \frac{\sum{(x_i - \bar{x})(y_i - \bar{y})}}{\sqrt{\sum{(x_i - \bar{x})^2} \cdot \sum{(y_i - \bar{y})^2}}} $$

In [20]:
# Pearson correlation coefficient r computed by hand (no NumPy)

# Sample data for two variables
variable1 = [1, 2, 3, 4, 5]
variable2 = [2, 3, 4, 5, 6]

# Centre each variable on its own mean
mean_var1 = sum(variable1) / len(variable1)
mean_var2 = sum(variable2) / len(variable2)
mean_centered_variable1 = [x - mean_var1 for x in variable1]
mean_centered_variable2 = [y - mean_var2 for y in variable2]

# Numerator: sum of the pairwise products of the centred values
multi_var1_var2 = [
    dx * dy
    for dx, dy in zip(mean_centered_variable1, mean_centered_variable2)
]
numerator = sum(multi_var1_var2)

# Denominator: square root of the product of the two sums of squares
sum_mean_cen_var1 = sum(dx ** 2 for dx in mean_centered_variable1)
sum_mean_cen_var2 = sum(dy ** 2 for dy in mean_centered_variable2)
denominator = (sum_mean_cen_var1 * sum_mean_cen_var2) ** 0.5

correlation = numerator / denominator
correlation

1.0

In [23]:
# Pearson correlation with NumPy.
# np.corrcoef returns the full 2x2 correlation matrix; the off-diagonal
# entries hold r for the pair (variable1, variable2).
import numpy as np  # repeated from the first cell
correlation_np = np.corrcoef(x=variable1, y=variable2)
correlation_np

array([[1., 1.],
       [1., 1.]])

In [None]:
# Load the iris data set (it is wrapped in a DataFrame in a later cell)

import pandas as pd
from sklearn import datasets

iris = datasets.load_iris()


{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [33]:
# Build a DataFrame from the iris measurements, one named column per feature.
# (Exercise: look in the documentation for the pandas function that returns
# the correlation matrix.)
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Peek at a single column, selected by its label
df.loc[:, 'petal length (cm)']

0      1.4
1      1.4
2      1.3
3      1.5
4      1.4
      ... 
145    5.2
146    5.0
147    5.2
148    5.4
149    5.1
Name: petal length (cm), Length: 150, dtype: float64

In [38]:
# What is the correlation between sepal length and petal length?
correlation_Table = df.corr(method='pearson')

# Look the pair up by column label rather than by position, so the result does
# not silently change if the column order ever does.
corr_bw_sep_n_pet_length = correlation_Table.loc['sepal length (cm)', 'petal length (cm)']

# Display the answer to the question above
corr_bw_sep_n_pet_length

In [42]:
# Point estimates (mean, median, variance) for all columns, using pandas' built-in functions.
# NOTE: this rebinds `mean`, `median` and `variance` -- plain floats in the
# earlier hand-computed cells -- to per-column pandas Series.
mean, median, variance = df.mean(), df.median(), df.var()

mean, median, variance

(sepal length (cm)    5.843333
 sepal width (cm)     3.057333
 petal length (cm)    3.758000
 petal width (cm)     1.199333
 dtype: float64,
 sepal length (cm)    5.80
 sepal width (cm)     3.00
 petal length (cm)    4.35
 petal width (cm)     1.30
 dtype: float64,
 sepal length (cm)    0.685694
 sepal width (cm)     0.189979
 petal length (cm)    3.116278
 petal width (cm)     0.581006
 dtype: float64)

In [43]:
# Can you think of the DataFrame function we used on day 1 that gives us this statistical overview?
# describe() reports count, mean, std, min, quartiles and max for each column.

df.describe()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5
