<a href="https://colab.research.google.com/github/UznetDev/Data-science-home-work/blob/main/01_Okt_2024_home_work.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

In [2]:
# Load the dataset from 'train.csv'; the path is relative, so the file must sit
# in the notebook's working directory.
df = pd.read_csv('train.csv')

In [3]:
# Show only the first row: enough to check the column names and value formats.
df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [None]:
# Keep the 'Age' and 'Fare' columns as standalone Series.
age = df['Age']
fare = df['Fare']

In [None]:
# 2x2 correlation matrix of Age and Fare using DataFrame.corr's default method
# (Pearson).
df[['Age', 'Fare']].corr()

Unnamed: 0,Age,Fare
Age,1.0,0.096067
Fare,0.096067,1.0


In [None]:
# Same coefficient as the off-diagonal entry of the matrix, computed directly
# between the two Series and printed at full precision.
correlation = df['Age'].corr(df['Fare'])
print(correlation)

0.0960666917690389


In [78]:
class Correlation:
    def __init__(self,
                 x: np.ndarray | pd.Series | pd.DataFrame,
                 y: np.ndarray | pd.Series | pd.DataFrame):

        self.x = self._check_input(x)
        self.y = self._check_input(y)
        check = ~np.isnan(self.x) & ~np.isnan(self.y)
        self.x = self.x[check]
        self.y = self.y[check]

        if self.x.shape != self.y.shape:
            raise ValueError("x and y must have the same shape")

        self.len = len(self.x)
        self.mean_x = self._mean(self.x)
        self.mean_y = self._mean(self.y)
        self.variance_x = self._variance(self.x, self.mean_x)
        self.variance_y = self._variance(self.y, self.mean_y)
        self.std_x = np.sqrt(self.variance_x)
        self.std_y = np.sqrt(self.variance_y)
        self.covariance = self._covariance()
        self.correlation = self._pearson_correlation()
        self.pearson_correlation = self._pearson_correlation()
        self.spearman_correlation = self._spearman_correlation()

    def _check_input(self, data):
        if isinstance(data, pd.DataFrame):
            if data.shape[1] > 1:
                print(f"Only the first column is taken from the DataFrame: ({data.shape[1]})")
            data = data.iloc[:, 0]
        return np.array(data)

    def _mean(self, data):
        return sum(data) / len(data)

    def _variance(self, data, mean):
        return sum((x - mean) ** 2 for x in data) / (len(data) - 1)

    def _covariance(self):
        c = sum((xi - self.mean_x) * (yi - self.mean_y) for xi, yi in zip(self.x, self.y)) \
            / (len(self.x) - 1)
        return c

    def _pearson_correlation(self):
        return self.covariance / (self.std_x * self.std_y)

    def _rank(self, data):
        sorted_data = sorted((value, index) for index, value in enumerate(data))
        ranks = [0] * len(data)
        i = 0
        while i < len(sorted_data):
            value, index = sorted_data[i]
            same_value_indices = [index]
            j = i + 1
            while j < len(sorted_data) and sorted_data[j][0] == value:
                same_value_indices.append(sorted_data[j][1])
                j += 1
            average_rank = (i + 1 + j) / 2.0
            for idx in same_value_indices:
                ranks[idx] = average_rank
            i = j
        return ranks

    def _spearman_correlation(self):
        x = self._rank(self.x)
        y = self._rank(self.y)
        mean_x = self._mean(x)
        mean_y = self._mean(y)
        v_x = self._variance(x, mean_x)
        v_y = self._variance(y, mean_y)
        std_x = np.sqrt(v_x)
        std_y = np.sqrt(v_y)
        return sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y)) / (len(x) - 1) / (std_x * std_y)

Mean of x matches: 29.69911764705882
Mean of y differs: Correlation class = 34.69451400560218, pandas = 32.204207968574636
Variance of x matches: 211.0191247463081
Variance of y differs: Correlation class = 2800.413099695179, pandas = 2469.436845743117
Standard deviation of x matches: 14.526497332334044
Standard deviation of y differs: Correlation class = 52.9189295025436, pandas = 49.693428597180905
Covariance matches: 73.84902981461926
Pearson correlation matches: 0.09606669176903883
Spearman correlation matches: 0.13505121773428777


Tests (generated by AI): the cells below check the `Correlation` class against the equivalent pandas results.

In [81]:
# Small hand-made sample with NaNs at different positions in each column, so
# that pairwise deletion actually removes rows.
data = {
    'Age': [22, 38, 26, 35, np.nan, 28, 2, np.nan, 14, 4, 58, np.nan, 20, 39, 14, 55],
    'Fare': [7.25, 71.2833, 7.925, 53.1, 8.05, np.nan, 21.075, 11.1333, np.nan, 30.0708, 26.55, 8.05, 13.0, 31.275, 7.8542, 16.0]
}
# NOTE: this rebinds `df`, so from here on `df` is the toy sample and no
# longer the frame loaded from 'train.csv'.
df = pd.DataFrame(data)

corr_class = Correlation(df['Age'], df['Fare'])

# Reference values from pandas, computed on the rows where BOTH columns are
# present; this mirrors the filtering done inside Correlation.
mask = ~df['Age'].isna() & ~df['Fare'].isna()
df_filtered = df[mask]
mean_x_pandas = df_filtered['Age'].mean()
mean_y_pandas = df_filtered['Fare'].mean()
variance_x_pandas = df_filtered['Age'].var(ddof=1)
variance_y_pandas = df_filtered['Fare'].var(ddof=1)
std_x_pandas = df_filtered['Age'].std(ddof=1)
std_y_pandas = df_filtered['Fare'].std(ddof=1)
covariance_pandas = df_filtered['Age'].cov(df_filtered['Fare'])
pearson_corr_pandas = df_filtered['Age'].corr(df_filtered['Fare'], method='pearson')
spearman_corr_pandas = df_filtered['Age'].corr(df_filtered['Fare'], method='spearman')

# (label, value from Correlation, value from pandas), in report order.
comparisons = [
    ("Mean of x", corr_class.mean_x, mean_x_pandas),
    ("Mean of y", corr_class.mean_y, mean_y_pandas),
    ("Variance of x", corr_class.variance_x, variance_x_pandas),
    ("Variance of y", corr_class.variance_y, variance_y_pandas),
    ("Standard deviation of x", corr_class.std_x, std_x_pandas),
    ("Standard deviation of y", corr_class.std_y, std_y_pandas),
    ("Covariance", corr_class.covariance, covariance_pandas),
    ("Pearson correlation", corr_class.pearson_correlation, pearson_corr_pandas),
    ("Spearman correlation", corr_class.spearman_correlation, spearman_corr_pandas),
]

print("\nComparing results between Correlation class and pandas (after filtering):\n")
for label, ours, reference in comparisons:
    if np.isclose(ours, reference):
        print(f"{label} matches: {ours}")
    else:
        print(f"{label} differs: Correlation class = {ours}, pandas = {reference}")


Comparing results between Correlation class and pandas (after filtering):

Mean of x matches: 28.454545454545453
Mean of y matches: 25.94393636363636
Variance of x matches: 344.8727272727273
Variance of y matches: 413.04460297254536
Standard deviation of x matches: 18.570749238324428
Standard deviation of y matches: 20.323498787672985
Covariance matches: 98.43803181818183
Pearson correlation matches: 0.2608164744621913
Spearman correlation matches: 0.33636363636363636


In [82]:

# `df` here is the toy sample built in the previous cell, not the CSV loaded
# at the top of the notebook.
corr_class = Correlation(df['Age'], df['Fare'])

# Reference values from pandas on the UNFILTERED columns. The per-column
# statistics (mean/var/std) skip NaNs in that one column only, whereas
# Correlation drops a row when either value is missing, so those are expected
# to differ. Series.cov / Series.corr exclude missing values pairwise, which
# is why the last three lines still match.
mean_x_pandas = df['Age'].mean()
mean_y_pandas = df['Fare'].mean()
variance_x_pandas = df['Age'].var(ddof=1)
variance_y_pandas = df['Fare'].var(ddof=1)
std_x_pandas = df['Age'].std(ddof=1)
std_y_pandas = df['Fare'].std(ddof=1)
covariance_pandas = df['Age'].cov(df['Fare'])
pearson_corr_pandas = df['Age'].corr(df['Fare'], method='pearson')
spearman_corr_pandas = df['Age'].corr(df['Fare'], method='spearman')

# (label, value from Correlation, value from pandas), in report order.
comparisons = [
    ("Mean of x", corr_class.mean_x, mean_x_pandas),
    ("Mean of y", corr_class.mean_y, mean_y_pandas),
    ("Variance of x", corr_class.variance_x, variance_x_pandas),
    ("Variance of y", corr_class.variance_y, variance_y_pandas),
    ("Standard deviation of x", corr_class.std_x, std_x_pandas),
    ("Standard deviation of y", corr_class.std_y, std_y_pandas),
    ("Covariance", corr_class.covariance, covariance_pandas),
    ("Pearson correlation", corr_class.pearson_correlation, pearson_corr_pandas),
    ("Spearman correlation", corr_class.spearman_correlation, spearman_corr_pandas),
]

for label, ours, reference in comparisons:
    if np.isclose(ours, reference):
        print(f"{label} matches: {ours}")
    else:
        print(f"{label} differs: Correlation class = {ours}, pandas = {reference}")

Mean of x differs: Correlation class = 28.454545454545453, pandas = 27.307692307692307
Mean of y differs: Correlation class = 25.94393636363636, pandas = 22.329757142857144
Variance of x differs: Correlation class = 344.8727272727273, pandas = 303.39743589743586
Variance of y differs: Correlation class = 413.04460297254536, pandas = 369.79344740263736
Standard deviation of x differs: Correlation class = 18.570749238324428, pandas = 17.418307492332193
Standard deviation of y differs: Correlation class = 20.323498787672985, pandas = 19.230014233032627
Covariance matches: 98.43803181818183
Pearson correlation matches: 0.2608164744621913
Spearman correlation matches: 0.33636363636363636
