<a href="https://colab.research.google.com/github/UznetDev/Data-science-home-work/blob/main/03_Okt_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import numpy as np
import pandas as pd
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

Phi coefficient va Cramer's V uchun alohida faqat DataFrame va 2 ta ustun qabul qiluvchi funktsiyani yasash. Funktsiyalar uchun pandas va math kutubxonalaridan boshqalarni ishlatmaslik kerak. Tekshirish uchun bemalol tayyor funktsiyalardan foydalaninglar

In [20]:
# Synthetic demo data. A seeded local generator is used so that
# Restart Kernel -> Run All reproduces the same arrays (and therefore the
# same correlation values) on every run.
SEED = 42
rng = np.random.default_rng(SEED)

n = 10000
# Two independent 0/1 columns -> input for the phi coefficient (2x2 table).
binary_data = rng.integers(2, size=(n, 2))
# A 3-level categorical column and an integer column in [10, 100).
categories = rng.choice([5, 8, 10], size=n)
numerical_data = rng.integers(10, 100, size=n)
# Column 0: category label, column 1: numeric value.
categorical_data = np.column_stack((categories, numerical_data))

In [21]:
class Correlation:
    def __init__(self,
                 x: np.ndarray | pd.Series | pd.DataFrame,
                 y: np.ndarray | pd.Series | pd.DataFrame,
                 none=False):

        self.x = self._check_input(x)
        self.y = self._check_input(y)

        check = ~np.isnan(self.x) & ~np.isnan(self.y)
        self.x = self.x[check]
        self.y = self.y[check]

        if self.x.shape != self.y.shape:
            raise ValueError("x and y must have the same shape")

        self.len = len(self.x)
        self.mean_x = self._mean(self.x)
        self.mean_y = self._mean(self.y)
        self.variance_x = self._variance(self.x, self.mean_x)
        self.variance_y = self._variance(self.y, self.mean_y)
        self.std_x = np.sqrt(self.variance_x)
        self.std_y = np.sqrt(self.variance_y)
        self.covariance = self._covariance()
        self.correlation = self._pearson_correlation()
        self.pearson_correlation = self._pearson_correlation()
        self.spearman_correlation = self._spearman_correlation()
        self.contingency_table = self._get_contingency_table(self.x, self.y)
        self._chi2_contingency = self._chi2_contingency()
        self.cramers_v = self._cramers_v()
        self.phi_coefficient = self._phi_coefficient()



    def _check_input(self, data) -> float:
        if isinstance(data, pd.DataFrame):
            if data.shape[1] > 1:
                print(f"Only the first column is taken from the DataFrame: ({data.shape[1]})")
            data = data.iloc[:, 0]
        return np.array(data)

    def _mean(self, data) -> float:
        return sum(data) / len(data)

    def _variance(self, data, mean) -> float:
        return sum((x - mean) ** 2 for x in data) / (len(data) - 1)

    def _covariance(self) -> float:
        c = sum((xi - self.mean_x) * (yi - self.mean_y) for xi, yi in zip(self.x, self.y)) \
            / (len(self.x) - 1)
        return c

    def _pearson_correlation(self) -> float:
        return self.covariance / (self.std_x * self.std_y)

    def _rank(self, data):
        sort_value = sorted((val, i) for i, val in enumerate(data))
        r = [0] * len(data)
        i = 0
        while i < len(sort_value):
            y, k = sort_value[i]
            same_value = [k]
            j = i + 1
            while j < len(sort_value) and sort_value[j][0] == y:
                same_value.append(sort_value[j][1])
                j += 1
            avg = (i + 1 + j) / 2.0
            for idx in same_value:
                r[idx] = avg
            i = j
        return r

    def _spearman_correlation(self) -> float:
        x = self._rank(self.x)
        y = self._rank(self.y)
        mean_x = self._mean(x)
        mean_y = self._mean(y)
        v_x = self._variance(x, mean_x)
        v_y = self._variance(y, mean_y)
        std_x = np.sqrt(v_x)
        std_y = np.sqrt(v_y)
        return sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y)) / (len(x) - 1) / (std_x * std_y)

    def _get_contingency_table(self, x, y) -> np.array:
        unique_x = np.unique(x)
        unique_y = np.unique(y)
        table = np.zeros((len(unique_x), len(unique_y)), dtype=int)
        for i in range(len(x)):
            row_idx = np.where(unique_x == x[i])[0][0]
            col_idx = np.where(unique_y == y[i])[0][0]
            table[row_idx, col_idx] += 1

        return table
    def _chi2_contingency(self) -> float:
        n = np.sum(self.contingency_table)
        ex = np.zeros_like(self.contingency_table, dtype=float)

        for i in range(len(self.contingency_table)):
            for j in range(len(self.contingency_table[i])):
                ex[i, j] = (np.sum(self.contingency_table[i, :]) * np.sum(self.contingency_table[:, j])) / n
        return np.sum(((self.contingency_table - ex) ** 2) / ex)

    def _cramers_v(self) -> float:
        return np.sqrt(self._chi2_contingency / (self.len * (min(self.contingency_table.shape) - 1)))

    def _phi_coefficient(self) -> float:
        if self.contingency_table.shape != (2, 2):
            return "Phi coefficient is only for binarry data"
        else:
          return (self.contingency_table[0, 0] * self.contingency_table[1, 1] - \
                  self.contingency_table[0, 1] * self.contingency_table[1, 0]) / \
                 np.sqrt((self.contingency_table[0, :] * self.contingency_table[:, 1]).sum() * \
                         (self.contingency_table[0, :] * self.contingency_table[:, 0]).sum())

    def __str__(self) -> str:
        text = f"""
        Correlation: {self.correlation}
        Pearson correlation: {self.pearson_correlation}
        Spearman correlation: {self.spearman_correlation}
        Phi coefficient: {self.phi_coefficient}
        Cramer's V: {self.cramers_v}
        """
        return text

    def __repr__(self) -> str:
        return self.__str__()


# Compare the two 0/1 columns of `binary_data`. The bare expression makes the
# notebook display the instance through `__repr__`, i.e. the coefficient summary.
Correlation(binary_data[:, 0], binary_data[:, 1])


        Correlation: 0.017789395327598437
        Pearson correlation: 0.017789395327598437
        Spearman correlation: 0.017789395327595568
        Phi coefficient: 0.03544833349950903
        Cramer's V: 0.01778939532759555
        

In [22]:
# Compare the category column (values drawn from 5, 8, 10) with the integer
# column. The contingency table is not 2x2 here, so the phi slot shows the
# explanatory message instead of a number.
Correlation(categorical_data[:, 0], categorical_data[:, 1])


        Correlation: -0.01204775686018055
        Pearson correlation: -0.01204775686018055
        Spearman correlation: -0.012394403285048514
        Phi coefficient: Phi coefficient is only for binarry data
        Cramer's V: 0.10355156776852849
        