<a href="https://colab.research.google.com/github/UznetDev/Data-science-home-work/blob/main/08_Okt_2024_home_work.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import LabelEncoder
import warnings
from typing import Union, List
warnings.filterwarnings('ignore')

In [8]:
# Load train.csv (path is relative to the notebook's working directory) into `df`.
df = pd.read_csv('train.csv')

In [9]:
# Show the first row to check which columns were parsed.
df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [10]:
class Correlation:
    def __init__(self,
                 x: np.ndarray | pd.Series | pd.DataFrame | List,
                 y: np.ndarray | pd.Series | pd.DataFrame | List,
                 none=False):

        self.x = self._check_input(x)
        self.y = self._check_input(y)

        self.x, self._x_classes_ = self._label_encoder(self.x)
        self.y, self._y_classes_ = self._label_encoder(self.y)

        # self.x = pd.to_numeric(self.x, errors='coerce')
        # self.y = pd.to_numeric(self.y, errors='coerce')

        check = ~np.isnan(self.x) & ~np.isnan(self.y)
        self.x = self.x[check]
        self.y = self.y[check]

        if self.x.shape != self.y.shape:
            raise ValueError(f"x and y must have the same shape (x={self.x.shape}, y={self.y.shape})")

        self.len = len(self.x)
        self.mean_x = self._mean(self.x)
        self.mean_y = self._mean(self.y)
        self.variance_x = self._variance(self.x, self.mean_x)
        self.variance_y = self._variance(self.y, self.mean_y)
        self.std_x = np.sqrt(self.variance_x)
        self.std_y = np.sqrt(self.variance_y)
        self.covariance = self._covariance()
        self.correlation = self._pearson_correlation()
        self.pearson_correlation = self._pearson_correlation()
        self.spearman_correlation = self._spearman_correlation()
        self.contingency_table = self._get_contingency_table(self.x, self.y)
        self.chi2_contingency = self._chi2_contingency()
        self.cramers_v = self._cramers_v()
        self.phi_coefficient = self._phi_coefficient()
        self.point_biserial = self._point_biserial()
        self.kendall_tau = self._kendall_tau()
        self.WoE = self._get_WoE()
        self.view_WoE = self._view_WoE()

    def _label_encoder(self, data) -> np.array:
        lbl = LabelEncoder()
        r = lbl.fit_transform(data)
        return r, lbl.classes_



    def _check_input(self, data) -> float:
        if isinstance(data, pd.DataFrame):
            if data.shape[1] > 1:
                print(f"Only the first column is taken from the DataFrame: ({data.shape[1]})")
            data = data.iloc[:, 0]
        return np.array(data)

    def _mean(self, data) -> float:
      try:
          return sum(data) / len(data)
      except ZeroDivisionError:
          return 0

    def _variance(self, data, mean) -> float:
        return sum((x - mean) ** 2 for x in data) / (len(data) - 1)

    def _covariance(self) -> float:
        c = sum((xi - self.mean_x) * (yi - self.mean_y) for xi, yi in zip(self.x, self.y)) \
            / (len(self.x) - 1)
        return c

    def _pearson_correlation(self) -> float:
        return self.covariance / (self.std_x * self.std_y)

    def _rank(self, data):
        sort_value = sorted((val, i) for i, val in enumerate(data))
        r = [0] * len(data)
        i = 0
        while i < len(sort_value):
            y, k = sort_value[i]
            same_value = [k]
            j = i + 1
            while j < len(sort_value) and sort_value[j][0] == y:
                same_value.append(sort_value[j][1])
                j += 1
            avg = (i + 1 + j) / 2.0
            for idx in same_value:
                r[idx] = avg
            i = j
        return r

    def _spearman_correlation(self) -> float:
        x = self._rank(self.x)
        y = self._rank(self.y)
        mean_x = self._mean(x)
        mean_y = self._mean(y)
        v_x = self._variance(x, mean_x)
        v_y = self._variance(y, mean_y)
        std_x = np.sqrt(v_x)
        std_y = np.sqrt(v_y)
        return sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y)) / (len(x) - 1) / (std_x * std_y)

    def _get_contingency_table(self, x, y) -> np.array:
        unique_x = np.unique(x)
        unique_y = np.unique(y)
        table = np.zeros((len(unique_x), len(unique_y)), dtype=int)
        for i in range(len(x)):
            row_idx = np.where(unique_x == x[i])[0][0]
            col_idx = np.where(unique_y == y[i])[0][0]
            table[row_idx, col_idx] += 1

        return table
    def _chi2_contingency(self) -> float:
        n = np.sum(self.contingency_table)
        ex = np.zeros_like(self.contingency_table, dtype=float)

        for i in range(len(self.contingency_table)):
            for j in range(len(self.contingency_table[i])):
                ex[i, j] = (np.sum(self.contingency_table[i, :]) * np.sum(self.contingency_table[:, j])) / n
        return np.sum(((self.contingency_table - ex) ** 2) / ex)

    def _cramers_v(self) -> float:
        return np.sqrt(self.chi2_contingency / (self.len * (min(self.contingency_table.shape) - 1)))

    def _phi_coefficient(self) -> float:
        if self.contingency_table.shape != (2, 2):
            return "Phi coefficient is only for binarry data"
        else:
          return (self.contingency_table[0, 0] * self.contingency_table[1, 1] - \
                  self.contingency_table[0, 1] * self.contingency_table[1, 0]) / \
                 np.sqrt((self.contingency_table[0, :] * self.contingency_table[:, 1]).sum() * \
                         (self.contingency_table[0, :] * self.contingency_table[:, 0]).sum())

    def _point_biserial(self):
      try:
          y1 = self.y[self.x == 1]
          y0 = self.y[self.x == 0]
          mean_y1 = np.mean(y1)
          mean_y0 = np.mean(y0)
          s_y = np.std(self.y, ddof=1)
          n1 = len(y1)
          n0 = len(y0)
          return ((mean_y1 - mean_y0) / s_y) * np.sqrt((n1 * n0) / self.len**2)
      except ZeroDivisionError:
          return 0

    def _kendall_tau(self):
      try:
        c = 0
        d = 0

        for i in range( - 1):
            for j in range(i + 1, self.len):
                if (self.x[i] - self.x[j]) * (self.y[i] - self.y[j]) > 0:
                    c += 1
                elif (self.x[i] - self.x[j]) * (self.y[i] - self.y[j]) < 0:
                    d += 1
        return (c - d) / (0.5 * self.len * (self.len - 1))
      except ZeroDivisionError:
          return 0

    def _get_WoE(self) -> pd.DataFrame:
        df = pd.DataFrame({'x': self.x, 'y': self.y})

        is_binary = df['y'].isin([0, 1]).all()
        if not is_binary:
            return 'Target (y) must be binary'

        df = df.groupby('x')['y'].value_counts().unstack(fill_value=0)

        if 1 in df.columns and 0 in df.columns:
            df = df.rename(columns={0: 'No', 1: 'Yes'})
        else:
            return ("The binary target values 0 and 1 are not correctly represented in the DataFrame.")

        df['Persentage event'] = df['Yes'] / df['Yes'].sum()
        df['Persentage non event'] = df['No'] / df['No'].sum()

        df['WOE'] = np.log(df['Persentage event'] / df['Persentage non event'].replace(0, np.nan))
        df['WOE'] = df['WOE'].replace(-np.inf, 0)
        df = df.sort_values('WOE', )
        df["rank"] = df['WOE'].rank()

        return df

    def _view_WoE(self) -> go.Figure:
        if not isinstance(self.WoE, str):
            x = self._x_classes_[self.WoE.index]
            count_category = self.WoE.shape[0]
            fig = go.Figure()
            fig.add_trace(go.Bar(
                x=x,
                y=self.WoE['WOE'],
                name='WOE'
            ))

            fig.add_trace(go.Scatter(
                  x=x,
                  y=self.WoE['WOE'],
                  name='Woet of Evedence',
                  mode='lines+markers',
                  marker=dict(color='red')
              ))
            if count_category <= 10:
              fig.add_trace(go.Scatter(
                  x=x,
                  y=self.WoE['rank'],
                  name='Rank of Evedence',
                  marker=dict(color='green')
              ))
            return fig
        else:
            return self.WoE


    def __str__(self) -> str:
        text = f"""
        Correlation: {self.correlation}
        Pearson correlation: {self.pearson_correlation}
        Spearman correlation: {self.spearman_correlation}
        Phi coefficient: {self.phi_coefficient}
        Cramer's V: {self.cramers_v}
        """
        return text

    def __repr__(self) -> str:
        return self.__str__()

    def __len__(self) -> int:
        return self.len

    def __getitem__(self, key) -> float:
        if key == "mean_x":
            return self.mean_x
        elif key == "mean_y":
            return self.mean_y
        elif key == "correlation":
            return self.correlation
        elif key == "pearson_correlation":
            return self.pearson_correlation
        elif key == "spearman_correlation":
            return self.spearman_correlation
        elif key == "phi_coefficient":
            return self.phi_coefficient
        elif key == "cramers_v":
            return self.cramers_v
        elif key == "contingency_table":
            return self.contingency_table
        elif key == "chi2_contingency":
            return self.chi2_contingency
        elif key == "point_biserial":
            return self.point_biserial
        elif key == "kendall_tau":
            return self.kendall_tau
        elif key == "WoE":
            return self.WoE
        elif key == "view_WoE":
            return self.view_WoE
        elif key == "x":
            return self.x
        elif key == "y":
            return self.y
        elif key == "variance_x":
            return self.variance_x
        elif key == "variance_y":
            return self.variance_y
        elif key == "std_x":
            return self.std_x
        elif key == "std_y":
            return self.std_y
        elif key == "covariance":
            return self.covariance
        elif key == "len":
            return self.len
        else:
            raise ValueError(f"Invalid key: {key}")

In [11]:
# Weight-of-Evidence table with Pclass as the feature (x) and Survived as the target (y).
Correlation(df['Pclass'], df['Survived'])['WoE']

y,No,Yes,Persentage event,Persentage non event,WOE,rank
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,372,119,0.347953,0.677596,-0.666483,1.0
1,97,87,0.254386,0.176685,0.364485,2.0
0,80,136,0.397661,0.145719,1.003916,3.0


In [12]:
# Plotly chart of the Pclass WoE values; this builds a new Correlation object for the same two columns.
Correlation(df['Pclass'], df['Survived'])['view_WoE']

In [13]:
# Weight-of-Evidence table with Ticket as the feature (x) and Survived as the target (y).
Correlation(df['Ticket'], df['Survived'])['WoE']

y,No,Yes,Persentage event,Persentage non event,WOE,rank
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
574,3,1,0.002924,0.005464,-0.625325,1.0
469,2,1,0.002924,0.003643,-0.219859,2.0
429,1,0,0.000000,0.001821,0.000000,213.0
428,1,0,0.000000,0.001821,0.000000,213.0
427,1,0,0.000000,0.001821,0.000000,213.0
...,...,...,...,...,...,...
663,0,1,0.002924,0.000000,,
669,0,1,0.002924,0.000000,,
670,0,1,0.002924,0.000000,,
672,0,1,0.002924,0.000000,,


In [14]:
# Plotly chart of the Ticket WoE values; the rank trace is only drawn for at most 10 categories.
Correlation(df['Ticket'], df['Survived'])['view_WoE']