In [125]:
import pandas as pd
import numpy as np

class DataSetStatistics:
    '''
    A class named as DataSetStatistics where the Statistical operations are created under this class.
    '''
    def __init__(self, data):
        ''' 
        Constructor (__init__)

        self.data = data.select_dtypes(include='number'): 
        This line filters the input data to keep number columns.

        Parameters:
        - 'select_dtype': The special function in pandas designed to filter the database on data type.
        - 'include='number'': The argument ensures that only columns with numerical data types such as float and integers are kept.

        self.length:
        Additionally, the number of rows, or the length of the filtered data, is computed and saved as the attribute self.length.
        This can be helpful for a number of procedures where knowing the size of the data is necessary.
        '''
        self.data = data.select_dtypes(include='number')
        self.length = len(self.data)

    def arithmetic_mean(self, column_name):
        '''
        Finding the mean for the given column.
        The working Structure of the first if statement:
        Extracts a specific column from a DataFrame,
        - Ensures that the data in that column is numerical,
        - Handles non-numeric values gracefully by converting them to NaN,
        - Cleans the data by removing rows with missing values,
        - Provides a reliable set of numerical data for subsequent operations.
        - Calculates the average of a set of numerical values.
        - Handles potential empty datasets or missing columns gracefully.
        - Returns either the calculated average, 0 (for empty datasets), or None (for missing columns),
                     providing clear indications of the outcome.
        '''
        if column_name in self.data.columns:
            column_data = self.data[column_name]
            numeric_data = pd.to_numeric(column_data, errors='coerce')
            valid_numeric_data = numeric_data.dropna()

            sum_val = 0
            count = 0
            for val in numeric_data:
                sum_val += val
                count += 1

            return sum_val / count if count > 0 else 0  
        else:
            return None 

    def geometric_mean(self, column_name):
        """
        Calculate the geometric mean of numeric values in a specified column.

        Parameters:
        - column_name (str): The name of the column in the DataFrame.

        Returns:
        - float: The geometric mean of the numeric values in the specified column.
          Returns 0 if the column is empty or contains non-numeric values.
          Returns None if the specified column is not present in the DataFrame.
        """
        if column_name in self.data.columns:
            column_data = self.data[column_name]
            numeric_data = pd.to_numeric(column_data, errors='coerce')
            valid_numeric_data = numeric_data.dropna()
            product = np.prod(numeric_data)
            return (product ** (1 / len(numeric_data))) if len(numeric_data) > 0 else 0  
        else:
            return None  

    def harmonic_mean(self, column_name):
        """
        Calculate the harmonic mean of numeric values in a specified column.

        Parameters:
        - column_name (str): The name of the column in the DataFrame.

        Returns:
        - float: The harmonic mean of the numeric values in the specified column.
          Returns 0 if the column contains 0 or non-numeric values.
          Returns None if the specified column is not present in the DataFrame.
        """
        if column_name in self.data.columns:
            column_data = self.data[column_name]
            numeric_data = pd.to_numeric(column_data, errors='coerce')
            valid_numeric_data = numeric_data.dropna()

            if 0 in numeric_data.values:
                return 0  
            return len(numeric_data) / sum(1 / num for num in numeric_data) if len(numeric_data) > 0 else 0
        else:
            return None

    def median(self, column_name):
        """
        Calculate the median of values in a specified column.

        Parameters:
        - column_name (str): The name of the column in the DataFrame.

        Returns:
        - float: The median value of the numeric values in the specified column.
          Returns None if the specified column is not present in the DataFrame.
        """
        if column_name in self.data.columns:
            column_data = sorted(self.data[column_name])
            n = len(column_data)
            if n % 2 == 0:
                median_value = (column_data[n // 2 - 1] + column_data[n // 2]) / 2
            else:
                median_value = column_data[n // 2]
            return median_value
        else:
            return None 

    def variance(self, column_name):
        """
        Calculate the variance of numeric values in a specified column.

        Parameters:
        - column_name (str): The name of the column in the DataFrame.

        Returns:
        - float: The variance of the numeric values in the specified column.
          Returns 0 if the column is empty or contains non-numeric values.
          Returns None if the specified column is not present in the DataFrame.
        """
        if column_name in self.data.columns:
            column_data = self.data[column_name]
            numeric_data = pd.to_numeric(column_data, errors='coerce')
            valid_numeric_data = numeric_data.dropna()

            mean = self.arithmetic_mean(column_name)  
            sum_squared_diff = sum((x - mean) ** 2 for x in numeric_data)  

            return sum_squared_diff / len(numeric_data) if len(numeric_data) > 0 else 0  
        else:
            return None

    def mode(self, column_name):
        """
        Find the mode of values in a specified column.

        Parameters:
        - column_name (str): The name of the column in the DataFrame.

        Returns:
        - float or None: The mode value of the specified column. If there are multiple modes,
          the method returns the first encountered. Returns None if the specified column is
          not present in the DataFrame.
        """
        freq = {}
        for num in self.data[column_name]:
            if num not in freq:
                freq[num] = 1
            else:
                freq[num] += 1
        mode_value = max(freq, key=freq.get)
        return mode_value

    def standard_deviation(self, column_name):
        """
        Calculate the standard deviation of numeric values in a specified column.

        Parameters:
        - column_name (str): The name of the column in the DataFrame.

        Returns:
        - float: The standard deviation of the numeric values in the specified column.
          Returns 0 if the column is empty or contains non-numeric values.
          Returns None if the specified column is not present in the DataFrame.
        """
        return self.variance(column_name) ** 0.5

    def max_value(self, column_name):
        """
        Find the maximum value in a specified column.

        Parameters:
        - column_name (str): The name of the column in the DataFrame.

        Returns:
        - float or None: The maximum value in the specified column.
          Returns None if the specified column is not present in the DataFrame.
        """
        return max(self.data[column_name])

    def min_value(self, column_name):
        """
        Find the minimum value in a specified column.

        Parameters:
        - column_name (str): The name of the column in the DataFrame.

        Returns:
        - float or None: The minimum value in the specified column.
          Returns None if the specified column is not present in the DataFrame.
        """
        return min(self.data[column_name])

    def range_value(self, column_name):
        """
        Calculate the range of values in a specified column.

        Parameters:
        - column_name (str): The name of the column in the DataFrame.

        Returns:
        - float or None: The range of values in the specified column (max - min).
          Returns None if the specified column is not present in the DataFrame.
        """
        max_value = max(self.data[column_name])
        min_value = min(self.data[column_name])

        range_value = max_value - min_value
        return range_value

    def iqr(self, column_name):
        """
        Calculate the Interquartile Range (IQR) of values in a specified column.

        Parameters:
        - column_name (str): The name of the column in the DataFrame.

        Returns:
        - float or None: The Interquartile Range (IQR) of values in the specified column.
          Returns None if the specified column is not present in the DataFrame.
        """
        values = list(self.data[column_name])
        n = len(values)
        for i in range(n):
            for j in range(0, n - i - 1):
                if values[j] > values[j + 1]:
                    values[j], values[j + 1] = values[j + 1], values[j]

        q1_index = n // 4
        q3_index = n * 3 // 4

        q1 = values[q1_index]
        q3 = values[q3_index]

        return q3 - q1

    def quartiles(self, column_name):
        """
        Calculate the Interquartile Range (IQR) of values in a specified column.

        Parameters:
        - column_name (str): The name of the column in the DataFrame.

        Returns:
        - float or None: The Interquartile Range (IQR) of values in the specified column.
          Returns None if the specified column is not present in the DataFrame.
        """
        column = self.data[column_name]
        q1_index = len(column) // 4
        q2_index = len(column) // 2
        q3_index = len(column) * 3 // 4

        q1 = column[q1_index]
        q2 = column[q2_index]
        q3 = column[q3_index]

        return q1, q2, q3

    def coefficient_of_range(self, column_name):
        """
        Calculate the quartiles (Q1, Q2, Q3) of values in a specified column.

        Parameters:
        - column_name (str): The name of the column in the DataFrame.

        Returns:
        - tuple or None: A tuple containing the first quartile (Q1), the second quartile (Q2 or median),
          and the third quartile (Q3) of values in the specified column. Returns None if the specified
          column is not present in the DataFrame.
        """
        max_value = max(self.data[column_name])
        min_value = min(self.data[column_name])
        range_value = max_value - min_value
        coefficient_of_range_value = range_value / (max_value + min_value)
        return coefficient_of_range_value

    def coefficient_of_variation(self, column_name):
        """
        Calculate the Coefficient of Variation of values in a specified column.

        Parameters:
        - column_name (str): The name of the column in the DataFrame.

        Returns:
        - float or None: The Coefficient of Variation of values in the specified column.
          Returns None if the specified column is not present in the DataFrame.
        """
        return self.standard_deviation(column_name) / self.arithmetic_mean(column_name)

    def coefficient_of_standard_deviation(self, column_name):
        """
        Calculate the Coefficient of Standard Deviation of values in a specified column.

        Parameters:
        - column_name (str): The name of the column in the DataFrame.

        Returns:
        - float or None: The Coefficient of Standard Deviation of values in the specified column.
          Returns None if the specified column is not present in the DataFrame.
        """
        mean = sum(self.data[column_name]) / len(self.data[column_name])
        std_dev = (sum((x - mean) ** 2 for x in self.data[column_name]) / len(self.data[column_name])) ** 0.5
        max_value = max(self.data[column_name])
        min_value = min(self.data[column_name])
        range_value = max_value - min_value
        coefficient_of_std_dev_value = std_dev / range_value
        return coefficient_of_std_dev_value
    
    def covariance(self, column_name1, column_name2):
        '''
        Calculates the correlation between two columns in the DataFrame.

        Parameters:
        - column_name1 (str): The name of the first column.
        - column_name2 (str): The name of the second column.

        Returns:
        - float: The correlation coefficient between the two columns.
        '''
        mean_x = self.data[column_name1].mean()
        mean_y = self.data[column_name2].mean()

        covariation = ((self.data[column_name1] - mean_x) * (self.data[column_name2] - mean_y)).sum() / (len(self.data) - 1)
        return covariation
    
    def correlation(self, column_name1, column_name2):
        '''
        Calculate the correlation coefficient between two columns in the DataFrame.

    Parameters:
    - column_name1 (str): Name of the first column.
    - column_name2 (str): Name of the second column.

    Returns:
    float: The correlation coefficient between the specified columns.

    This method calculates the correlation coefficient between two columns using
    the formula:

    correlation = numerator / (denominator1**0.5 * denominator2**0.5)

    where:
    - numerator is the sum of the product of the differences between each value 
      and the mean of the respective columns,
    - denominator1 is the sum of the squared differences between each value 
      and the mean of the first column,
    - denominator2 is the sum of the squared differences between each value 
      and the mean of the second column.

    Note:
    The method assumes that the lengths of the two columns are the same.
        '''
        
        mean1 = self.data[column_name1].mean()
        mean2 = self.data[column_name2].mean()
    
        numerator = sum((self.data[column_name1][i] - mean1) * (self.data[column_name2][i] - mean2) for i in range(len(self.data)))
        denominator1 = sum((self.data[column_name1][i] - mean1)**2 for i in range(len(self.data)))
        denominator2 = sum((self.data[column_name2][i] - mean2)**2 for i in range(len(self.data)))
    
        correlation = numerator / (denominator1**0.5 * denominator2**0.5)
    
        return correlation


In [126]:
data = pd.read_csv(r'G:\Iris.csv')
statistics = DataSetStatistics(data)

statistics.arithmetic_mean('SepalWidthCm')

3.0540000000000007

In [127]:
statistics.geometric_mean('SepalWidthCm')

3.0235822036025914

In [128]:
statistics.harmonic_mean('SepalWidthCm')

2.9931367940540596

In [129]:
statistics.median('SepalWidthCm')

3.0

In [130]:
statistics.variance('SepalWidthCm')

0.1867506666666667

In [131]:
statistics.mode('SepalWidthCm')

3.0

In [132]:
statistics.standard_deviation('SepalWidthCm')

0.4321465800705435

In [133]:
statistics.max_value('SepalWidthCm')

4.4

In [134]:
statistics.min_value('SepalWidthCm')

2.0

In [135]:
statistics.range_value('SepalWidthCm')

2.4000000000000004

In [136]:
statistics.iqr('SepalWidthCm')

0.5

In [137]:
statistics.quartiles('SepalWidthCm')

(3.1, 3.0, 3.0)

In [138]:
statistics.coefficient_of_range('SepalWidthCm')

0.37500000000000006

In [139]:
statistics.coefficient_of_variation('SepalWidthCm')

0.14150182713508297

In [140]:
statistics.coefficient_of_standard_deviation('SepalWidthCm')

0.18006107502939311

In [141]:
statistics.covariance("SepalWidthCm", "PetalLengthCm")


-0.3217127516778523

In [142]:
statistics.correlation("SepalWidthCm", "PetalLengthCm")

-0.4205160964011544