In [1]:
import numpy as np
import heapq
from math import sqrt
from collections import Counter
from scipy import stats
import matplotlib.pyplot as plt
import statistics
from dataAnalysis import dataAnalysis

In [2]:
class dataAnalysis:
    '''
    ===================================================================================
    DESCRIPTION: 
    ===================================================================================
    Print summary statistics for a 2-D dataset whose last column is read as the
    classification label of each row
    ===================================================================================
    '''

    def __init__(self, data, categories, classifications):
        '''
        ===================================================================================
        DESCRIPTION: 
        ===================================================================================
        Store the dataset and the names needed to report on it
        ===================================================================================
        PARAMETERS: 
        ===================================================================================
        data: 2-D array-like, one row per sample; the last column is compared against
              the classifications
        categories: attribute names; categories[i] is used as the name of column i
        classifications: label values reported by printLabelStats
        ===================================================================================
        '''
        # np.asarray leaves an ndarray untouched and lets nested lists work with
        # the column slicing (data[:, i]) used by the print methods.
        self.data = np.asarray(data)
        self.size = len(self.data)
        self.categories = categories
        self.classifications = classifications

    @staticmethod
    def _percentage(count, total):
        '''Return count as a percentage of total, rounded to 2 decimals (0.0 if total is 0).'''
        if total == 0:
            return 0.0
        return round(100 * count / total, 2)

    @staticmethod
    def _group_into_ranges(unique_values, counts, max_ranges):
        '''
        Merge sorted unique values into ranges of width (max - min) / max_ranges.

        A new range starts at the first value lying more than one width past the
        current range start. Returns a list of (range_start, range_end, count).
        '''
        range_width = (np.max(unique_values) - np.min(unique_values)) / max_ranges
        ranges = []
        range_start = unique_values[0]
        running_count = 0
        for value, count in zip(unique_values, counts):
            if value - range_start <= range_width:
                running_count += count
            else:
                ranges.append((range_start, range_start + range_width, running_count))
                range_start = value
                running_count = count
        # Flush the range that was still open when the values ran out.
        ranges.append((range_start, range_start + range_width, running_count))
        return ranges

    def printLabelStats(self):
        '''
        ===================================================================================
        DESCRIPTION: 
        ===================================================================================
        Print the row count and percentage of every classification in data. An empty
        dataset reports a count of 0 and a percentage of 0.0 for each classification
        ===================================================================================
        '''
        print("=====================================================")
        print("Classification Analysis:")
        print("=====================================================")
        print(f"Total: {self.size} (%100)")

        # Slice the label column once; an empty dataset may have no columns to slice.
        label_column = self.data[:, -1] if self.size else None

        for label in self.classifications:
            if label_column is None:
                count = 0
            else:
                count = np.count_nonzero(label_column == label)
            percentage = self._percentage(count, self.size)
            print(f"Value: {label}, Count: {count}, Percentage: %{percentage}")

    def printCategoryStats(self, max_ranges=10):
        '''
        ===================================================================================
        DESCRIPTION: 
        ===================================================================================
        Print statistics on every attribute in data, and display broader count ranges if necessary.
        A bar chart of each attribute against the label column is shown after its statistics
        ===================================================================================
        PARAMETERS: 
        ===================================================================================
        max_ranges: an attribute with more than max_ranges distinct values is reported
                    per value range instead of per individual value
        ===================================================================================
        '''
        data = self.data
        size = self.size
        print("=====================================================")
        print("Attribute Analysis:")
        print("=====================================================")

        # Nothing to summarise; also avoids dividing by a size of zero below.
        if size == 0:
            print("No data to analyze.")
            return

        labels = data[:, -1]
        for i, category in enumerate(self.categories):
            column = data[:, i].astype(float)

            avg = np.mean(column)
            med = np.median(column)
            mode = float(stats.mode(column, keepdims=True)[0][0])
            std_dev = np.std(column)

            unique_values, counts = np.unique(column, return_counts=True)

            if len(unique_values) > max_ranges:
                print("-----------------------------------------------------")
                print(f"{category.upper()} Analysis (Count Ranges):")
                print("-----------------------------------------------------")

                ranges = self._group_into_ranges(unique_values, counts, max_ranges)
                for range_start, range_end, count in ranges:
                    percentage = self._percentage(count, size)
                    print(f"Range: [{range_start} - {range_end}], Count: {count}, Percentage: %{percentage}")
            else:
                print("-----------------------------------------------------")
                print(f"{category.upper()} Analysis:")
                print("-----------------------------------------------------")
                for value, count in zip(unique_values, counts):
                    percentage = self._percentage(count, size)
                    print(f"Value: {value}, Count: {count}, Percentage: %{percentage}")

            print(f"\nMean: {avg}")
            print(f"Median: {med}")
            print(f"Mode: {mode}")
            print(f"Standard Deviation: {std_dev}")

            # NOTE(review): one bar is drawn per row, so rows that share a label
            # overlap at the same x position - confirm this is the intended chart.
            plt.figure(figsize=(4, 4))
            plt.bar(labels, column, edgecolor='black')
            plt.xlabel("classification")
            plt.ylabel(category)
            plt.title("Bar Chart of Numeric Data by Labels")
            plt.show()