In [None]:
# Compilation
# uncomment the line below if you want to run this script directly
# !make

# uncomment the following lines to use the virtual environment
# !make venv
# !source venv/bin/activate

In [1]:
import os 
import numpy 
import matplotlib.pyplot as plt

import scientific_toolbox.stats as stats
from scientific_toolbox.stats import StatisticsAnalyzer, Dataset

In [2]:
# Start from an empty Dataset; rows are appended further down.
dataset = stats.Dataset()

# Column order shared by every record tuple below.
field_names = ("Height", "Weight", "Age", "Gender")
records = [
    (170.2, 65.4, 25, "Male"),
    (165.5, 54.1, 31, "Female"),
    (180.3, 72.0, 28, "Male"),
    (155.0, 48.5, 22, "Female"),
    (175.6, 70.2, 30, "Male"),
    (168.4, 62.0, 29, "Female"),
    (183.2, 85.3, 40, "Male"),
]

# One dict per record, keyed by column name in the order given above.
sample_data = [dict(zip(field_names, values)) for values in records]

# Append the rows to the dataset one at a time.
for row in sample_data:
    dataset.addRow(row)

print("Dataset created and rows added.")




Dataset created and rows added.


In [3]:
# Overview of the dataset: column names, row count, and each column's kind.
column_names = dataset.getColumnNames()
print("Column Names:", column_names)
print("Number of rows in dataset:", dataset.size())

print("\nColumn type validation:")
for col in column_names:
    kind = "numeric" if dataset.isNumeric(col) else "non-numeric"
    print(f"{col}: {kind}")

# Fetch each column after checking its kind. A RuntimeError raised by any
# getColumn call is reported below and ends the retrieval for this cell.
try:
    if dataset.isNumeric("Height"):
        heights = dataset.getColumn("Height")
        print("\nHeights:", heights)

    if dataset.isNumeric("Weight"):
        weights = dataset.getColumn("Weight")
        print("Weights:", weights)

    if dataset.isNumeric("Age"):
        ages = dataset.getColumn("Age")
        print("Ages:", ages)

    # "Gender" is fetched only when the dataset reports it as non-numeric.
    # In the recorded run this getColumn call raised RuntimeError for the
    # string column, which the handler below printed.
    # NOTE(review): a string-typed accessor is probably needed here; confirm
    # against the Dataset API.
    if not dataset.isNumeric("Gender"):
        genders = dataset.getColumn("Gender")
        print("Genders:", genders)

except RuntimeError as e:
    print(f"Error retrieving column: {e}")

Column Names: ['Gender', 'Age', 'Weight', 'Height']
Number of rows in dataset: 7

Column type validation:
Gender: non-numeric
Age: numeric
Weight: numeric
Height: numeric

Heights: [170.2, 165.5, 180.3, 155.0, 175.6, 168.4, 183.2]
Weights: [65.4, 54.1, 72.0, 48.5, 70.2, 62.0, 85.3]
Ages: [25.0, 31.0, 28.0, 22.0, 30.0, 29.0, 40.0]
Error retrieving column: No valid data of requested type found in column 'Gender'


In [14]:
# Create a StatisticalAnalyzer bound to the populated dataset; the cells that
# follow query it by column name.
# NOTE(review): the import cell pulls in `StatisticsAnalyzer`, while the class
# used here is `stats.StatisticalAnalyzer`; confirm which name the module
# actually exports.
analyzer = stats.StatisticalAnalyzer(dataset)
print("StatisticalAnalyzer created.")


StatisticalAnalyzer created.


In [15]:
# List every column, then keep only those the dataset reports as numeric.
# `columns` is fetched here so the cell works on a fresh kernel: it was
# previously read without being assigned in any visible cell, which raises
# NameError under Restart & Run All.
columns = dataset.getColumnNames()
numeric_columns = [col for col in columns if dataset.isNumeric(col)]
print(f"All columns: {columns}")
print(f"Numeric columns: {numeric_columns}")

All columns: ['Gender', 'Age', 'Weight', 'Height']
Numeric columns: ['Age', 'Weight', 'Height']


In [16]:
# Print mean, median, variance and standard deviation for each numeric column.
for col in numeric_columns:
    # Query the analyzer first, then emit the whole report in one go.
    avg = analyzer.mean(col)
    mid = analyzer.median(col)
    var = analyzer.variance(col)
    std = analyzer.standardDeviation(col)

    report_lines = [
        f"\n--- {col} ---",
        f"Mean: {avg:.3f}",
        f"Median: {mid:.3f}",
        f"Variance: {var:.3f}",
        f"Std Dev: {std:.3f}",
    ]
    print("\n".join(report_lines))



--- Age ---
Mean: 29.286
Median: 29.000
Variance: 27.347
Std Dev: 5.229

--- Weight ---
Mean: 65.357
Median: 65.400
Variance: 126.780
Std Dev: 11.260

--- Height ---
Mean: 171.171
Median: 170.200
Variance: 78.562
Std Dev: 8.864


In [7]:
# The analyzer methods take the column name itself, so name the column
# explicitly. Indexing into column_names is fragile: in the recorded run
# getColumnNames() did not return the columns in insertion order, so
# position 1 is not guaranteed to be "Age".
mean_age = analyzer.mean("Age")

In [9]:
# Frequency count for a string-valued column.
gender_frequency = analyzer.frequencyCountStr("Gender")
# Heading and result go out on separate lines via a single print call.
print("Frequency Count for Gender:", gender_frequency, sep="\n")


Frequency Count for Gender:
{'Female': 3, 'Male': 4}


In [17]:
# Frequency count for a numeric column.
age_frequency = analyzer.frequencyCount("Age")
# Heading and result go out on separate lines via a single print call.
print("Frequency Count for Age:", age_frequency, sep="\n")


Frequency Count for Age:
{30.0: 1, 22.0: 1, 28.0: 1, 40.0: 1, 29.0: 1, 31.0: 1, 25.0: 1}


In [12]:
# Pairwise correlations between the numeric columns. Per the recorded output,
# the matrix rows/columns follow the order of `numeric_columns`, so the heading
# is built from that list rather than a hard-coded order (the previous
# hard-coded heading listed the columns in reverse, mislabelling the entries).
corr_matrix = analyzer.correlationMatrix(numeric_columns)
print(f"Correlation Matrix ({', '.join(numeric_columns)}):")
print(corr_matrix)

Correlation Matrix (Height, Weight, Age):
[[1.         0.7591169  0.71922094]
 [0.7591169  1.         0.95574097]
 [0.71922094 0.95574097 1.        ]]


In [13]:
# Report the pairs of numeric columns whose correlation magnitude exceeds the
# threshold. In the recorded run the returned report already begins with its
# own "Strong Correlations (...)" heading, so no extra heading is printed here
# (the previous version printed two headings back to back).
correlation_threshold = 0.7
strong_corrs = analyzer.reportStrongCorrelations(numeric_columns, correlation_threshold)
print(strong_corrs)


Strong Correlations (|corr| > 0.7):
Strong Correlations (|correlation| > 0.7):
Age - Weight: 0.759117
Age - Height: 0.719221
Weight - Height: 0.955741

