In [1]:
# Compilation
# uncomment the line below if you want to run this script directly
# !make

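# uncomment the following lines to use the virtual environment
# NOTE(review): each `!` command runs in its own subshell, so `!source venv/bin/activate`
# presumably will not change the environment of the running kernel — confirm, and
# consider activating the venv before launching Jupyter instead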
# !make venv
# !source venv/bin/activate

In [2]:
import os 
import numpy 
import matplotlib.pyplot as plt

import scientific_toolbox.stats as stats
from scientific_toolbox.stats import StatisticsAnalyzer, Dataset

In [None]:
# Start from an empty dataset; rows are appended one record at a time below.
dataset = stats.Dataset()

# Small in-memory table: one mapping of column name -> value per record.
sample_data = [
    dict(Height=170.2, Weight=65.4, Age=25, Gender="Male"),
    dict(Height=165.5, Weight=54.1, Age=31, Gender="Female"),
    dict(Height=180.3, Weight=72.0, Age=28, Gender="Male"),
    dict(Height=155.0, Weight=48.5, Age=22, Gender="Female"),
    dict(Height=175.6, Weight=70.2, Age=30, Gender="Male"),
    dict(Height=168.4, Weight=62.0, Age=29, Gender="Female"),
    dict(Height=183.2, Weight=85.3, Age=40, Gender="Male"),
]

# Feed every record into the dataset, preserving the order of the table above.
for row in sample_data:
    dataset.addRow(row)

print("Dataset created and rows added.")




In [None]:
# Ask the dataset for its schema (column names) and its row count.
column_names = dataset.getColumnNames()
print("Column Names:", column_names)
print("Number of rows in dataset:", dataset.size())

# Report, column by column, whether the dataset classifies it as numeric.
print("\nColumn type validation:")
for col in column_names:
    if dataset.isNumeric(col):
        kind = "numeric"
    else:
        kind = "non-numeric"
    print(f"{col}: {kind}")



In [None]:
# Create the analyzer object from the populated dataset; it is stored in
# `analyzer` for use by other cells.
# NOTE(review): the import cell brings in `StatisticsAnalyzer`, while this cell
# uses `stats.StatisticalAnalyzer` — confirm which name the module actually
# exports and make the two agree.
analyzer = stats.StatisticalAnalyzer(dataset)
print("StatisticalAnalyzer created.")


In [None]:
# Keep only the columns the dataset reports as numeric; the summary statistics
# below are requested for each of them in turn.
numeric_columns = [col for col in column_names if dataset.isNumeric(col)]

for col in numeric_columns:
    # Query all four statistics before producing any output for this column.
    mean_val = analyzer.mean(col)
    median_val = analyzer.median(col)
    var_val = analyzer.variance(col)
    std_val = analyzer.standardDeviation(col)

    # One report per column, each value rounded to three decimals.
    report_lines = (
        f"\n--- {col} ---",
        f"Mean: {mean_val:.3f}",
        f"Median: {median_val:.3f}",
        f"Variance: {var_val:.3f}",
        f"Std Dev: {std_val:.3f}",
    )
    print("\n".join(report_lines))


In [7]:
# The mean of a single column can be requested by passing its name directly.
# Use the literal name rather than a positional index into column_names: the
# index depends on the column order returned by the dataset and would not
# reliably select "Age".
mean_age = analyzer.mean("Age")

In [None]:
# Frequency count for a column holding strings.
gender_frequency = analyzer.frequencyCountStr("Gender")
# Header and result are emitted on separate lines by a single print call.
print("Frequency Count for Gender:", gender_frequency, sep="\n")


In [None]:
# Frequency count for a column holding numbers.
age_frequency = analyzer.frequencyCount("Age")
# Header and result are emitted on separate lines by a single print call.
print("Frequency Count for Age:", age_frequency, sep="\n")


In [None]:
# Correlation matrix over the numeric columns, as returned by the analyzer.
corr_matrix = analyzer.correlationMatrix(numeric_columns)
# Build the label from the columns actually passed in, so it always matches the
# order used for the matrix instead of a hard-coded list of names.
print(f"Correlation Matrix ({', '.join(map(str, numeric_columns))}):")
print(corr_matrix)

In [None]:
# Single source of truth for the threshold: it is used both in the query and in
# the printed label, so the two cannot drift apart.
CORRELATION_THRESHOLD = 0.7

# Ask the analyzer for the correlations it reports as strong at this threshold.
strong_corrs = analyzer.reportStrongCorrelations(numeric_columns, CORRELATION_THRESHOLD)
print(f"Strong Correlations (|corr| > {CORRELATION_THRESHOLD}):")
print(strong_corrs)
