In [26]:
import pandas as pd
import numpy as np
from bokeh.plotting import figure, output_file, save, show
import unittest
from sqlalchemy import create_engine, MetaData, Table, Column, Float
from scipy.optimize import curve_fit
from bokeh.models import ColumnDataSource
from bokeh.models import LinearColorMapper
from bokeh.layouts import column,row,gridplot
import warnings
warnings.simplefilter(action="ignore")

In [27]:
class DataProcessor:
    """
    A class to process training, ideal function, and test data.

    Attributes:
        training_data (pd.DataFrame): Training dataset.
        ideal_functions (pd.DataFrame): Ideal functions dataset.
        test_data (pd.DataFrame): Test dataset.
        engine (sqlalchemy.engine.Engine): SQLite database engine.
    """

    def __init__(self, training_file, ideal_functions_file, test_file, database_file):
        """
        Initializes DataProcessor with file paths and sets up data structures.

        Args:
            training_file (str): Path to the training dataset CSV file.
            ideal_functions_file (str): Path to the ideal functions CSV file.
            test_file (str): Path to the test dataset CSV file.
            database_file (str): Path to the SQLite database file.
        """
        self.training_data = pd.read_csv(training_file)
        self.ideal_functions = pd.read_csv(ideal_functions_file)
        self.test_data = pd.read_csv(test_file)
        self.database_file = database_file  # Added this line to initialize the attribute
        self.engine = None  # Initialize engine to None

    def setup_database(self):
        """
        Sets up an SQLite database and loads training and ideal functions data.
        """
        try:
            # Create an SQLite database engine
            self.engine = create_engine(f'sqlite:///{self.database_file}')

            # Create metadata
            metadata = MetaData()

            # Define training_data_table schema
            training_data_table = Table('training_data_table', metadata,
                                       Column('X', Float),
                                       Column('Y', Float),
                                       # Add other columns as needed
                                       )

            # Define ideal_functions_table schema
            ideal_functions_table = Table('ideal_functions_table', metadata,
                                         Column('X', Float),
                                         # Add other columns as needed
                                         )

            # Create tables in the database
            metadata.create_all(self.engine)

            # Load training data into the database
            self.training_data.to_sql('training_data_table', self.engine, index=False, if_exists='replace')

            # Load ideal functions data into the database
            self.ideal_functions.to_sql('ideal_functions_table', self.engine, index=False, if_exists='replace')

        except Exception as e:
            raise DataProcessorException(f"Error setting up the database: {e}")

    def least_squares_fit(self):
        """
        Finds the best-fit functions based on the least-squares criterion using the training data.
        """
        try:
            # Extract x and y values from training data
            x_train = self.training_data['X'].values
            y_train = self.training_data['Y'].values

            # Function to fit (you may need to adjust this based on your ideal function representation)
            def ideal_function(x, *params):
                # Example: Linear function (you can replace this with your actual ideal function)
                return params[0] * x + params[1]

            # Iterate over ideal functions and find parameters that minimize squared deviations
            best_fit_params = []
            for _, ideal_row in self.ideal_functions.iterrows():
                x_ideal = ideal_row['X']
                initial_params = [1.0, 0.0]  # Initial guess for parameters

                try:
                    # Use curve_fit from scipy.optimize to find parameters
                    params, _ = curve_fit(ideal_function, x_train, y_train, p0=initial_params)

                    # Save the best-fit parameters for each ideal function
                    best_fit_params.append(params)

                except Exception as e:
                    print(f"Error fitting ideal function: {e}")

            # Save best-fit parameters in the DataProcessor instance for later use
            self.best_fit_params = best_fit_params

        except Exception as e:
            raise DataProcessorException(f"Error during least-squares fit: {e}")

    def map_test_data(self):
        """
        Maps test data to ideal functions based on mapping criteria.
        """
        try:
            # Extract x and y values from test data
            x_test = self.test_data['X'].values
            y_test = self.test_data['Y'].values

            # Mapping criterion: maximum allowed deviation factor
            max_deviation_factor = np.sqrt(2)

            # Initialize results dataframe
            results = pd.DataFrame(columns=['X', 'Y', 'Ideal_Function', 'Deviation'])

            # Iterate over test data and find the best-matching ideal function
            for x, y in zip(x_test, y_test):
                deviations = []

                # Calculate deviation for each ideal function
                for i, params in enumerate(self.best_fit_params):
                    # Example: Linear function (replace with your actual ideal function)
                    ideal_y = params[0] * x + params[1]
                    deviation = abs(y - ideal_y)
                    deviations.append(deviation)

                # Check if deviations list is not empty
                if deviations:
                    # Find the index of the ideal function with the minimum deviation
                    best_fit_index = np.argmin(deviations)

                    # Check if the deviation is within the allowed factor
                    if deviations[best_fit_index] <= max_deviation_factor * np.max(deviations):
                        results = results.append({'X': x,
                                                  'Y': y,
                                                  'Ideal_Function': best_fit_index,
                                                  'Deviation': deviations[best_fit_index]}, ignore_index=True)

            # Save results in the DataProcessor instance for later use
            self.results = results

        except Exception as e:
            raise DataProcessorException(f"Error mapping test data: {e}")

    def visualize_data(self):
        """
        Visualizes training data, ideal functions, and test data.
        """
        try:
            # Extract x and y values from training, ideal functions, and test data
            x_train = self.training_data['X'].values
            y_train = self.training_data['Y'].values

            x_ideal = self.ideal_functions['X'].values
            # Example: Linear function (replace with your actual ideal function representation)
            y_ideal = [params[0] * x + params[1] for x, params in zip(x_ideal, self.best_fit_params)]

            x_test = self.test_data['X'].values
            y_test = self.test_data['Y'].values

            # Plotting training data
            source_train = ColumnDataSource(data=dict(x=x_train, y=y_train))
            plot_train = figure(title="Training Data", tools="pan,box_zoom,reset", width=400, height=400)
            plot_train.circle('x', 'y', source=source_train, size=8, color="navy", alpha=0.6)

            # Plotting ideal functions
            source_ideal = ColumnDataSource(data=dict(x=x_ideal, y=y_ideal))
            plot_ideal = figure(title="Ideal Functions", tools="pan,box_zoom,reset", width=400, height=400)
            plot_ideal.line('x', 'y', source=source_ideal, line_width=2, line_color="red")

            # Plotting test data with assigned ideal functions and deviations
            source_test = ColumnDataSource(data=dict(x=x_test, y=y_test, ideal=self.results['Ideal_Function'],
                                                     deviation=self.results['Deviation']))
            plot_test = figure(title="Test Data", tools="pan,box_zoom,reset", width=400, height=400)
            plot_test.circle('x', 'y', source=source_test, size=8, color="green", alpha=0.6)

            # Add details to the test plot, such as color-coding based on assigned ideal function
            color_mapper = LinearColorMapper(palette="Viridis256", low=0, high=self.ideal_functions.shape[0])
            plot_test.circle('x', 'y', source=source_test, size=8, color={'field': 'ideal', 'transform': color_mapper},
                             legend_field="ideal")

            # Create a grid layout for the plots
            grid = gridplot([[plot_train, plot_ideal], [None, plot_test]])

            # Output the visualization to an HTML file
            output_file("output/visualization.html")
            show(grid)

        except Exception as e:
            raise DataProcessorException(f"Error during data visualization: {e}")

In [28]:
class DataProcessorException(Exception):
    """Custom exception for DataProcessor errors."""
    pass

In [29]:
class TestDataProcessor(unittest.TestCase):
    """
    Unit tests for DataProcessor class methods.
    """

    def test_setup_database(self):
        """
        Test the setup_database method.
        """
        # Create temporary file paths for testing
        test_database_file = "test_database.db"
        test_training_file = "test_training_data.csv"
        test_ideal_functions_file = "test_ideal_functions.csv"

        try:
            # Create instance of DataProcessor for testing
            test_processor = DataProcessor(test_training_file, test_ideal_functions_file, "test_test_data.csv", test_database_file)

            # Call setup_database method
            test_processor.setup_database()

            # Check if the database file is created
            self.assertTrue(os.path.isfile(test_database_file))

            # Check if training data is loaded into the database
            loaded_training_data = pd.read_sql("SELECT * FROM training_data_table", test_processor.engine)
            self.assertTrue(loaded_training_data.equals(test_processor.training_data))

            # Check if ideal functions data is loaded into the database
            loaded_ideal_functions = pd.read_sql("SELECT * FROM ideal_functions_table", test_processor.engine)
            self.assertTrue(loaded_ideal_functions.equals(test_processor.ideal_functions))

        except Exception as e:
            self.fail(f"Test failed: {e}")

        finally:
            # Clean up temporary files
            os.remove(test_database_file)
            os.remove(test_training_file)
            os.remove(test_ideal_functions_file)
        pass

    def test_least_squares_fit(self):
        """
        Test the least_squares_fit method.
        """
        try:
            # Create instance of DataProcessor for testing
            test_processor = DataProcessor("test_training_data.csv", "test_ideal_functions.csv", "test_test_data.csv", "test_database.db")

            # Setup database for testing
            test_processor.setup_database()

            # Call least_squares_fit method
            test_processor.least_squares_fit()

            # Check if best_fit_params is calculated and has the correct length
            self.assertTrue(hasattr(test_processor, 'best_fit_params'))
            self.assertEqual(len(test_processor.best_fit_params), len(test_processor.ideal_functions))

            # Check if best_fit_params contains NumPy arrays
            for params in test_processor.best_fit_params:
                self.assertTrue(isinstance(params, np.ndarray))

        except Exception as e:
            self.fail(f"Test failed: {e}")
        pass

    def test_map_test_data(self):
        """
        Test the map_test_data method.
        """
        try:
            # Create instance of DataProcessor for testing
            test_processor = DataProcessor("test_training_data.csv", "test_ideal_functions.csv", "test_test_data.csv", "test_database.db")

            # Setup database and perform least-squares fit for testing
            test_processor.setup_database()
            test_processor.least_squares_fit()

            # Call map_test_data method
            test_processor.map_test_data()

            # Check if results is calculated and has the correct columns
            self.assertTrue(hasattr(test_processor, 'results'))
            self.assertSetEqual(set(test_processor.results.columns), {'X', 'Y', 'Ideal_Function', 'Deviation'})

            # Check if results dataframe is not empty
            self.assertFalse(test_processor.results.empty)

            # Check if Ideal_Function column values are within the expected range
            self.assertTrue((0 <= test_processor.results['Ideal_Function']).all())
            self.assertTrue((test_processor.results['Ideal_Function'] < len(test_processor.best_fit_params)).all())

        except Exception as e:
            self.fail(f"Test failed: {e}")
        pass

    def test_visualize_data(self):
        """
        Test the visualize_data method.
        """
        try:
            # Create instance of DataProcessor for testing
            test_processor = DataProcessor("test_training_data.csv", "test_ideal_functions.csv", "test_test_data.csv", "test_database.db")

            # Setup database, perform least-squares fit, and map test data for testing
            test_processor.setup_database()
            test_processor.least_squares_fit()
            test_processor.map_test_data()

            # Call visualize_data method
            test_processor.visualize_data()

            # Check if the visualization HTML file is created
            self.assertTrue(os.path.isfile("output/visualization.html"))

        except Exception as e:
            self.fail(f"Test failed: {e}")

        finally:
            # Clean up temporary files
            os.remove("output/visualization.html")
        pass

In [30]:
if __name__ == "__main__":
    # Example usage
    processor = DataProcessor("data/train.csv", "data/ideal.csv", "data/test.csv", "output/your_database.db")
    processor.setup_database()
    processor.least_squares_fit()
    processor.map_test_data()
    processor.visualize_data()

    # Run unit tests
    unittest.main()

E
ERROR: C:\Users\home\AppData\Roaming\jupyter\runtime\kernel-19ee7d1b-4b23-4e6c-a989-2d261f00c8de (unittest.loader._FailedTest)
----------------------------------------------------------------------
AttributeError: module '__main__' has no attribute 'C:\Users\home\AppData\Roaming\jupyter\runtime\kernel-19ee7d1b-4b23-4e6c-a989-2d261f00c8de'

----------------------------------------------------------------------
Ran 1 test in 0.001s

FAILED (errors=1)


SystemExit: True