<a href="https://colab.research.google.com/github/Saeidhoseinipour/100Data/blob/main/ModelTrainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, silhouette_score

class ModelTrainer:
    """
    Train and evaluate scikit-learn models on already-cleaned datasets.

    Every value in ``datasets`` is read as a mapping with two items:
    ``'data'`` (the table handed to scikit-learn; presumably a pandas
    DataFrame, since ``.drop(columns=...)`` and column indexing are used on
    it) and ``'type'`` (one of ``'quantitative'``, ``'qualitative'`` or
    ``'unsupervised'``), which selects the model family.
    """

    # Dataset types for which train_model has a model defined.
    _SUPPORTED_TYPES = ('quantitative', 'qualitative', 'unsupervised')

    def __init__(self, datasets):
        """
        Initialize the class with cleaned datasets.
        :param datasets: Dictionary of cleaned datasets from BioinformaticsDatasetCleaner.
        """
        self.datasets = datasets

    def _get_entry(self, dataset_name):
        """
        Return the stored entry for a dataset.
        :param dataset_name: Name of the dataset.
        :raises KeyError: If no dataset with that name is registered.
        """
        if dataset_name not in self.datasets:
            raise KeyError(f"Dataset {dataset_name} not found.")
        return self.datasets[dataset_name]

    def train_model(self, dataset_name, target_column=None, *, n_clusters=3, random_state=42):
        """
        Train a machine learning model based on the dataset type and print its metrics.

        ``'quantitative'`` data gets a random-forest regressor (mean squared
        error is printed), ``'qualitative'`` data gets a random-forest
        classifier (accuracy, classification report and confusion matrix are
        printed), and ``'unsupervised'`` data gets KMeans fitted on all rows
        (silhouette score is printed).

        :param dataset_name: Name of the dataset.
        :param target_column: Name of the target column. Required for
            'quantitative' and 'qualitative' data. Optional for 'unsupervised'
            data; when given, the column is excluded from the clustered features.
        :param n_clusters: Number of clusters for KMeans ('unsupervised' only).
        :param random_state: Seed used for the train/test split and for the
            estimator, so repeated calls give the same result.
        :return: Tuple ``(model, X_test, y_test)``. ``X_test`` is the held-out
            20% of the features; ``y_test`` is the matching slice of the
            target, or ``None`` when no target column was given.
        :raises KeyError: If the dataset name is unknown.
        :raises ValueError: If the dataset type has no model defined, or a
            supervised type is requested without ``target_column``.
        """
        entry = self._get_entry(dataset_name)
        dataset_type = entry['type']

        # Fail fast: validate the request before touching the data at all.
        if dataset_type not in self._SUPPORTED_TYPES:
            raise ValueError(f"No model defined for {dataset_type} data.")
        if dataset_type != 'unsupervised' and target_column is None:
            raise ValueError(f"target_column is required for {dataset_type} data.")

        dataset = entry['data']

        # Split data into features and target (clustering may have no target).
        if target_column is None:
            X = dataset
            y = None
        else:
            X = dataset.drop(columns=[target_column])
            y = dataset[target_column]

        # Split into training and testing sets. The same seed and test size are
        # used in both cases, so X_test is the same slice with or without y.
        if y is None:
            _, X_test = train_test_split(X, test_size=0.2, random_state=random_state)
            y_test = None
        else:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=random_state
            )

        if dataset_type == 'quantitative':
            # Regression model for quantitative data
            model = RandomForestRegressor(random_state=random_state)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            mse = mean_squared_error(y_test, y_pred)
            print(f"Mean Squared Error for {dataset_name}: {mse:.2f}")

        elif dataset_type == 'qualitative':
            # Classification model for qualitative data
            model = RandomForestClassifier(random_state=random_state)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            accuracy = accuracy_score(y_test, y_pred)
            print(f"Model Accuracy for {dataset_name}: {accuracy:.2f}")
            print("Classification Report:\n", classification_report(y_test, y_pred))
            print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

        else:
            # Clustering model for unsupervised data; fitted and scored on
            # every row, since there is no prediction target to hold out for.
            model = KMeans(n_clusters=n_clusters, random_state=random_state)
            model.fit(X)

            silhouette = silhouette_score(X, model.labels_)
            print(f"Silhouette Score for {dataset_name}: {silhouette:.2f}")

        return model, X_test, y_test

    def apply_pca(self, dataset_name, n_components=2):
        """
        Apply PCA for dimensionality reduction.

        PCA is fitted on the dataset's full ``'data'`` table as stored; no
        column is excluded here.

        :param dataset_name: Name of the dataset.
        :param n_components: Number of components for PCA.
        :return: The transformed data returned by ``PCA.fit_transform``.
        :raises KeyError: If the dataset name is unknown.
        """
        dataset = self._get_entry(dataset_name)['data']
        pca = PCA(n_components=n_components)
        transformed_data = pca.fit_transform(dataset)
        print(f"PCA applied to {dataset_name}. Transformed data shape: {transformed_data.shape}")
        return transformed_data