In [1]:
import os
import pandas as pd

In [2]:
%pwd

'd:\\software\\python_vs\\Bone_marrow_survival_prediction\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'd:\\software\\python_vs\\Bone_marrow_survival_prediction'

## Entity

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationconfig:
    root_dir : Path
    data_path : Path

## ConfigurationManager

In [6]:
from Bone_marrow_survival_prediction.constants import *
from Bone_marrow_survival_prediction.utils.comman import read_yaml, create_directories

class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH
        ):
    
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
    
        create_directories([self.config.artifacts_root])
    
    def get_data_transformation_config(self) -> DataTransformationconfig:
        config = self.config.data_transformation
        
        create_directories([config.root_dir])
        
        data_transformation_config = DataTransformationconfig(
            root_dir = config.root_dir,
            data_path = config.data_path
        )
        
        return data_transformation_config
    

## Coponents

In [10]:
pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn->imblearn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.13.0 imblearn-0.0 sklearn-compat-0.1.3
Note: you may need to restart the kernel to use updated packages.


In [24]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from imblearn.over_sampling import SMOTE
from collections import Counter
import logging


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


    
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config

    @staticmethod
    def is_string_numeric(val):
        try:
            float(val)
            return True
        except ValueError:
            return False

    def preprocess_data(self):
        data = pd.read_csv(self.config.data_path)

        if "Disease" in data.columns and data["Disease"].dtype == "O":
            encoder = OrdinalEncoder()
            data["Disease_encoded"] = encoder.fit_transform(data[["Disease"]])
            data.drop("Disease", axis=1, inplace=True)

        data.replace("?", np.nan, inplace=True)
        data.dropna(inplace=True)

        for feature in data.columns:
            if data[feature].dtype == "O":
                is_numeric = data[feature].apply(self.is_string_numeric)
                if is_numeric.all():  
                    data[feature] = data[feature].astype("float64")
                    logger.info(f"The {feature} feature is converted into float")

        return data

    def train_test_split(self):
        data = self.preprocess_data()  
         

        X = data.drop("survival_status",axis = 1)
        y = data["survival_status"]

        mi = mutual_info_classif(X, y)
        mi_df = pd.DataFrame({'Feature': X.columns, 'Mutual Information': mi})
        mi_df = mi_df.sort_values(by='Mutual Information', ascending=False)
        top_features = mi_df.head(5)['Feature']

        X = data[top_features]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        desired_percentage = 0.5

        current_counts = Counter(y_train)
        total_samples = len(y_train)
        minority_class = min(current_counts, key=current_counts.get)
        majority_class = max(current_counts, key=current_counts.get)

        desired_minority_count = int(total_samples * desired_percentage)
        minority_samples_needed = desired_minority_count - current_counts[minority_class]

        # Apply SMOTE to balance the dataset
        smote = SMOTE(sampling_strategy={minority_class: current_counts[minority_class] + minority_samples_needed})
        X_train, y_train = smote.fit_resample(X_train, y_train)

        X_train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        y_train.to_csv(os.path.join(self.config.root_dir, "y_train.csv"), index=False)
        X_test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)
        y_test.to_csv(os.path.join(self.config.root_dir, "y_test.csv"), index=False)

        logger.info("Splited data into train and test sets")


In [23]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(data_transformation_config)
    data_transformation.train_test_split()
except Exception as e:
    raise e
    
    

[ 2025-01-23 20:17:30,737: INFO: comman: yaml file: config\config.yaml loaded successfully]
[ 2025-01-23 20:17:30,738: INFO: comman: yaml file: params.yaml loaded successfully]
[ 2025-01-23 20:17:30,741: INFO: comman: yaml file: schema.yaml loaded successfully]
[ 2025-01-23 20:17:30,742: INFO: comman: created directory at: artifacts]
[ 2025-01-23 20:17:30,743: INFO: comman: created directory at: artifacts/data_validation]
[ 2025-01-23 20:17:30,749: INFO: 3893366604: The RecipientABO feature is converted into float]
[ 2025-01-23 20:17:30,750: INFO: 3893366604: The RecipientRh feature is converted into float]
[ 2025-01-23 20:17:30,750: INFO: 3893366604: The ABOmatch feature is converted into float]
[ 2025-01-23 20:17:30,750: INFO: 3893366604: The CMVstatus feature is converted into float]
[ 2025-01-23 20:17:30,756: INFO: 3893366604: The DonorCMV feature is converted into float]
[ 2025-01-23 20:17:30,757: INFO: 3893366604: The RecipientCMV feature is converted into float]
[ 2025-01-23 20: