In [1]:
import os
# Display the kernel's current working directory before any directory change is made.
%pwd

'/mnt/cb03386d-9344-47b1-82f9-868fbb64b4ae/python_projects/DNA_classifier/research'

In [2]:
# Step up one directory level; presumably this makes the relative paths used by
# the later cells (config files, artifacts/) resolve from the project root.
# NOTE(review): this cell is not idempotent — every re-run moves one level
# higher. Restart the kernel before running it again.
os.chdir("../")
# Display the working directory again to confirm the change.
%pwd

'/mnt/cb03386d-9344-47b1-82f9-868fbb64b4ae/python_projects/DNA_classifier'

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings consumed by the model-training stage.

    ``frozen=True`` only blocks re-assignment of the attributes themselves;
    mutable values stored in them (such as the ``params`` list) can still be
    modified in place.
    """

    # Directory where the training stage writes its outputs (the fitted model).
    # NOTE(review): annotated as Path, but no conversion to Path is visible
    # where this class is instantiated — confirm the actual runtime type.
    root_dir: Path
    # Location of the training CSV that ModelTrainer.train reads.
    train_data_path: Path
    # Location of the test data; not read by any code visible in this notebook.
    test_data_path: Path
    # File name (joined onto root_dir) under which the fitted model is saved.
    model_name: str
    # Values of the params YAML, one list entry per top-level key; used as the
    # entries of the grid-search parameter grid.
    params: list
    # Name of the label column in the training data.
    target_column: str

In [9]:
from dnaseq.constants import *
from dnaseq.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Read the project's YAML files and build per-stage configuration objects.

    Three YAML documents are loaded on construction (main config, parameters
    and schema) and the artifacts root directory named in the main config is
    created.
    """

    def __init__(
        self,
        config_file_path = CONFIG_FILE_PATH,
        params_file_path = PARAMS_FILE_PATH,
        schema_file_path = SCHEMA_FILE_PATH
    ):
        """Load the YAML files and create the artifacts root directory.

        Args:
            config_file_path: Path of the main configuration YAML.
            params_file_path: Path of the parameters YAML.
            schema_file_path: Path of the schema YAML.
        """
        # The files are read in the order config -> params -> schema.
        self.config, self.params, self.schema = (
            read_yaml(yaml_path)
            for yaml_path in (config_file_path, params_file_path, schema_file_path)
        )

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the settings for the model-training stage.

        The stage's root directory is created as a side effect.

        Returns:
            A ``ModelTrainerConfig`` populated from the ``model_trainer``
            section of the main config, the values of the params file and the
            name of the schema's target column.
        """
        trainer_section = self.config.model_trainer
        # Keep only the values of the params mapping; the top-level keys are dropped.
        param_values = [value for _key, value in self.params.items()]
        target_spec = self.schema.TARGET_COLUMN

        create_directories([trainer_section.root_dir])

        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            train_data_path=trainer_section.train_data_path,
            test_data_path=trainer_section.test_data_path,
            model_name=trainer_section.model_name,
            params=param_values,
            target_column=target_spec.name
        )

In [5]:
import os
import pandas as pd
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from dnaseq import logger
import joblib

class ModelTrainer:
    """Grid-search a CountVectorizer + classifier pipeline and save the best model.

    Two candidate classifiers are compared: ``LogisticRegression`` and
    ``MultinomialNB``. The first two entries of ``config.params`` are used as
    the parameter grids for those classifiers, in that order.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Create the estimators and assemble the grid-search parameter grid.

        Args:
            config: Training settings (data paths, output location, target
                column and the list of parameter grids).

        Raises:
            IndexError: If ``config.params`` has fewer than two entries.
        """
        self.cv = CountVectorizer()
        self.clf1 = LogisticRegression()
        self.clf2 = MultinomialNB()
        self.config = config

        classifiers = (self.clf1, self.clf2)
        if len(self.config.params) < len(classifiers):
            raise IndexError(
                f"expected at least {len(classifiers)} parameter grids in "
                f"config.params, got {len(self.config.params)}"
            )

        # Work on shallow copies so the estimator objects are not written into
        # the shared configuration that was handed to this trainer.
        self.param_grid = [dict(grid) for grid in self.config.params]
        # Tie each of the first two grids to its classifier; any further
        # grids are passed to the search unchanged.
        for grid, classifier in zip(self.param_grid, classifiers):
            grid['classifier'] = [classifier]

    def train(self):
        """Run the grid search on the training CSV and persist the best pipeline.

        The training file is read from ``config.train_data_path``. The column
        named by ``config.target_column`` is the label; the single remaining
        column is converted to unicode strings and fed to the
        ``CountVectorizer``. The best estimator is written with ``joblib`` to
        ``config.root_dir / config.model_name``, and the best parameters and
        score are printed.

        Raises:
            ValueError: If the training data does not contain exactly one
                feature column besides the target column.
        """
        train_data = pd.read_csv(self.config.train_data_path)

        features = train_data.drop(columns=[self.config.target_column])
        # The vectorizer needs one text document per sample. Flattening a
        # multi-column frame would yield more documents than labels.
        if features.shape[1] != 1:
            raise ValueError(
                "expected exactly one feature column besides "
                f"{self.config.target_column!r}, got {features.shape[1]}"
            )
        train_x = features.values.astype('U').ravel()
        # Series.ravel() is deprecated in recent pandas; to_numpy() returns
        # the same 1-D array.
        train_y = train_data[self.config.target_column].to_numpy()

        outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)

        # The classifier step is only a placeholder: every grid that carries a
        # 'classifier' entry replaces it during the search.
        pipeline = Pipeline(
            [
                ('preprocessor', self.cv),
                ('classifier', self.clf1)
            ]
        )

        grid = GridSearchCV(
            estimator=pipeline,
            param_grid=self.param_grid,
            scoring='accuracy',
            cv=outer_cv,
            n_jobs=1,
            verbose=3,
            return_train_score=True
        )

        grid.fit(train_x, train_y)

        print(grid.best_params_)
        print(grid.best_score_)

        joblib.dump(grid.best_estimator_, os.path.join(self.config.root_dir, self.config.model_name))

In [10]:
# Run the model-training stage end to end: load the configuration, build the
# trainer and fit it. Exceptions propagate unchanged, so no handler is needed
# (the previous catch-and-re-raise only added a frame to the traceback).
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2024-04-02 20:11:54,738: INFO: common: created directory at: artifacts]
[2024-04-02 20:11:54,740: INFO: common: created directory at: artifacts/model_trainer]
