In [1]:
import os

In [2]:
%pwd

'/Users/chaky/Desktop/data science/projects/end-to-end-machine-learning-project-with-MLflow/research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'/Users/chaky/Desktop/data science/projects/end-to-end-machine-learning-project-with-MLflow'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str
    alpha: float
    l1_ratio: float
    target_column: str

In [6]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.ElasticNet
        schema =  self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            train_data_path = config.train_data_path,
            test_data_path = config.test_data_path,
            model_name = config.model_name,
            alpha = params.alpha,
            l1_ratio = params.l1_ratio,
            target_column = schema.name
            
        )

        return model_trainer_config

In [8]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import joblib
import os

data=pd.read_csv("artifacts/data_transformation/train.csv")
data.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,2.858425,-0.568168,1.594954,-0.441955,-0.366751,-0.086042,-0.200807,1.924231,-1.642385,-1.025162,-1.360189,6.0
1,-0.488669,0.277559,-1.227135,1.052166,-0.145401,1.053901,-0.022746,-0.443831,-1.907876,-1.148597,-1.16893,5.0
2,-1.931382,-0.737314,1.184469,1.187995,-0.897992,1.243892,0.986263,-3.467338,1.211652,0.024038,3.03877,6.0
3,0.954044,-1.019223,0.927915,-0.034467,0.12022,0.388934,1.223677,1.279357,-1.045028,-1.395468,-0.97767,5.0
4,0.896335,-1.583041,0.979226,-0.577784,-0.543832,0.958906,-0.141453,-0.454402,-0.845909,1.073239,0.169884,6.0


In [9]:
data=pd.read_csv("artifacts/data_transformation/test.csv")
data.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,-0.488669,-0.229877,-0.354853,0.033448,5.43263,-0.751009,-0.972402,0.021324,-0.64679,1.505262,0.074254,
1,-1.008046,-0.060732,-1.175824,-0.238211,-0.388886,-0.751009,-0.942726,-0.612978,0.614296,-0.161114,-0.97767,
2,-0.546378,-0.00435,-0.765338,-0.441955,1.736078,-1.130991,-1.031756,0.137613,-0.314926,1.258392,-1.16893,
3,-0.604086,0.89776,0.260876,-0.238211,-0.388886,1.813863,1.698505,-0.232397,0.149685,-0.901726,-0.882041,
4,1.761963,0.390323,1.133158,0.101362,-0.034726,-0.561019,-0.17113,1.395645,-1.045028,-0.161114,0.074254,


In [10]:
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import joblib

# Define the feature matrix (X) and target variable (y)
X = data.drop('quality', axis=1)
y = data['quality']
y.head()


0    5.0
1    5.0
2    5.0
3    6.0
4    5.0
Name: quality, dtype: float64

In [None]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str
    alpha: float
    l1_ratio: float
    target_column: str

In [None]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [None]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.model_name  # Get parameters based on the model name
        schema = self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
        root_dir=config.root_dir,
        train_data_path=config.train_data_path,
        test_data_path=config.test_data_path,
        model_name=model_name,  # Use the provided model name
        target_column=schema.name,
        max_iter=params.get("max_iter", None),  # Get the max_iter parameter, default to None if not present
        C=params.get("C", None),  # Get the C parameter, default to None if not present
        criterion=params.get("criterion", None),  # Get the criterion parameter, default to None if not present
        max_depth=params.get("max_depth", None),  # Get the max_depth parameter, default to None if not present
        min_samples_split=params.get("min_samples_split", None),  # Get the min_samples_split parameter, default to None if not present
        n_estimators=params.get("n_estimators", None),  # Get the n_estimators parameter, default to None if not present
        learning_rate=params.get("learning_rate", None),  # Get the learning_rate parameter, default to None if not present
        )

        return model_trainer_config

In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import joblib
import os

In [None]:
train_data = pd.read_csv(self.config.train_data_path)


In [None]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    
    def train(self):
        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)


        train_x = train_data.drop([self.config.target_column], axis=1)
        test_x = test_data.drop([self.config.target_column], axis=1)
        train_y = train_data[[self.config.target_column]]
        test_y = test_data[[self.config.target_column]]


        lr = ElasticNet(alpha=self.config.alpha, l1_ratio=self.config.l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        joblib.dump(lr, os.path.join(self.config.root_dir, self.config.model_name))


# Load the dataset (assuming you have it in a DataFrame named 'data')
# Replace 'data.csv' with your actual dataset file path
# data = pd.read_csv('data.csv')

# Separate features (X) and target (y)
X = data.drop('quality', axis=1)
y = data['quality']

# Define a list of classifiers and their hyperparameters
classifiers = [
    ('Logistic Regression', LogisticRegression(max_iter=self.config.max_iter, C=self.config.C)),
    ('Decision Tree', DecisionTreeClassifier(criterion=self.config.criterion,max_depth=self.config.max_depth,min_samples_split=self.config.min_samples_split)),
    ('Random Forest', RandomForestClassifier(n_estimators=self.config.n_estimators,max_depth=self.config.max_depth,min_samples_split=self.config.min_samples_split)),
    ('XGBoost', XGBClassifier(n_estimators=self.config.n_estimators,max_depth=self.config.max_depth,learning_rate=self.config.learning_rate))
]

# Create a dictionary to store trained models and their cross-validation scores
trained_models = {}

# Loop through classifiers, create pipelines, and train models with cross-validation
for model_name, clf in classifiers:
    pipeline = Pipeline([
        ('classifier', clf)
    ])
    
    # Perform cross-validation and store the scores
    cross_val_scores = cross_val_score(pipeline, X, y, cv=5)  # 5-fold cross-validation
    
    # Calculate the mean cross-validation score
    mean_accuracy = cross_val_scores.mean()
    
    # Store the trained model and its mean cross-validation accuracy
    trained_models[clf_name] = {
        'model': pipeline,
        'mean_accuracy': mean_accuracy
    }

# You can access the trained models and their mean cross-validation accuracy using trained_models dictionary
for clf_name, model_info in trained_models.items():
    print(f"{clf_name}: Mean Cross-Validation Accuracy - {model_info['mean_accuracy']:.2f}")

# Now you have trained models with cross-validation for wine quality prediction.


In [None]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer_config = ModelTrainer(config=model_trainer_config)
    model_trainer_config.train()
except Exception as e:
    raise e