In [1]:
import os
os.chdir("../")

In [2]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class TrainingConfig:
    root_dir: Path
    trained_model_path: Path
    updated_base_model_path: Path
    training_data: Path
    params_epochs: int
    params_batch_size: int
    params_is_augmentation: bool
    params_image_size: list
    mlflow_uri: str

In [3]:
from anidex.constants import *
from anidex.utils.common import read_yaml, create_directories
import tensorflow as tf

In [4]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])


    
    # def get_model_training_config(self) -> TrainingConfig:
    #     config = self.config.training
        
    #     create_directories([config.root_dir])

    #     model_training_config = TrainingConfig(
    #         root_dir=config.root_dir,
    #         training_data=config.training_data,
    #         trained_model_file_path=config.trained_model_file_path,
    #         trained_model_file_path_rent=config.trained_model_file_path_rent,
    #         mlflow_uri=config.mlflow_uri,
    #         model_params = self.params
 
    #     )

    #     return model_training_config
    
    def get_training_config(self) -> TrainingConfig:
        config = self.config
        training = self.config.training
        prepare_base_model = self.config.prepare_base_model
        params = self.params
        training_data = os.path.join(self.config.data_ingestion.unzip_dir, "animals")
        create_directories([
            Path(training.root_dir)
        ])

        training_config = TrainingConfig(
            root_dir=Path(training.root_dir),
            trained_model_path=Path(training.trained_model_path),
            updated_base_model_path=Path(prepare_base_model.updated_base_model_path),
            training_data=Path(training_data),
            params_epochs=params.EPOCHS,
            params_batch_size=params.BATCH_SIZE,
            params_is_augmentation=params.AUGMENTATION,
            params_image_size=params.IMAGE_SIZE,
            # mlflow_uri= training.mlflow_uri
        )

        return training_config

In [5]:
import os
import urllib.request as request
from zipfile import ZipFile
import tensorflow as tf
import time

In [6]:
# Tensorflow Libraries
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense, Dropout , BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers,models,Model
# from keras.preprocessing.image import ImageDataGenerator
# from tensorflow.keras.layers import preprocessing
from tensorflow.keras.callbacks import Callback, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')


print(tf.__version__)

2.16.1


In [8]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [9]:
import pandas as pd

In [10]:


class Training:
    def __init__(self, config: TrainingConfig):
        self.config = config
        
    def get_base_model(self):
        self.model = tf.keras.models.load_model(
            self.config.updated_base_model_path
        )

    def train_valid_generator(self):
        # Define the path to the main data folder
        data_dir = os.path.join(self.config.training_data, 'animals')

        # Get the list of class names (animal names)
        classes = os.listdir(data_dir)

        # Split data into train, validation, and test sets for each class
        train_data = []
        valid_data = []
        test_data = []

        for class_name in classes:
            class_dir = os.path.join(data_dir, class_name)
            images = os.listdir(class_dir)
            train_images, temp_images = train_test_split(images, train_size=0.70, shuffle=True, random_state=124)
            valid_images, test_images = train_test_split(temp_images, train_size=0.70, shuffle=True, random_state=124)

            train_data.extend([(os.path.join(class_dir, image), class_name) for image in train_images])
            valid_data.extend([(os.path.join(class_dir, image), class_name) for image in valid_images])
            test_data.extend([(os.path.join(class_dir, image), class_name) for image in test_images])

        # Convert data lists to DataFrames
        train_df = pd.DataFrame(train_data, columns=['imgpath', 'labels'])
        valid_df = pd.DataFrame(valid_data, columns=['imgpath', 'labels'])
        test_df = pd.DataFrame(test_data, columns=['imgpath', 'labels'])

        # Reset index for DataFrame consistency
        train_df = train_df.reset_index(drop=True)
        valid_df = valid_df.reset_index(drop=True)
        test_df = test_df.reset_index(drop=True)

        # Print information about splits
        print("----------Train-------------")
        print(train_df.head(5))
        print(train_df.shape)
        print("--------Validation----------")
        print(valid_df.head(5))
        print(valid_df.shape)
        print("----------Test--------------")
        print(test_df.head(5))
        print(test_df.shape)

        # Set up ImageDataGenerator for image augmentation and preprocessing
        generator = ImageDataGenerator(
            preprocessing_function=tf.keras.applications.efficientnet.preprocess_input,
            rotation_range=40 if self.config.params_is_augmentation else 0,
            horizontal_flip=True if self.config.params_is_augmentation else False,
            width_shift_range=0.2 if self.config.params_is_augmentation else 0,
            height_shift_range=0.2 if self.config.params_is_augmentation else 0,
            shear_range=0.2 if self.config.params_is_augmentation else 0,
            zoom_range=0.2 if self.config.params_is_augmentation else 0,
            rescale=1./255,
            validation_split=0.20
        )

        # Set up data generators for training, validation, and testing
        self.train_images = generator.flow_from_dataframe(
            dataframe=train_df,
            x_col='imgpath',
            y_col='labels',
            target_size=self.config.params_image_size[:-1],
            batch_size=self.config.params_batch_size,
            class_mode='categorical',
            shuffle=True,
            subset='training'
        )

        self.val_images = generator.flow_from_dataframe(
            dataframe=valid_df,
            x_col='imgpath',
            y_col='labels',
            target_size=self.config.params_image_size[:-1],
            batch_size=self.config.params_batch_size,
            class_mode='categorical',
            shuffle=False,
            subset='validation'
        )

        self.test_generator = generator.flow_from_dataframe(
            dataframe=test_df,
            x_col='imgpath',
            y_col='labels',
            target_size=self.config.params_image_size[:-1],
            batch_size=self.config.params_batch_size,
            class_mode='categorical',
            shuffle=False
        )

        

    
    @staticmethod
    def save_model(path: Path, model: tf.keras.Model):
        model.save(path)



    
    def train(self):
        # self.steps_per_epoch = self.train_generator.samples // (self.train_generator.batch_size*20)
        # self.validation_steps = self.valid_generator.samples // (self.valid_generator.batch_size*20)

        
        self.model.fit(
            self.train_images,
            steps_per_epoch=len(self.train_images),
            validation_data=self.val_images,
            validation_steps=len(self.val_images),
            epochs=10,
            callbacks=[
                EarlyStopping(monitor = "val_loss", # watch the val loss metric
                                    patience = 3,
                                    restore_best_weights = True), # if val loss decreases for 10 epochs in a row, stop training,
                ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, mode='min') 
            ]
        )
        self.model.save_weights('./checkpoints/my_checkpoint')


        # self.model.fit(
        #     self.train_generator,
        #     epochs=self.config.params_epochs,
        #     steps_per_epoch=self.steps_per_epoch,
        #     validation_steps=self.validation_steps,
        #     validation_data=self.valid_generator
        # )

        self.save_model(
            path=self.config.trained_model_path,
            model=self.model
        )



In [11]:
try:
    config = ConfigurationManager()
    training_config = config.get_training_config()
    training = Training(config=training_config)
    training.get_base_model()
    training.train_valid_generator()
    training.train()
    
except Exception as e:
    raise e


[2024-04-02 21:13:40,249: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-04-02 21:13:40,254: INFO: common: yaml file: params.yaml loaded successfully]
[2024-04-02 21:13:40,255: INFO: common: created directory at: artifacts]
[2024-04-02 21:13:40,256: INFO: common: created directory at: artifacts\training]


  trackable.load_own_variables(weights_store.get(inner_path))
  trackable.load_own_variables(weights_store.get(inner_path))


----------Train-------------
                                             imgpath    labels
0  artifacts\data_ingestion\animals\animals\antel...  antelope
1  artifacts\data_ingestion\animals\animals\antel...  antelope
2  artifacts\data_ingestion\animals\animals\antel...  antelope
3  artifacts\data_ingestion\animals\animals\antel...  antelope
4  artifacts\data_ingestion\animals\animals\antel...  antelope
(3780, 2)
--------Validation----------
                                             imgpath    labels
0  artifacts\data_ingestion\animals\animals\antel...  antelope
1  artifacts\data_ingestion\animals\animals\antel...  antelope
2  artifacts\data_ingestion\animals\animals\antel...  antelope
3  artifacts\data_ingestion\animals\animals\antel...  antelope
4  artifacts\data_ingestion\animals\animals\antel...  antelope
(1080, 2)
----------Test--------------
                                             imgpath    labels
0  artifacts\data_ingestion\animals\animals\antel...  antelope
1  artifact

  self._warn_if_super_not_called()


[1m 17/202[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2:08:39[0m 42s/step - accuracy: 0.0041 - loss: 5.2824

In [8]:
class Training:
    def __init__(self, config: TrainingConfig):
        self.config = config

    
    def get_base_model(self):
        self.model = tf.keras.models.load_model(
            self.config.updated_base_model_path
        )

    def train_valid_generator(self):

        datagenerator_kwargs = dict(
            rescale = 1./255,
            validation_split=0.20
        )

        dataflow_kwargs = dict(
            target_size=self.config.params_image_size[:-1],
            batch_size=self.config.params_batch_size,
            interpolation="bilinear"
        )

        valid_datagenerator = tf.keras.preprocessing.image.ImageDataGenerator(
            **datagenerator_kwargs
        )

        self.valid_generator = valid_datagenerator.flow_from_directory(
            directory=self.config.training_data,
            subset="validation",
            shuffle=False,
            **dataflow_kwargs
        )

        if self.config.params_is_augmentation:
            train_datagenerator = tf.keras.preprocessing.image.ImageDataGenerator(
                rotation_range=40,
                horizontal_flip=True,
                width_shift_range=0.2,
                height_shift_range=0.2,
                shear_range=0.2,
                zoom_range=0.2,
                **datagenerator_kwargs
            )
        else:
            train_datagenerator = valid_datagenerator

        self.train_generator = train_datagenerator.flow_from_directory(
            directory=self.config.training_data,
            subset="training",
            shuffle=True,
            **dataflow_kwargs
        )

    
    @staticmethod
    def save_model(path: Path, model: tf.keras.Model):
        model.save(path)



    
    def train(self):
        self.steps_per_epoch = self.train_generator.samples // (self.train_generator.batch_size*20)
        self.validation_steps = self.valid_generator.samples // (self.valid_generator.batch_size*20)

        self.model.fit(
            self.train_generator,
            epochs=self.config.params_epochs,
            steps_per_epoch=self.steps_per_epoch,
            validation_steps=self.validation_steps,
            validation_data=self.valid_generator
        )

        self.save_model(
            path=self.config.trained_model_path,
            model=self.model
        )



In [9]:
try:
    config = ConfigurationManager()
    training_config = config.get_training_config()
    training = Training(config=training_config)
    training.get_base_model()
    training.train_valid_generator()
    training.train()
    
except Exception as e:
    raise e


[2024-04-02 20:08:12,631: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-04-02 20:08:12,636: INFO: common: yaml file: params.yaml loaded successfully]
[2024-04-02 20:08:12,638: INFO: common: created directory at: artifacts]
[2024-04-02 20:08:12,640: INFO: common: created directory at: artifacts\training]
Found 1080 images belonging to 90 classes.


  trackable.load_own_variables(weights_store.get(inner_path))


Found 4320 images belonging to 90 classes.


  self._warn_if_super_not_called()


[1m 5/14[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m6:49[0m 45s/step - accuracy: 0.0000e+00 - loss: 5.2533

KeyboardInterrupt: 

In [26]:
class ModelTrainer:
    def __init__(self, config: TrainingConfig):
        self.config = config
        


    def initiate_model_trainer(self):
        DF = pd.read_csv(self.config.training_data)


        imp_feature = ['propertyType',
                       'locality',
                       'furnishing',
                       'city',
                       'bedrooms',
                       'bathrooms',
                       'RentOrSale',]

        DF = remove_outliers_iqr(DF)
        try:
            logging.info("Split training and test input data")
            X_train, X_test, y_train, y_test = train_test_split(
                DF[imp_feature], DF[["exactPrice"]], test_size=0.33, random_state=42)
            
            print(y_train)
            # To convert y_train to required format
            y_train = np.ravel(y_train)
            print(y_train)

            models = {
                "RandomForest": RandomForestRegressor(),
                "DecisionTree": DecisionTreeRegressor(),
                "GradientBoosting": GradientBoostingRegressor(),
                "LinearRegression": LinearRegression(),
                "XGBRegressor": XGBRegressor(),
                "CatBoostRegressor": CatBoostRegressor(verbose=False),


                "AdaBoostRegressor": AdaBoostRegressor(),
            }
            print("OKK 1")

            params = self.config.model_params
            print(params.MODELS)
            print("OKK 2")
            models = {key: value for key, value in models.items() if key in params.MODELS}
            print(models)

            print("OKK 3")


            model_report,parameters = evaluate_models(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
                                                 models=models, param=params.MODELS)
            print("OKK 4")
            logging.info(f"models -> {model_report}")


            
        
        
            # To get best model score from dict
            best_model_score = max(sorted(model_report.values()))

            # To get best model name from dict

            best_model_name = list(model_report.keys())[
                list(model_report.values()).index(best_model_score)
            ]
            best_model = models[best_model_name]

            if best_model_score < 0.6:
                raise CustomException("No best model found")
            logging.info(
                f"Best found model on both training and testing dataset")

            save_object(
                file_path=self.config.trained_model_file_path,
                obj=best_model
            )

            predicted = best_model.predict(X_test)

            r2_square = r2_score(y_test, predicted)
            mae = mean_absolute_error(y_test, predicted)
            print(mae)

            # Log into MLflow
            # self.log_model_into_mlflow(model_report,parameters,y_test,predicted,"Full Data Model")

            return model_report

        except Exception as e:
            raise CustomException(e, sys)

    def initiate_model_trainer_rent(self):
        DF = pd.read_csv(self.config.training_data)


        imp_feature = ['propertyType',
                       'locality',
                       'furnishing',
                       'city',
                       'bedrooms',
                       'bathrooms',
                       'RentOrSale',]
        
        DF = DF[DF["RentOrSale"] == 1]


        DF = remove_outliers_iqr(DF)
        try:
            logging.info("Split training and test input data")
            X_train, X_test, y_train, y_test = train_test_split(
                DF[imp_feature], DF[["exactPrice"]], test_size=0.33, random_state=42)
            
            print(y_train)
            # To convert y_train to required format
            y_train = np.ravel(y_train)
            print(y_train)

            models = {
                "RandomForest": RandomForestRegressor(),
                "DecisionTree": DecisionTreeRegressor(),
                "GradientBoosting": GradientBoostingRegressor(),
                "LinearRegression": LinearRegression(),
                "XGBRegressor": XGBRegressor(),
                "CatBoostRegressor": CatBoostRegressor(),

                "AdaBoostRegressor": AdaBoostRegressor(),
            }
            print("OKK 1")

            params = self.config.model_params
            print(params.MODELS)
            print("OKK 2")
            models = {key: value for key, value in models.items() if key in params.MODELS}
            print(models)

            print("OKK 3")


            model_report,parameters = evaluate_models(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
                                                 models=models, param=params.MODELS)
            print("OKK 4")
            logging.info(f"models -> {model_report}")


            
        
        
            # To get best model score from dict
            best_model_score = max(sorted(model_report.values()))

            # To get best model name from dict

            best_model_name = list(model_report.keys())[
                list(model_report.values()).index(best_model_score)
            ]
            best_model = models[best_model_name]

            if best_model_score < 0.3:
                raise CustomException("No best model found")
            logging.info(
                f"Best found model on both training and testing dataset")

            save_object(
                file_path=self.config.trained_model_file_path_rent,
                obj=best_model
            )

            predicted = best_model.predict(X_test)

            r2_square = r2_score(y_test, predicted)
            mae = mean_absolute_error(y_test, predicted)
            print(mae)


            # self.log_model_into_mlflow(model_report,parameters,y_test,predicted,"Rent Data Model")

            return model_report

        except Exception as e:
            raise CustomException(e, sys)

    
    
    def log_model_into_mlflow(self, model_report,parameters,y_test,predicted,run_name):
        mlflow.set_registry_uri(self.config.mlflow_uri)
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
        
        with mlflow.start_run(run_name=run_name):
            for model_name, model in model_report.items():
                mlflow.log_params(parameters)  # Log model parameters
                mlflow.log_metric("mean_squared_error", mean_squared_error(y_test, predicted))  # Log MSE
                mlflow.log_metric("mean_absolute_error", mean_absolute_error(y_test, predicted))  # Log MSE
                mlflow.log_metric("r2_score", r2_score(y_test, predicted))  # Log R2 Score
                
                # Model registry does not work with file store
                if tracking_url_type_store != "file":
                    # Register the model
                    # There are other ways to use the Model Registry, which depends on the use case,
                    # please refer to the doc for more information:
                    # https://mlflow.org/docs/latest/model-registry.html#api-workflow
                    mlflow.sklearn.log_model(model, f"{model_name}_model", registered_model_name=model_name)
                else:
                    mlflow.sklearn.log_model(model, f"{model_name}_model")

    

In [27]:
try:
    config = ConfigurationManager()
    model_training_config = config.get_model_training_config()
    model_training = ModelTrainer(config=model_training_config)
    model_training.initiate_model_trainer()
    model_training.initiate_model_trainer_rent()
except Exception as e:
    raise e


[2024-03-18 13:11:54,732: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-03-18 13:11:54,805: INFO: common: yaml file: params.yaml loaded successfully]
[2024-03-18 13:11:54,811: INFO: common: created directory at: artifacts]
[2024-03-18 13:11:54,815: INFO: common: created directory at: artifacts/training]
[2024-03-18 13:11:54,921: INFO: 614572267: Split training and test input data]
       exactPrice
9614      12000.0
1347      24000.0
22355   8700000.0
8487      15000.0
15059     15000.0
...           ...
13021     25000.0
13955      6500.0
6089       6500.0
991        8000.0
18495   2580000.0

[14051 rows x 1 columns]
[1.20e+04 2.40e+04 8.70e+06 ... 6.50e+03 8.00e+03 2.58e+06]
OKK 1
{'DecisionTree': {'criterion': ['squared_error', 'friedman_mse'], 'splitter': ['best', 'random']}, 'CatBoostRegressor': {'verbose': [False]}}
OKK 2
{'DecisionTree': DecisionTreeRegressor(), 'CatBoostRegressor': <catboost.core.CatBoostRegressor object at 0x0000021B248807F0>}
OKK 3
{'

In [28]:
os.environ['MLFLOW_TRACKING_URI']='https://dagshub.com/Rajarshi12321/My-Sweet-Home.mlflow'
os.environ['MLFLOW_TRACKING_USERNAME']='Rajarshi12321'
os.environ['MLFLOW_TRACKING_PASSWORD']='ba0cfe97e529787e678d28321906247cfce4fb43'


In [29]:
try:
    config = ConfigurationManager()
    model_training_config = config.get_model_training_config()
    model_training = ModelTrainer(config=model_training_config)
    model_training.initiate_model_trainer()
    model_training.initiate_model_trainer_rent()
except Exception as e:
    raise e


[2024-03-18 13:13:45,373: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-03-18 13:13:45,390: INFO: common: yaml file: params.yaml loaded successfully]
[2024-03-18 13:13:45,398: INFO: common: created directory at: artifacts]
[2024-03-18 13:13:45,402: INFO: common: created directory at: artifacts/training]


[2024-03-18 13:13:45,500: INFO: 614572267: Split training and test input data]
       exactPrice
9614      12000.0
1347      24000.0
22355   8700000.0
8487      15000.0
15059     15000.0
...           ...
13021     25000.0
13955      6500.0
6089       6500.0
991        8000.0
18495   2580000.0

[14051 rows x 1 columns]
[1.20e+04 2.40e+04 8.70e+06 ... 6.50e+03 8.00e+03 2.58e+06]
OKK 1
{'DecisionTree': {'criterion': ['squared_error', 'friedman_mse'], 'splitter': ['best', 'random']}, 'CatBoostRegressor': {'verbose': [False]}}
OKK 2
{'DecisionTree': DecisionTreeRegressor(), 'CatBoostRegressor': <catboost.core.CatBoostRegressor object at 0x0000021B24254190>}
OKK 3
{'DecisionTree': DecisionTreeRegressor(), 'CatBoostRegressor': <catboost.core.CatBoostRegressor object at 0x0000021B24254190>}
{'DecisionTree': {'criterion': ['squared_error', 'friedman_mse'], 'splitter': ['best', 'random']}, 'CatBoostRegressor': {'verbose': [False]}} utils
[2024-03-18 13:14:17,746: INFO: common: utils model train