In [1]:
import os

In [2]:
%pwd

'/Users/rahulshelke/Documents/Data-Science/Data-Science-Projects/finance_complaint/research'

In [3]:
# Move the working directory one level up; the %pwd outputs around this cell show the
# change from .../finance_complaint/research to .../finance_complaint.
# NOTE(review): a relative chdir is not idempotent - re-running this cell climbs one more
# level each time, so run it exactly once per kernel session.
os.chdir("../")

In [4]:
%pwd

'/Users/rahulshelke/Documents/Data-Science/Data-Science-Projects/finance_complaint'

## config_entity.py

In [5]:
from dataclasses import dataclass
from finance_complaint.entity.config_entity import TrainingPipelineConfig
from finance_complaint.entity.metadata_entity import DataIngestionMetadata
from finance_complaint.exception import FinanceException
from finance_complaint.logger import logging as logger
from finance_complaint.constant import TIMESTAMP
from datetime import datetime
import os, sys


Reading environment variables
Read Complete!


In [6]:
# ---------------------------------------------------------------------------
# Data ingestion
# ---------------------------------------------------------------------------
DATA_INGESTION_DIR = "data_ingestion"
DATA_INGESTION_DOWNLOADED_DATA_DIR = "downloaded_files"
DATA_INGESTION_FAILED_DIR = "failed_downloaded_files"
DATA_INGESTION_FEATURE_STORE_DIR = "feature_store"
DATA_INGESTION_FILE_NAME = "finance_complaint"
DATA_INGESTION_METADATA_FILE_NAME = "meta_info.yaml"
DATA_INGESTION_MIN_START_DATE = "2013-01-01"
# URL template; the literal "<todate>" / "<fromdate>" markers are kept verbatim in the value.
DATA_INGESTION_DATA_SOURCE_URL = (
    "https://www.consumerfinance.gov/data-research/consumer-complaints/search/api/v1/"
    "?date_received_max=<todate>&date_received_min=<fromdate>"
    "&field=all&format=json"
)

# ---------------------------------------------------------------------------
# Data validation
# ---------------------------------------------------------------------------
DATA_VALIDATION_DIR = "data_validation"
DATA_VALIDATION_FILE_NAME = "finance_complaint"
DATA_VALIDATION_ACCEPTED_DATA_DIR = "accepted_data"
DATA_VALIDATION_REJECTED_DATA_DIR = "rejected_data"

# ---------------------------------------------------------------------------
# Data transformation
# ---------------------------------------------------------------------------
DATA_TRANSFORMATION_DIR = "data_transformation"
DATA_TRANSFORMATION_PIPELINE_DIR = "transformed_pipeline"
DATA_TRANSFORMATION_FILE_NAME = "finance_complaint"
DATA_TRANSFORMATION_TRAIN_DIR = "train"
DATA_TRANSFORMATION_TEST_DIR = "test"
DATA_TRANSFORMATION_TEST_SIZE = 0.3

# ---------------------------------------------------------------------------
# Model trainer
# ---------------------------------------------------------------------------
MODEL_TRAINER_DIR = "model_trainer"
MODEL_TRAINER_TRAINED_MODEL_DIR = "trained_model"
MODEL_TRAINER_LABEL_INDEXER_DIR = "label_indexer"
MODEL_TRAINER_MODEL_NAME = "finance_estimator"
MODEL_TRAINER_BASE_ACCURACY = 0.6
MODEL_TRAINER_MODEL_METRIC_NAMES = [
    "f1",
    "weightedPrecision",
    "weightedRecall",
    "weightedTruePositiveRate",
    "weightedFalsePositiveRate",
    "weightedFMeasure",
    "truePositiveRateByLabel",
    "falsePositiveRateByLabel",
    "precisionByLabel",
    "recallByLabel",
    "fMeasureByLabel",
]

# ---------------------------------------------------------------------------
# Model evaluation
# ---------------------------------------------------------------------------
MODEL_EVALUATION_DIR = "model_evaluation"
MODEL_EVALUATION_REPORT_DIR = "report"
MODEL_EVALUATION_REPORT_FILE_NAME = "evaluation_report"
MODEL_EVALUATION_THRESHOLD_VALUE = 0.002
MODEL_EVALUATION_METRIC_NAMES = ["f1"]

# ---------------------------------------------------------------------------
# Model pusher
# ---------------------------------------------------------------------------
MODEL_PUSHER_DIR = "model_pusher"
MODEL_PUSHER_SAVED_MODEL_DIRS = "saved_models"
# The pusher reuses the trainer's model name so both always agree.
MODEL_PUSHER_MODEL_NAME = MODEL_TRAINER_MODEL_NAME

In [7]:
@dataclass
class TrainingPipelineConfig:
    """Root settings from which the stage configs in this notebook derive their paths.

    Attributes:
        pipeline_name: Name of the top-level directory that groups artifacts.
        artifact_dir: Directory for the artifacts of this run. When left empty it is
            derived per instance as ``os.path.join(pipeline_name, TIMESTAMP)``.
    """
    pipeline_name: str = "artifact"
    artifact_dir: str = ""

    def __post_init__(self):
        # Derive the default here rather than in the class body: a class-body default is
        # evaluated once with the default pipeline_name, so a custom pipeline_name would
        # otherwise still end up under "artifact/<TIMESTAMP>".
        if not self.artifact_dir:
            self.artifact_dir = os.path.join(self.pipeline_name, TIMESTAMP)

@dataclass
class DataIngestionConfig:
    """Date window, directories and source URL used by the data ingestion stage.

    Attributes set by ``__init__``: ``from_date``, ``to_date``, ``data_ingestion_dir``,
    ``metadata_file_path``, ``download_dir``, ``failed_dir``, ``file_name``,
    ``feature_store_dir`` and ``datasource_url``.
    """

    def __init__(self, training_pipeline_config: TrainingPipelineConfig, from_date=DATA_INGESTION_MIN_START_DATE,
                 to_date='2020-01-01'):
        """Resolve the ingestion window and paths.

        Args:
            training_pipeline_config: Supplies ``artifact_dir``; the ingestion master
                directory is placed next to it (in its parent directory).
            from_date: Requested start date as ``YYYY-MM-DD``.
            to_date: End date as ``YYYY-MM-DD``; ``None`` means today's date.

        Raises:
            FinanceException: Wraps any exception raised while building the config.
        """
        try:
            date_format = "%Y-%m-%d"
            earliest_allowed = datetime.strptime(DATA_INGESTION_MIN_START_DATE, date_format)
            requested_start = datetime.strptime(from_date, date_format)

            # A start date before the minimum is clamped to the minimum.
            if requested_start < earliest_allowed:
                self.from_date = DATA_INGESTION_MIN_START_DATE
            else:
                self.from_date = from_date

            # No explicit end date -> use the current date.
            self.to_date = datetime.now().strftime(date_format) if to_date is None else to_date

            # The master directory sits beside the run's artifact directory, not inside it.
            artifact_parent = os.path.dirname(training_pipeline_config.artifact_dir)
            master_dir = os.path.join(artifact_parent, DATA_INGESTION_DIR)

            self.data_ingestion_dir = os.path.join(master_dir, TIMESTAMP)
            self.metadata_file_path = os.path.join(master_dir, DATA_INGESTION_METADATA_FILE_NAME)

            # When a metadata file exists, its recorded to_date replaces from_date.
            ingestion_metadata = DataIngestionMetadata(metadata_file_path=self.metadata_file_path)
            if ingestion_metadata.is_metadata_file_present:
                self.from_date = ingestion_metadata.get_meta_data_info().to_date

            self.download_dir = os.path.join(self.data_ingestion_dir, DATA_INGESTION_DOWNLOADED_DATA_DIR)
            self.failed_dir = os.path.join(self.data_ingestion_dir, DATA_INGESTION_FAILED_DIR)
            self.feature_store_dir = os.path.join(master_dir, DATA_INGESTION_FEATURE_STORE_DIR)
            self.file_name = DATA_INGESTION_FILE_NAME
            self.datasource_url = DATA_INGESTION_DATA_SOURCE_URL
        except Exception as e:
            raise FinanceException(e, sys)
        
@dataclass
class DataValidationConfig:
    """Output locations and file name used by the data validation stage."""

    def __init__(self, training_pipeline_config: TrainingPipelineConfig) -> None:
        """Derive the accepted/rejected directories under ``artifact_dir``.

        Raises:
            FinanceException: Wraps any exception raised while building the paths.
        """
        try:
            stage_root = os.path.join(training_pipeline_config.artifact_dir, DATA_VALIDATION_DIR)
            self.accepted_data_dir = os.path.join(stage_root, DATA_VALIDATION_ACCEPTED_DATA_DIR)
            self.rejected_data_dir = os.path.join(stage_root, DATA_VALIDATION_REJECTED_DATA_DIR)
            self.file_name = DATA_VALIDATION_FILE_NAME
        except Exception as e:
            raise FinanceException(e, sys)
        
@dataclass
class DataTransformationConfig:
    """Directories, file name and test split size used by the data transformation stage."""

    def __init__(self, training_pipeline_config: TrainingPipelineConfig) -> None:
        """Derive the train/test/pipeline directories under ``artifact_dir``.

        Raises:
            FinanceException: Wraps any exception raised while building the paths.
        """
        try:
            stage_root = os.path.join(training_pipeline_config.artifact_dir, DATA_TRANSFORMATION_DIR)

            self.transformed_train_dir = os.path.join(stage_root, DATA_TRANSFORMATION_TRAIN_DIR)
            self.transformed_test_dir = os.path.join(stage_root, DATA_TRANSFORMATION_TEST_DIR)
            self.export_pipeline_dir = os.path.join(stage_root, DATA_TRANSFORMATION_PIPELINE_DIR)

            self.file_name = DATA_TRANSFORMATION_FILE_NAME
            self.test_size = DATA_TRANSFORMATION_TEST_SIZE
        except Exception as e:
            raise FinanceException(e, sys)
        
@dataclass
class ModelTrainerConfig:
    """Model paths, base accuracy and metric names used by the model trainer stage."""

    def __init__(self, training_pipeline_config: TrainingPipelineConfig) -> None:
        """Derive the trained-model and label-indexer paths under ``artifact_dir``.

        Raises:
            FinanceException: Wraps any exception raised while building the paths.
        """
        try:
            stage_root = os.path.join(training_pipeline_config.artifact_dir, MODEL_TRAINER_DIR)

            self.trained_model_file_path = os.path.join(
                stage_root,
                MODEL_TRAINER_TRAINED_MODEL_DIR,
                MODEL_TRAINER_MODEL_NAME,
            )
            self.label_indexer_model_dir = os.path.join(stage_root, MODEL_TRAINER_LABEL_INDEXER_DIR)

            self.base_accuracy = MODEL_TRAINER_BASE_ACCURACY
            self.metric_list = MODEL_TRAINER_MODEL_METRIC_NAMES
        except Exception as e:
            raise FinanceException(e, sys)
    
@dataclass
class ModelEvaluationConfig:
    """Output directory, acceptance threshold and metric names for model evaluation."""

    def __init__(self, training_pipeline_config: TrainingPipelineConfig) -> None:
        """Derive the evaluation directory under ``artifact_dir`` and copy the constants."""
        artifact_root = training_pipeline_config.artifact_dir
        self.model_evaluation_dir = os.path.join(artifact_root, MODEL_EVALUATION_DIR)
        self.threshold = MODEL_EVALUATION_THRESHOLD_VALUE
        self.metric_list = MODEL_EVALUATION_METRIC_NAMES

@dataclass
class ModelPusherConfig:
    """Destination paths used by the model pusher stage."""

    def __init__(self, training_pipeline_config: TrainingPipelineConfig) :
        """Derive the pusher model directory under ``artifact_dir``.

        Raises:
            FinanceException: Wraps any exception raised while building the paths.
        """
        try:
            path_parts = (
                training_pipeline_config.artifact_dir,
                MODEL_PUSHER_DIR,
                "model",
                MODEL_PUSHER_MODEL_NAME,
            )
            self.pusher_model_dir = os.path.join(*path_parts)
            self.saved_model_dir = MODEL_PUSHER_SAVED_MODEL_DIRS
        except Exception as e:
            raise FinanceException(e, sys)
        
class BatchPredictionConfig:
    """Relative inbox/outbox/archive directories used for batch prediction."""

    def __init__(self):
        """Set the three directory paths and create the outbox and archive directories.

        Raises:
            FinanceException: Wraps any exception raised while creating directories.
        """
        try:
            data_root = "data"
            self.inbox_dir = os.path.join(data_root, "inbox")
            self.outbox_dir = os.path.join(data_root, "outbox")
            self.archive_dir = os.path.join(data_root, "archive")

            # Only outbox and archive are created here; inbox creation is disabled.
            for directory in (self.outbox_dir, self.archive_dir):
                os.makedirs(directory, exist_ok=True)
        except Exception as e:
            raise FinanceException(e, sys)

## artifact_entity.py

In [8]:
from dataclasses import dataclass
from datetime import datetime

In [10]:
class ModelEvaluationArtifact:
    """Result record of one model evaluation.

    Attributes:
        model_accepted: Whether the trained model was accepted.
        changed_accuracy: Score difference stored by the caller (may be ``None``).
        trained_model_path: Path of the trained model.
        best_model_path: Path of the model it was compared against (may be ``None``).
        active: Active flag stored by the caller.
        create_timestamp: ``datetime.now()`` captured at construction.
    """

    def __init__(self,
                 model_accepted,
                 changed_accuracy,
                 trained_model_path,
                 best_model_path,
                 active,
                 *args,
                 **kwargs):
        # Extra positional/keyword arguments are accepted and ignored.
        self.model_accepted = model_accepted
        self.changed_accuracy = changed_accuracy
        self.trained_model_path = trained_model_path
        self.best_model_path = best_model_path
        self.active = active
        self.create_timestamp = datetime.now()

    def to_dict(self):
        """Return the artifact's attributes as a new dictionary.

        A shallow copy is returned instead of the live ``__dict__`` so that a caller
        which mutates the result (for example by adding keys before or while storing
        it) cannot add or change attributes on the artifact itself.
        """
        return dict(self.__dict__)

    def __str__(self):
        return str(self.to_dict())
    
@dataclass
class ModelPusherArtifact:
    """Pair of directory paths recorded for a pushed model."""
    # presumably the directory the model was pushed to - TODO confirm against the pusher component
    model_pushed_dir: str
    # presumably the shared saved-models directory - TODO confirm against the pusher component
    saved_model_dir: str

## model evaluation step by step

In [None]:
# from finance_complaint.entity.artifact_entity import ModelEvaluationArtifact, DataValidationArtifact, ModelTrainerArtifact
# from finance_complaint.entity.schema import FinanceDataSchema
# from finance_complaint.ml.estimator import  ModelResolver, FinanceComplaintEstimator

Reading environment variables
Read Complete!


In [11]:
# MODEL_SAVED_DIR = "saved_models"
# MODEL_NAME = "finance_estimator"

In [12]:
# model_eval_artifact_data = ModelEvaluationArtifact(
#                  model_accepted = , 
#                  changed_accuracy = , 
#                  trained_model_path = ,
#                  best_model_path = ,
#                  active = )

# data_validation_artifact = DataValidationArtifact()

# model_eval_config = ModelEvaluationConfig()


# schem = FinanceDataSchema()


# finance_estimator = FinanceComplaintEstimator()

In [13]:
# model_trainer_artifact = ModelTrainerArtifact(
#                  model_trainer_ref_artifact = PartialModelTrainerRefArtifact(),
#                  model_trainer_train_metric_artifact = PartialModelTrainerMetricArtifact(),
#                  model_trainer_test_metric_artifact = PartialModelTrainerMetricArtifact())

# model_resolver = ModelResolver(MODEL_SAVED_DIR, MODEL_NAME)

In [14]:
# if not model_resolver.is_model_present:
#     model_evaluation_artifact = ModelEvaluationArtifact(
#         model_accepted=True,
#         changed_accuracy=None,
#         trained_model_path= model_trainer_artifact.model_trainer_ref_artifact.trained_model_file_path,
#         best_model_path=None,
#         active=True
#     )
    

In [15]:
# #set initial flag
# is_model_accepted, is_active = False, False

# #obtain required directory path
# trained_model_file_path = model_trainer_artifact.model_trainer_ref_artifact.trained_model_file_path
# label_indexer_model_path = model_trainer_artifact.model_trainer_ref_artifact.label_indexer_model_file_path

In [16]:
# #load required model and label index
# label_indexer_model = StringIndexerModel.load(label_indexer_model_path)
# trained_model = PipelineModel.load(trained_model_file_path)

In [17]:
    # def evaluate_trained_model(self) -> ModelEvaluationArtifact:
    #     try:

            


    #         #Read the dataframe
    #         dataframe: DataFrame = self.read_data()
    #         dataframe = label_indexer_model

    #         best_model_path = self.model_resolver.get_best_model_path()


    #         best_model_dataframe = self.finance_estimator.transform(dataframe)


    #         #prediction using trained model
    #         trained_model_dataframe = trained_model.transform(dataframe)


    #         #compute f1 score for trained model
    #         trained_model_f1_score = get_score(dataframe=trained_model_dataframe, metric_name="f1",
    #                                         label_col=self.schema.target_indexed_label,
    #                                         prediction_col=self.schema.prediction_column_name)
    #         #compute f1 score for best model
    #         best_model_f1_score = get_score(dataframe=best_model_dataframe, metric_name="f1",
    #                                         label_col=self.schema.target_indexed_label,
    #                                         prediction_col=self.schema.prediction_column_name)


    #         logger.info(f"Trained_model_f1_score: {trained_model_f1_score}, Best model f1 score: {best_model_f1_score}")
    #         #improved accuracy
    #         changed_accuracy = trained_model_f1_score - best_model_f1_score


            
    #         if changed_accuracy >= self.model_eval_config.threshold:
    #             is_model_accepted, is_active = True, True
    #         model_evaluation_artifact = ModelEvaluationArtifact(model_accepted=is_model_accepted,
    #                                                             changed_accuracy=changed_accuracy,
    #                                                             trained_model_path=trained_model_file_path,
    #                                                             best_model_path=best_model_path,
    #                                                             active=is_active
    #                                                             )
    #         return model_evaluation_artifact
    #     except Exception as e:
    #         raise FinanceException(e,sys)



## model_evaluation.py

In [None]:
import os, sys
from pyspark.sql import DataFrame

from finance_complaint.exception import FinanceException
from finance_complaint.logger import logging as logger
from finance_complaint.entity.schema import FinanceDataSchema

from finance_complaint.entity.artifact_entity import (
                                                    #   ModelEvaluationArtifact, 
                                                      DataValidationArtifact, 
                                                      ModelTrainerArtifact)
# from finance_complaint.data_access.model_eval_artifact import ModelEvaluationArtifact#
#  from finance_complaint.entity.config_entity import ModelEvaluationConfig

from finance_complaint.ml.estimator import  ModelResolver, FinanceComplaintEstimator

from finance_complaint.config.spark_manager import spark_session
from finance_complaint.utils import get_score

from pyspark.ml.feature import StringIndexerModel
from pyspark.ml.pipeline import PipelineModel

MongoClient(host=['ac-zciqt62-shard-00-02.uhbxanv.mongodb.net:27017', 'ac-zciqt62-shard-00-00.uhbxanv.mongodb.net:27017', 'ac-zciqt62-shard-00-01.uhbxanv.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', appname='mlcluster', authsource='admin', replicaset='atlas-10vcjq-shard-0', tls=True, server_api=<pymongo.server_api.ServerApi object at 0x11bf5c1a0>)


25/05/11 13:25:58 WARN Utils: Your hostname, Rahuls-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.1.5 instead (on interface en0)
25/05/11 13:25:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/11 13:25:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 62742)
Traceback (most recent call last):
  File "/Users/rahulshelke/Documents/Data-Science/Data-Science-Projects/finance_complaint/venv/lib/python3.13/socketserver.py", line 318, in _handle_request_noblock
    self.process_request(request, client_address)
    ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rahulshelke/Documents/Data-Science/Data-Science-Projects/finance_complaint/venv/lib/python3.13/socketserver.py", line 349, in process_request
    self.finish_request(request, client_address)
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rahulshelke/Documents/Data-Science/Data-Science-Projects/finance_complaint/venv/lib/python3.13/socketserver.py", line 362, in finish_request
    self.RequestHandlerClass(request, client_address, self)
    ~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rahulshelke/Documents/Data-Science/Data

In [19]:
class ModelEvaluation:
    """Compares a newly trained model with the currently saved best model.

    The comparison uses the "f1" score of both models on the accepted validation
    data; the trained model is accepted when its score exceeds the best model's
    score by at least ``model_eval_config.threshold``.
    """

    def __init__(self, data_validation_artifact: DataValidationArtifact,
                 model_trainer_artifact: ModelTrainerArtifact,
                 model_eval_config: ModelEvaluationConfig,
                 schema = FinanceDataSchema()):
        """Store the inputs and build the resolver and estimator helpers.

        Args:
            data_validation_artifact: Provides ``accepted_file_path`` of the data to score.
            model_trainer_artifact: Provides the trained model and label indexer paths.
            model_eval_config: Provides the acceptance ``threshold``.
            schema: Provides the label and prediction column names.
        """
        try:
            # NOTE(review): initiate_model_evaluation() calls
            # self.model_eval_artifact_data.save_eval_artifact(...), so this object is
            # expected to be a persistence helper. The ModelEvaluationArtifact class
            # defined in this notebook needs five arguments and has no such method -
            # confirm which class this name should resolve to.
            self.model_eval_artifact_data = ModelEvaluationArtifact()
            self.data_validation_artifact = data_validation_artifact
            self.model_eval_config = model_eval_config
            self.model_trainer_artifact = model_trainer_artifact
            # evaluate_trained_model() reads self.schema; previously only the misspelled
            # attribute "schem" was set, which raised AttributeError there.
            self.schema = schema
            self.schem = schema  # kept as an alias for any code using the old name
            self.model_resolver = ModelResolver()
            self.finance_estimator = FinanceComplaintEstimator()
        except Exception as e:
            raise e

    def read_data(self)-> DataFrame:
        """Read the accepted validation data (parquet) into a Spark DataFrame."""
        try:
            file_path = self.data_validation_artifact.accepted_file_path
            dataframe: DataFrame = spark_session.read.parquet(file_path)
            return dataframe
        except Exception as e:
            raise e

    def evaluate_trained_model(self) -> ModelEvaluationArtifact:
        """Score the trained model against the best saved model.

        Returns:
            A ModelEvaluationArtifact. When no saved model is present the trained
            model is accepted directly, with ``changed_accuracy`` and
            ``best_model_path`` set to ``None``.

        Raises:
            FinanceException: Wraps any exception raised during evaluation.
        """
        try:
            # Nothing to compare against: accept the trained model as is.
            if not self.model_resolver.is_model_present:
                model_evaluation_artifact = ModelEvaluationArtifact(
                    model_accepted=True,
                    changed_accuracy=None,
                    trained_model_path=self.model_trainer_artifact.model_trainer_ref_artifact.trained_model_file_path,
                    best_model_path=None,
                    active=True
                )
                return model_evaluation_artifact

            # set initial flag
            is_model_accepted, is_active = False, False

            # obtain required directory path
            trained_model_file_path = self.model_trainer_artifact.model_trainer_ref_artifact.trained_model_file_path
            label_indexer_model_path = self.model_trainer_artifact.model_trainer_ref_artifact.label_indexer_model_file_path

            # load required model and label index
            label_indexer_model = StringIndexerModel.load(label_indexer_model_path)
            trained_model = PipelineModel.load(trained_model_file_path)

            # Read the dataframe and apply the label indexer to it. The previous code
            # assigned the indexer model itself to `dataframe`, discarding the data.
            dataframe: DataFrame = self.read_data()
            dataframe = label_indexer_model.transform(dataframe)

            best_model_path = self.model_resolver.get_best_model_path()

            # prediction using the best saved model
            best_model_dataframe = self.finance_estimator.transform(dataframe)

            # prediction using trained model
            trained_model_dataframe = trained_model.transform(dataframe)

            # compute f1 score for trained model
            trained_model_f1_score = get_score(dataframe=trained_model_dataframe, metric_name="f1",
                                            label_col=self.schema.target_indexed_label,
                                            prediction_col=self.schema.prediction_column_name)
            # compute f1 score for best model
            best_model_f1_score = get_score(dataframe=best_model_dataframe, metric_name="f1",
                                            label_col=self.schema.target_indexed_label,
                                            prediction_col=self.schema.prediction_column_name)

            logger.info(f"Trained_model_f1_score: {trained_model_f1_score}, Best model f1 score: {best_model_f1_score}")
            # improvement of the trained model over the best model
            changed_accuracy = trained_model_f1_score - best_model_f1_score

            if changed_accuracy >= self.model_eval_config.threshold:
                is_model_accepted, is_active = True, True
            model_evaluation_artifact = ModelEvaluationArtifact(model_accepted=is_model_accepted,
                                                                changed_accuracy=changed_accuracy,
                                                                trained_model_path=trained_model_file_path,
                                                                best_model_path=best_model_path,
                                                                active=is_active
                                                                )
            return model_evaluation_artifact
        except Exception as e:
            raise FinanceException(e,sys)


    def initiate_model_evaluation(self) -> ModelEvaluationArtifact:
        """Run the evaluation, log the artifact, save it and return it.

        Raises:
            FinanceException: Wraps any exception raised while evaluating or saving.
        """
        try:
            model_evaluation_artifact = self.evaluate_trained_model()
            logger.info(f"Model evaluation artifact: {model_evaluation_artifact}")
            self.model_eval_artifact_data.save_eval_artifact(model_eval_artifact=model_evaluation_artifact)
            return model_evaluation_artifact
        except Exception as e:
            raise FinanceException(e, sys)



## training.py

In [20]:
from finance_complaint.exception import FinanceException
from finance_complaint.logger import logging as logger
from finance_complaint.entity.schema import FinanceDataSchema

from finance_complaint.entity.config_entity import (DataIngestionConfig, 
                                                    DataValidationConfig,
                                                    DataTransformationConfig,
                                                    ModelTrainerConfig)
from finance_complaint.entity.artifact_entity import (DataIngestionArtifact,
                                                      DataValidationArtifact,
                                                      DataTransformationArtifact,
                                                      ModelTrainerArtifact)

from finance_complaint.components.data_ingestion import DataIngestion
from finance_complaint.components.data_validation import DataValidation
from finance_complaint.components.data_transformation import DataTransformation
from finance_complaint.components.model_trainer import ModelTrainer

import sys, os


In [21]:
class TrainingPipeline:
    """
    Training pipeline that runs ingestion, validation, transformation,
    model training and model evaluation one after another.
    """
    def __init__(self, training_pipeline_config: TrainingPipelineConfig):
        self.training_pipeline_config: TrainingPipelineConfig = training_pipeline_config

    def start_data_ingestion(self) -> DataIngestionArtifact:
        """
        Build the ingestion config and component, then run data ingestion.
        """
        try:
            ingestion = DataIngestion(
                data_ingestion_config=DataIngestionConfig(training_pipeline_config=self.training_pipeline_config)
            )
            return ingestion.initiate_data_ingestion()
        except Exception as e:
            logger.debug(f"Error: {e}")
            raise FinanceException(e, sys)

    def start_data_validation(self, data_ingestion_artifact: DataIngestionArtifact) -> DataValidationArtifact:
        """
        Validate the ingested data and return the validation artifact.
        """
        try:
            validation = DataValidation(
                data_validation_config=DataValidationConfig(training_pipeline_config=self.training_pipeline_config),
                data_ingestion_artifact=data_ingestion_artifact,
            )
            return validation.initiate_data_validation()
        except Exception as e:
            raise FinanceException(e, sys)

    def start_data_transformation(self, data_validation_artifact: DataValidationArtifact) -> DataTransformationArtifact:
        """
        Transform the validated data and return the transformation artifact.
        """
        try:
            # The config is built first, as before, then handed to the component.
            transformation_config = DataTransformationConfig(training_pipeline_config=self.training_pipeline_config)
            transformation = DataTransformation(
                data_validation_artifact=data_validation_artifact,
                data_transformation_config=transformation_config,
            )
            return transformation.initiate_data_transformation()
        except Exception as e:
            raise FinanceException(e, sys)

    def start_model_trainer(self, data_transformation_artifact: DataTransformationArtifact) -> ModelTrainerArtifact:
        """
        Train the model on the transformed data and return the trainer artifact.
        """
        try:
            trainer_config = ModelTrainerConfig(training_pipeline_config=self.training_pipeline_config)
            trainer = ModelTrainer(
                data_transformation_artifact=data_transformation_artifact,
                model_trainer_config=trainer_config,
            )
            return trainer.initiate_model_training()
        except Exception as e:
            raise FinanceException(e, sys)

    def start_model_evaluation(self, data_validation_artifact, model_trainer_artifact) -> ModelEvaluationArtifact:
        """
        Evaluate the trained model and return the evaluation artifact.
        """
        try:
            evaluation_config = ModelEvaluationConfig(training_pipeline_config=self.training_pipeline_config)
            evaluation = ModelEvaluation(
                data_validation_artifact=data_validation_artifact,
                model_trainer_artifact=model_trainer_artifact,
                model_eval_config=evaluation_config,
            )
            return evaluation.initiate_model_evaluation()
        except Exception as e:
            raise FinanceException(e, sys)

    def start(self):
        """
        Run every stage in order, feeding each stage the artifact of the previous one.
        """
        try:
            # data ingestion
            ingestion_artifact = self.start_data_ingestion()
            # data validation
            validation_artifact = self.start_data_validation(data_ingestion_artifact=ingestion_artifact)
            # data transformation
            transformation_artifact = self.start_data_transformation(data_validation_artifact=validation_artifact)
            # model training
            trainer_artifact = self.start_model_trainer(data_transformation_artifact=transformation_artifact)
            # model evaluation (its artifact is not used further here)
            self.start_model_evaluation(
                data_validation_artifact=validation_artifact,
                model_trainer_artifact=trainer_artifact,
            )
        except Exception as e:
            raise FinanceException(e, sys)
        

## train.py

In [None]:
from finance_complaint.pipeline.training import TrainingPipeline
# Imported here so this cell (and the train.py script it drafts) does not
# depend on an earlier cell having brought TrainingPipelineConfig into scope.
from finance_complaint.entity.config_entity import TrainingPipelineConfig

if __name__ == "__main__":
    # Build the pipeline configuration with its defaults, then run the pipeline.
    training_pipeline_config = TrainingPipelineConfig()
    training_pipeline = TrainingPipeline(training_pipeline_config=training_pipeline_config)
    training_pipeline.start()

                                                                                

Row: [10000] Column: [13]
Expected Column: ['consumer_disputed', 'company_response', 'consumer_consent_provided', 'submitted_via', 'issue', 'date_sent_to_company', 'date_received']
Present Columns: ['company', 'company_response', 'consumer_consent_provided', 'consumer_disputed', 'date_received', 'date_sent_to_company', 'issue', 'product', 'state', 'sub_issue', 'submitted_via', 'timely', 'zip_code']
root
 |-- company: string (nullable = true)
 |-- company_response: string (nullable = true)
 |-- consumer_consent_provided: string (nullable = true)
 |-- consumer_disputed: string (nullable = true)
 |-- date_received: string (nullable = true)
 |-- date_sent_to_company: string (nullable = true)
 |-- issue: string (nullable = true)
 |-- product: string (nullable = true)
 |-- state: string (nullable = true)
 |-- sub_issue: string (nullable = true)
 |-- submitted_via: string (nullable = true)
 |-- timely: string (nullable = true)
 |-- zip_code: string (nullable = true)

{'inputCols': ['company_r

                                                                                

6922 2
3078 2
Train row: 6922 Test row: 3078
Train row: 6922 Test row: 3078
number of row in trauning: 6922
f1 score: 1.0
weightedPrecision score: 1.0
weightedRecall score: 1.0
weightedTruePositiveRate score: 1.0
weightedFalsePositiveRate score: nan
weightedFMeasure score: 1.0
truePositiveRateByLabel score: 1.0
falsePositiveRateByLabel score: nan
precisionByLabel score: 1.0
recallByLabel score: 1.0
fMeasureByLabel score: 1.0
number of rows in testing: 3078
f1 score: 1.0
weightedPrecision score: 1.0
weightedRecall score: 1.0
weightedTruePositiveRate score: 1.0
weightedFalsePositiveRate score: nan
weightedFMeasure score: 1.0
truePositiveRateByLabel score: 1.0
falsePositiveRateByLabel score: nan
precisionByLabel score: 1.0
recallByLabel score: 1.0
fMeasureByLabel score: 1.0
{'inputCols': None, 'outputCols': None}
{'inputCols': None, 'outputCols': None}
{'inputCols': None, 'outputCols': None}


                                                                                

f1 score: 1.0


25/05/11 13:27:14 ERROR Executor: Exception in task 0.0 in stage 375.0 (TID 531)
org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (`StringIndexerModel$$Lambda$4343/144378244`: (string) => double).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:198)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	

FinanceException: Error occured in script: [ /Users/rahulshelke/Documents/Data-Science/Data-Science-Projects/finance_complaint/finance_complaint/pipeline/training.py ] at line number: [ 110 ] error message: [Error occured in script: [ /Users/rahulshelke/Documents/Data-Science/Data-Science-Projects/finance_complaint/finance_complaint/pipeline/training.py ] at line number: [ 83 ] error message: [Error occured in script: [ /Users/rahulshelke/Documents/Data-Science/Data-Science-Projects/finance_complaint/finance_complaint/components/model_evalutaion.py ] at line number: [ 111 ] error message: [Error occured in script: [ /Users/rahulshelke/Documents/Data-Science/Data-Science-Projects/finance_complaint/finance_complaint/components/model_evalutaion.py ] at line number: [ 81 ] error message: [Error occured in script: [ /Users/rahulshelke/Documents/Data-Science/Data-Science-Projects/finance_complaint/finance_complaint/utils.py ] at line number: [ 42 ] error message: [An error occurred while calling o3742.evaluate.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 375.0 failed 1 times, most recent failure: Lost task 0.0 in stage 375.0 (TID 531) (192.168.1.5 executor driver): org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (`StringIndexerModel$$Lambda$4343/144378244`: (string) => double).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:198)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:197)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.SparkException: Unseen label: Fax. To handle unseen labels, set Param handleInvalid to keep.
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1(StringIndexer.scala:406)
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1$adapted(StringIndexer.scala:391)
	... 22 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2458)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1049)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1048)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$collectAsMap$1(PairRDDFunctions.scala:738)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:737)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.confusions$lzycompute(MulticlassMetrics.scala:61)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.confusions(MulticlassMetrics.scala:52)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.labelCountByClass$lzycompute(MulticlassMetrics.scala:66)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.labelCountByClass(MulticlassMetrics.scala:64)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.weightedFMeasure(MulticlassMetrics.scala:227)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.weightedFMeasure$lzycompute(MulticlassMetrics.scala:235)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.weightedFMeasure(MulticlassMetrics.scala:235)
	at org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator.evaluate(MulticlassClassificationEvaluator.scala:152)
	at sun.reflect.GeneratedMethodAccessor178.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (`StringIndexerModel$$Lambda$4343/144378244`: (string) => double).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:198)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:197)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: org.apache.spark.SparkException: Unseen label: Fax. To handle unseen labels, set Param handleInvalid to keep.
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1(StringIndexer.scala:406)
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1$adapted(StringIndexer.scala:391)
	... 22 more
] ] ] ] ] 

25/05/11 16:19:48 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 947713 ms exceeds timeout 120000 ms
25/05/11 16:19:48 WARN SparkContext: Killing executors is not supported by current scheduler.
25/05/11 16:35:12 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$