In [20]:
from dataclasses import dataclass
from pathlib import Path

In [21]:
@dataclass
class DataValidationConfig:
    """Settings consumed by the data-validation stage."""
    accepted_data_dir: Path  # directory into which the accepted (validated) data is written
    rejected_data_dir: Path  # directory under which rejected data is written
    file_name: str  # name of the output written inside those directories

In [22]:
@dataclass
class DataValidationArtifact:
    """Result handed back by the data-validation stage."""
    accepted_file_path: Path  # location of the accepted data
    rejected_dir: Path  # directory that holds the rejected data


In [23]:
from consumerComplaint.constants.training_pipeline_config import *

In [24]:
import os , sys
from datetime import datetime
from pathlib import Path
from consumerComplaint.constants.training_pipeline_config.data_validation import *
from consumerComplaint.constants.training_pipeline_config import *
from consumerComplaint.constants import TIMESTAMP
from consumerComplaint.constants import *
from consumerComplaint.logger import logger
from consumerComplaint.exception import ConsumerComplaintException
from consumerComplaint.entity.metadata_entity import DataIngestionMetadata
from consumerComplaint.entity.config_entity import DataIngestionConfig, TrainingPipelineConfig, DataValidationConfig



class FinanceConfig:
    """Factory for the per-stage configuration objects of the training pipeline.

    Every path is derived from ``PIPELINE_ARTIFACT_DIR`` and the run ``timestamp``,
    so each run gets its own sub-directory per stage.
    """

    def __init__(self, pipeline_name=PIPELINE_NAME, timestamp=TIMESTAMP):
        """
        Organization: iNeuron Intelligence Private Limited

        Args:
            pipeline_name: Name recorded in the pipeline configuration.
            timestamp: Run identifier; used as the name of the per-run sub-directory.

        Raises:
            ConsumerComplaintException: if the pipeline configuration cannot be built.
        """
        self.timestamp = timestamp
        self.pipeline_name = pipeline_name
        self.pipeline_config = self.get_pipeline_config()

    def get_pipeline_config(self) -> TrainingPipelineConfig:
        """Build the pipeline-level configuration.

        Returns:
            A ``TrainingPipelineConfig`` carrying ``pipeline_name`` and ``artifact_dir``.

        Raises:
            ConsumerComplaintException: wraps any error raised while building the config.
        """
        try:
            artifact_dir = PIPELINE_ARTIFACT_DIR
            pipeline_config = TrainingPipelineConfig(pipeline_name=self.pipeline_name,
                                                     artifact_dir=artifact_dir)

            logger.info(f"Pipeline configuration: {pipeline_config}")

            return pipeline_config
        except Exception as e:
            raise ConsumerComplaintException(e, sys) from e

    def get_data_ingestion_config(self,
                                  from_date=DATA_INGESTION_MIN_START_DATE,
                                  to_date=None) -> DataIngestionConfig:
        """Build the configuration of the data-ingestion stage.

        Args:
            from_date: Start date as ``YYYY-MM-DD``. Dates earlier than
                ``DATA_INGESTION_MIN_START_DATE`` are clamped to that minimum.
            to_date: End date as ``YYYY-MM-DD``; defaults to today's date.

        Returns:
            A ``DataIngestionConfig`` whose per-run directories live under
            ``<artifact_dir>/<DATA_INGESTION_DIR>/<timestamp>``.

        Raises:
            ConsumerComplaintException: wraps any error, including an unparsable date.
        """
        try:
            date_format = "%Y-%m-%d"
            min_start_date = datetime.strptime(DATA_INGESTION_MIN_START_DATE, date_format)
            from_date_obj = datetime.strptime(from_date, date_format)
            if from_date_obj < min_start_date:
                from_date = DATA_INGESTION_MIN_START_DATE
            if to_date is None:
                to_date = datetime.now().strftime(date_format)

            # Master directory for data ingestion: metadata and the feature store are kept
            # here, outside the per-run directory, to avoid redundant downloads.
            data_ingestion_master_dir = Path(self.pipeline_config.artifact_dir) / DATA_INGESTION_DIR

            # Create a time-based directory for each run
            data_ingestion_dir = data_ingestion_master_dir / self.timestamp

            metadata_file_path = data_ingestion_master_dir / DATA_INGESTION_METADATA_FILE_NAME

            data_ingestion_metadata = DataIngestionMetadata(metadata_file_path=metadata_file_path)

            # When metadata exists, start from the `to_date` it recorded; this overrides
            # both the argument and the clamped minimum above.
            if data_ingestion_metadata.is_metadata_file_present:
                metadata_info = data_ingestion_metadata.get_metadata_info()
                from_date = metadata_info.to_date

            data_ingestion_config = DataIngestionConfig(
                from_date=from_date,
                to_date=to_date,
                data_ingestion_dir=data_ingestion_dir,
                download_dir=os.path.join(data_ingestion_dir, DATA_INGESTION_DOWNLOADED_DATA_DIR),
                file_name=DATA_INGESTION_FILE_NAME,
                feature_store_dir=os.path.join(data_ingestion_master_dir, DATA_INGESTION_FEATURE_STORE_DIR),
                failed_dir=os.path.join(data_ingestion_dir, DATA_INGESTION_FAILED_DIR),
                metadata_file_path=metadata_file_path,
                datasource_url=DATA_INGESTION_DATA_SOURCE_URL
            )
            logger.info(f"Data ingestion config: {data_ingestion_config}")
            return data_ingestion_config

        except Exception as e:
            raise ConsumerComplaintException(e, sys) from e

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the configuration of the data-validation stage.

        Returns:
            A ``DataValidationConfig`` whose accepted and rejected directories live under
            ``<artifact_dir>/<DATA_VALIDATION_DIR>/<timestamp>``.

        Raises:
            ConsumerComplaintException: wraps any error raised while building the config.
        """
        try:
            data_validation_master_dir = Path(self.pipeline_config.artifact_dir) / DATA_VALIDATION_DIR
            data_validation_dir = data_validation_master_dir / self.timestamp

            accepted_data_dir = data_validation_dir / DATA_VALIDATION_ACCEPTED_DATA_DIR
            rejected_data_dir = data_validation_dir / DATA_VALIDATION_REJECTED_DATA_DIR

            data_validation_config = DataValidationConfig(
                accepted_data_dir=accepted_data_dir,
                rejected_data_dir=rejected_data_dir,
                file_name=DATA_VALIDATION_FILE_NAME
            )

            # Previously logged as "Data preprocessing config", which mislabelled this stage.
            logger.info(f"Data validation config: {data_validation_config}")

            return data_validation_config
        except Exception as e:
            raise ConsumerComplaintException(e, sys) from e

In [25]:
import os, sys


In [26]:
# Build the pipeline configuration, then derive the data-validation settings from it.
config = FinanceConfig()
data_validation_config = config.get_data_validation_config()

In [27]:
data_validation_config

DataValidationConfig(accepted_data_dir=PosixPath('/home/suyodhan/Documents/Data-Science-Project/Consumer-Complaint-Dispute-Prediction/research/consumer_artifact/data_validation/20230923_143441/accepted_data'), rejected_data_dir=PosixPath('/home/suyodhan/Documents/Data-Science-Project/Consumer-Complaint-Dispute-Prediction/research/consumer_artifact/data_validation/20230923_143441/rejected_data'), file_name='consumer_complaint')

In [54]:
import os
import sys
from collections import namedtuple
from typing import List, Dict

from pyspark.sql import DataFrame
from pyspark.sql.functions import col



from consumerComplaint.config.spark_manager import spark_session
from consumerComplaint.entity.artifact_entity import DataIngestionArtifact
from consumerComplaint.entity.config_entity import DataValidationConfig
from consumerComplaint.entity.schema import FinanceDataSchema
from consumerComplaint.exception import ConsumerComplaintException
from consumerComplaint.logger import logger

from pyspark.sql.functions import lit
from consumerComplaint.entity.artifact_entity import DataValidationArtifact

# Default value of DataValidation's `table_name` argument.
COMPLAINT_TABLE = "complaint"
# Name of the column added to rejected data to record why it was rejected.
ERROR_MESSAGE = "error_msg"
# Per-column missing-value summary: total rows, null rows and null percentage (0-100).
MissingReport = namedtuple("MissingReport", ["total_row", "missing_row", "missing_percentage"])


class DataValidation(FinanceDataSchema):
    """Data-validation stage: drops unusable columns and persists accepted / rejected data.

    Reads the parquet file referenced by the data-ingestion artifact, removes the
    schema's unwanted columns plus every column whose share of nulls exceeds a
    threshold, checks that all required columns are still present and writes the
    remaining data as parquet.
    """

    def __init__(self,
                 data_validation_config: DataValidationConfig,
                 data_ingestion_artifact: DataIngestionArtifact,
                 table_name: str = COMPLAINT_TABLE,
                 schema=None
                 ):
        """
        Args:
            data_validation_config: Output directories and file name for this stage.
            data_ingestion_artifact: Artifact whose ``feature_store_file_path`` is read.
            table_name: Stored on the instance; defaults to ``COMPLAINT_TABLE``.
            schema: Object providing ``unwanted_columns`` and ``required_columns``.
                A fresh ``FinanceDataSchema`` is created when omitted.

        Raises:
            ConsumerComplaintException: wraps any error raised during initialisation.
        """
        try:
            super().__init__()
            self.data_ingestion_artifact: DataIngestionArtifact = data_ingestion_artifact
            self.data_validation_config = data_validation_config
            self.table_name = table_name
            # Built per instance instead of once in the signature, so instances never
            # share a single default schema object.
            self.schema = FinanceDataSchema() if schema is None else schema
        except Exception as e:
            raise ConsumerComplaintException(e, sys) from e

    def read_data(self) -> DataFrame:
        """Load the ingested parquet data, capped at 10,000 rows.

        Returns:
            The Spark DataFrame read from ``feature_store_file_path``.

        Raises:
            ConsumerComplaintException: wraps any error raised while reading.
        """
        try:
            dataframe: DataFrame = spark_session.read.parquet(
                self.data_ingestion_artifact.feature_store_file_path
            ).limit(10000)
            logger.info(f"Data frame is created using file: {self.data_ingestion_artifact.feature_store_file_path}")
            logger.info(f"Number of row: {dataframe.count()} and column: {len(dataframe.columns)}")
            #dataframe, _ = dataframe.randomSplit([0.001, 0.999])
            return dataframe
        except Exception as e:
            raise ConsumerComplaintException(e, sys) from e

    @staticmethod
    def get_missing_report(dataframe: DataFrame, ) -> Dict[str, MissingReport]:
        """Count nulls per column.

        Args:
            dataframe: Spark DataFrame to inspect.

        Returns:
            Mapping of column name to ``MissingReport``. ``missing_percentage`` is on a
            0-100 scale and is 0.0 for an empty DataFrame.

        Raises:
            ConsumerComplaintException: wraps any error raised while counting.
        """
        try:
            missing_report: Dict[str, MissingReport] = dict()
            logger.info("Preparing missing reports for each column")
            number_of_row = dataframe.count()

            for column in dataframe.columns:
                missing_row = dataframe.filter(f"{column} is null").count()
                # An empty frame has nothing missing; avoid dividing by zero.
                missing_percentage = (missing_row * 100) / number_of_row if number_of_row else 0.0
                missing_report[column] = MissingReport(total_row=number_of_row,
                                                       missing_row=missing_row,
                                                       missing_percentage=missing_percentage
                                                       )
            logger.info(f"Missing report prepared: {missing_report}")
            return missing_report

        except Exception as e:
            raise ConsumerComplaintException(e, sys) from e

    def get_unwanted_and_high_missing_value_columns(self, dataframe: DataFrame, threshold: float = 0.2) -> List[str]:
        """List the columns that should be dropped.

        Args:
            dataframe: Spark DataFrame to inspect.
            threshold: Maximum tolerated fraction (0-1) of nulls in a column.

        Returns:
            The schema's unwanted columns followed by every column whose null
            percentage exceeds ``threshold``, without duplicates and in that order.

        Raises:
            ConsumerComplaintException: wraps any error raised while building the list.
        """
        try:
            missing_report: Dict[str, MissingReport] = self.get_missing_report(dataframe=dataframe)

            # Copy, so appending below never mutates a list owned by the schema.
            unwanted_column: List[str] = list(self.schema.unwanted_columns)
            for column in missing_report:
                if missing_report[column].missing_percentage > (threshold * 100):
                    unwanted_column.append(column)
                    logger.info(f"Missing report {column}: [{missing_report[column]}]")
            # dict.fromkeys removes duplicates while keeping a deterministic order.
            unwanted_column = list(dict.fromkeys(unwanted_column))
            return unwanted_column

        except Exception as e:
            raise ConsumerComplaintException(e, sys) from e

    def drop_unwanted_columns(self, dataframe: DataFrame) -> DataFrame:
        """Write the unwanted columns to the rejected directory and drop them.

        The rejected columns are appended, with an ``ERROR_MESSAGE`` column, to
        ``<rejected_data_dir>/missing_data/<file_name>``.

        Args:
            dataframe: Spark DataFrame to clean.

        Returns:
            ``dataframe`` without the unwanted columns.

        Raises:
            ConsumerComplaintException: wraps any error raised while writing or dropping.
        """
        try:
            unwanted_columns: List = self.get_unwanted_and_high_missing_value_columns(dataframe=dataframe, )
            logger.info(f"Dropping feature: {','.join(unwanted_columns)}")
            unwanted_dataframe: DataFrame = dataframe.select(unwanted_columns)

            unwanted_dataframe = unwanted_dataframe.withColumn(ERROR_MESSAGE, lit("Contains many missing values"))

            rejected_dir = Path(self.data_validation_config.rejected_data_dir) / "missing_data"
            # parents=True: the timestamped rejected directory may not exist yet.
            rejected_dir.mkdir(parents=True, exist_ok=True)
            file_path = rejected_dir / self.data_validation_config.file_name

            logger.info(f"Writing dropped column into file: [{file_path}]")
            # The Spark writer expects a string path, not a pathlib.Path.
            unwanted_dataframe.write.mode("append").parquet(str(file_path))
            dataframe: DataFrame = dataframe.drop(*unwanted_columns)
            logger.info(f"Remaining number of columns: [{dataframe.columns}]")
            return dataframe
        except Exception as e:
            raise ConsumerComplaintException(e, sys) from e

    @staticmethod
    def get_unique_values_of_each_column(dataframe: DataFrame) -> None:
        """Log the distinct-value count and null percentage of every column.

        Args:
            dataframe: Spark DataFrame to inspect.

        Raises:
            ConsumerComplaintException: wraps any error raised while counting.
        """
        try:
            # The row count does not change between columns; compute it once.
            total_rows: int = dataframe.count()
            for column in dataframe.columns:
                n_unique: int = dataframe.select(col(column)).distinct().count()
                n_missing: int = dataframe.filter(col(column).isNull()).count()
                missing_percentage: float = (n_missing * 100) / total_rows if total_rows else 0.0
                logger.info(f"Column: {column} contains {n_unique} value and missing perc: {missing_percentage} %.")
        except Exception as e:
            raise ConsumerComplaintException(e, sys) from e

    def is_required_columns_exist(self, dataframe: DataFrame):
        """Verify that every column in ``schema.required_columns`` is present.

        Args:
            dataframe: Spark DataFrame to check.

        Raises:
            ConsumerComplaintException: if a required column is missing.
        """
        try:
            columns = list(filter(lambda x: x in self.schema.required_columns,
                                  dataframe.columns))

            if len(columns) != len(self.schema.required_columns):
                raise Exception(f"Required column missing\n\
                 Expected columns: {self.schema.required_columns}\n\
                 Found columns: {columns}\
                 ")

        except Exception as e:
            raise ConsumerComplaintException(e, sys) from e

    # Disabled step, kept for reference (see the commented call in initiate_data_validation).
    # def drop_row_without_target_label(self, dataframe: DataFrame) -> DataFrame:
    #     try:
    #         dropped_rows = "dropped_row"
    #         total_rows: int = dataframe.count()
    #         logger.info(f"Number of row: {total_rows} ")
    #
    #         # Drop row if target value is unknown
    #         logger.info(f"Dropping rows without target value.")
    #         unlabelled_dataframe: DataFrame = dataframe.filter(f"{self.target_column}== 'N/A'")
    #
    #         rejected_dir = os.path.join(self.data_validation_config.rejected_data_dir, dropped_rows)
    #         os.makedirs(rejected_dir, exist_ok=True)
    #         file_path = os.path.join(rejected_dir, self.data_validation_config.file_name)
    #
    #         unlabelled_dataframe = unlabelled_dataframe.withColumn(ERROR_MESSAGE, lit("Dropped row as target label is "
    #                                                                                   "unknown"))
    #
    #         logger.info(f"Unlabelled data has row: [{unlabelled_dataframe.count()}] and columns:"
    #                     f" [{len(unlabelled_dataframe.columns)}]")
    #
    #         logger.info(f"Write unlabelled data into rejected file path: [{file_path}]")
    #         unlabelled_dataframe.write.mode("append").parquet(file_path)
    #
    #         dataframe: DataFrame = dataframe.filter(f"{self.target_column}!= 'N/A'")
    #
    #         logger.info(f"Remaining data has rows: [{dataframe.count()}] and columns: [{len(dataframe.columns)}]")
    #         return dataframe
    #     except Exception as e:
    #         raise ConsumerComplaintException(e, sys)

    def initiate_data_validation(self) -> DataValidationArtifact:
        """Run the whole validation stage.

        Reads the ingested data, drops unwanted columns, verifies the required
        columns and writes the result to ``<accepted_data_dir>/<file_name>``.

        Returns:
            A ``DataValidationArtifact`` with the accepted file path and the rejected directory.

        Raises:
            ConsumerComplaintException: wraps any error raised by a step.
        """
        try:
            logger.info("Initiating data preprocessing.")
            dataframe: DataFrame = self.read_data()
            # dataframe = self.drop_row_without_target_label(dataframe=dataframe)

            logger.info("Dropping unwanted columns")
            dataframe: DataFrame = self.drop_unwanted_columns(dataframe=dataframe)

            # A Spark DataFrame has no len(); count() returns the number of rows.
            row_count = dataframe.count()
            print(f"dataframe length: {row_count}")

            # validation to ensure that all require column available
            self.is_required_columns_exist(dataframe=dataframe)

            logger.info("Saving preprocessed data.")
            print(f"Row: [{row_count}] Column: [{len(dataframe.columns)}]")
            # Report the same schema that is_required_columns_exist validated against.
            print(f"Expected Column: {self.schema.required_columns}\nPresent Columns: {dataframe.columns}")

            # Create the accepted data directory if it doesn't exist
            accepted_dir = Path(self.data_validation_config.accepted_data_dir)
            accepted_dir.mkdir(parents=True, exist_ok=True)

            # Create the accepted file path using Path
            accepted_file_path = accepted_dir / self.data_validation_config.file_name

            # The Spark writer expects a string path, not a pathlib.Path.
            dataframe.write.parquet(str(accepted_file_path))

            artifact = DataValidationArtifact(accepted_file_path=accepted_file_path,
                                              rejected_dir=self.data_validation_config.rejected_data_dir
                                              )
            logger.info(f"Data validation artifact: [{artifact}]")
            return artifact
        except Exception as e:
            raise ConsumerComplaintException(e, sys) from e


In [10]:
from consumerComplaint.components.training.data_ingestion import DataIngestion

In [55]:
# Run the data-ingestion stage; the validation cell below consumes `data_ingestion_artifact`.
config = FinanceConfig()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(data_ingestion_config=data_ingestion_config)
data_ingestion_artifact = data_ingestion.initiate_data_ingestion()

ConsumerComplaintException: <module 'sys' (built-in)>

In [29]:
# Requires `config` and `data_ingestion_artifact` from the data-ingestion cell above;
# run that cell successfully first, otherwise this cell raises NameError.
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(data_ingestion_artifact=data_ingestion_artifact, data_validation_config=data_validation_config, )
data_validation.initiate_data_validation()


NameError: name 'data_ingestion_artifact' is not defined

In [30]:
# Load the ingested feature store directly for ad-hoc inspection, capped at 10,000 rows.
# NOTE(review): absolute, machine-specific path; consider deriving it from the pipeline config.
dataframe = spark_session.read.parquet(
                "/home/suyodhan/Documents/Data-Science-Project/Consumer-Complaint-Dispute-Prediction/consumer_artifact/data_ingestion/feature_store/consumer_complaint"
            ).limit(10000)

In [31]:
dataframe_10 = dataframe.limit(10)

In [32]:
dataframe.show()

+--------------------+-----------------------+----------------+------------+-----------------------+-------------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+-------------+----+------+--------+
|             company|company_public_response|company_response|complaint_id|complaint_what_happened|consumer_consent_provided|consumer_disputed|       date_received|date_sent_to_company|               issue|             product|state|           sub_issue|         sub_product|submitted_via|tags|timely|zip_code|
+--------------------+-----------------------+----------------+------------+-----------------------+-------------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+-------------+----+------+--------+
|TRANSUNION INTERM...|                   NULL|     In progress| 

In [33]:
@dataclass
class MissingReport:
    """Missing-value summary for a single column.

    The fields were annotated as ``str``, but they hold two row counts and a
    percentage, so the annotations now match the stored values.
    """
    total_row: int  # number of rows in the inspected DataFrame
    missing_row: int  # number of rows in which the column is null
    missing_percentage: float  # missing_row as a percentage (0-100) of total_row

In [34]:
# Missing report
def get_missing_report(dataframe,):
    """Summarise missing values for every column of a DataFrame.

    Args:
        dataframe: Object exposing ``count()``, ``columns`` and ``filter(condition)``
            the way a Spark DataFrame does.

    Returns:
        dict mapping each column name to a ``MissingReport`` with the total row count,
        the number of null rows and the null percentage (0-100). For an empty frame
        the percentage is 0.0 rather than a ``ZeroDivisionError``.
    """
    missing_report = dict()
    # logger.info(f"Preparing missing reports for each column")
    number_of_row = dataframe.count()

    for column in dataframe.columns:
        missing_row = dataframe.filter(f"{column} is null").count()
        # An empty frame has nothing missing; avoid dividing by zero.
        missing_percentage = (missing_row * 100) / number_of_row if number_of_row else 0.0
        missing_report[column] = MissingReport(total_row=number_of_row,
                                                missing_row=missing_row,
                                                missing_percentage=missing_percentage
                                                )
    # logger.info(f"Missing report prepared: {missing_report}")
    return missing_report

In [35]:
print(get_missing_report(dataframe=dataframe))

{'company': MissingReport(total_row=10000, missing_row=0, missing_percentage=0.0), 'company_public_response': MissingReport(total_row=10000, missing_row=9702, missing_percentage=97.02), 'company_response': MissingReport(total_row=10000, missing_row=0, missing_percentage=0.0), 'complaint_id': MissingReport(total_row=10000, missing_row=0, missing_percentage=0.0), 'complaint_what_happened': MissingReport(total_row=10000, missing_row=0, missing_percentage=0.0), 'consumer_consent_provided': MissingReport(total_row=10000, missing_row=8789, missing_percentage=87.89), 'consumer_disputed': MissingReport(total_row=10000, missing_row=0, missing_percentage=0.0), 'date_received': MissingReport(total_row=10000, missing_row=0, missing_percentage=0.0), 'date_sent_to_company': MissingReport(total_row=10000, missing_row=0, missing_percentage=0.0), 'issue': MissingReport(total_row=10000, missing_row=0, missing_percentage=0.0), 'product': MissingReport(total_row=10000, missing_row=0, missing_percentage=0.

In [36]:
from consumerComplaint.config.spark_manager import spark_session

## SCHEMA 

In [37]:



from typing import List
from pyspark.sql.types import TimestampType, StringType, StructType, StructField
from dataclasses import dataclass
from consumerComplaint.exception import ConsumerComplaintException
import os, sys


@dataclass
class FeatureProperties:
    """Name and logical type of one column of the complaint schema."""
    col_name: str  # column name
    data_type: str  # 'timestamp' maps to TimestampType in FinanceDataSchema.dataframe_schema; any other value maps to StringType

class FinanceDataSchema:
    """Column catalogue for the consumer-complaint dataset.

    Holds the raw columns with their logical types and exposes, as properties, the
    column-name lists and derived column names used elsewhere in this notebook.
    """

    def __init__(self):
        """Register every raw column together with its logical data type."""
        self.features = [
            FeatureProperties('company_response', 'string'),
            FeatureProperties('consumer_consent_provided', 'string'),
            FeatureProperties('submitted_via', 'string'),
            FeatureProperties('timely', 'string'),
            FeatureProperties('date_sent_to_company', 'timestamp'),
            FeatureProperties('date_received', 'timestamp'),
            FeatureProperties('company', 'string'),
            FeatureProperties('issue', 'string'),
            FeatureProperties('product', 'string'),
            FeatureProperties('state', 'string'),
            FeatureProperties('zip_code', 'string'),
            FeatureProperties('consumer_disputed', 'string')
        ]

    @property
    def dataframe_schema(self) -> StructType:
        """Spark schema with one field per entry of ``self.features``.

        'timestamp' features become ``TimestampType``; all others ``StringType``.

        Raises:
            ConsumerComplaintException: wraps any error raised while building the schema.
        """
        try:
            schema = StructType([
                StructField(feature.col_name, TimestampType() if feature.data_type == 'timestamp' else StringType())
                for feature in self.features
            ])
            return schema
        except Exception as e:
            raise ConsumerComplaintException(e, sys) from e

    @property
    def target_column(self) -> str:
        """Name of the target column."""
        return 'consumer_disputed'

    @property
    def one_hot_encoding_features(self) -> List[str]:
        """Columns listed for one-hot encoding."""
        return ['company_response',
                'consumer_consent_provided',
                'submitted_via']

    @property
    def im_one_hot_encoding_features(self) -> List[str]:
        """``one_hot_encoding_features`` with an ``im_`` prefix."""
        return [f"im_{col}" for col in self.one_hot_encoding_features]

    @property
    def string_indexer_one_hot_features(self) -> List[str]:
        """``one_hot_encoding_features`` with an ``si_`` prefix."""
        return [f"si_{col}" for col in self.one_hot_encoding_features]

    @property
    def tf_one_hot_encoding_features(self) -> List[str]:
        """``one_hot_encoding_features`` with a ``tf_`` prefix."""
        return [f"tf_{col}" for col in self.one_hot_encoding_features]

    @property
    def tfidf_features(self) -> List[str]:
        """Columns listed for TF-IDF.

        Returns a list, as annotated. The bare string ``"issue"`` returned before made
        ``required_columns`` and ``required_prediction_columns`` fail with
        ``TypeError: can only concatenate list (not "str") to list``.
        """
        return ["issue"]

    @property
    def derived_input_features(self) -> List[str]:
        """Columns from which derived features are computed."""
        features = [
            "date_sent_to_company",
            "date_received"
        ]
        return features

    @property
    def derived_output_features(self) -> List[str]:
        """Names of the derived columns."""
        return ["diff_in_days"]

    @property
    def numerical_columns(self) -> List[str]:
        """Numerical columns; identical to ``derived_output_features``."""
        return self.derived_output_features

    @property
    def im_numerical_columns(self) -> List[str]:
        """``numerical_columns`` with an ``im_`` prefix."""
        return [f"im_{col}" for col in self.numerical_columns]

    @property
    def tfidf_feature(self) -> List[str]:
        """Same single-column list as ``tfidf_features``; kept for existing callers."""
        return ["issue"]

    @property
    def tf_tfidf_features(self) -> List[str]:
        """``tfidf_feature`` with a ``tf_`` prefix."""
        return [f"tf_{col}" for col in self.tfidf_feature]

    @property
    def input_features(self) -> List[str]:
        """Concatenation of the ``tf_`` one-hot, ``im_`` numerical and ``tf_`` TF-IDF names."""
        in_features = self.tf_one_hot_encoding_features + self.im_numerical_columns + self.tf_tfidf_features
        return in_features

    @property
    def required_columns(self) -> List[str]:
        """Target column followed by ``required_prediction_columns``."""
        features = [self.target_column] + self.one_hot_encoding_features + self.tfidf_features + \
                   ["date_sent_to_company", "date_received"]
        return features

    @property
    def required_prediction_columns(self) -> List[str]:
        """One-hot columns, TF-IDF columns and the two date columns (no target)."""
        features = self.one_hot_encoding_features + self.tfidf_features + \
                   ["date_sent_to_company", "date_received"]
        return features

    @property
    def unwanted_columns(self) -> List[str]:
        """Columns listed as unwanted; a new list is returned on every access."""
        features = ["complaint_id",
                    "sub_product",
                    "complaint_what_happened"]

        return features

    @property
    def vector_assembler_output(self) -> str:
        """Column name ``va_input_features``."""
        return "va_input_features"

    @property
    def scaled_vector_input_features(self) -> str:
        """Column name ``scaled_input_features``."""
        return "scaled_input_features"

    @property
    def target_indexed_label(self) -> str:
        """Target column name with an ``indexed_`` prefix."""
        return f"indexed_{self.target_column}"

    @property
    def prediction_column_name(self) -> str:
        """Column name ``prediction``."""
        return "prediction"

    @property
    def prediction_label_column_name(self) -> str:
        """``<prediction_column_name>_<target_column>``."""
        return f"{self.prediction_column_name}_{self.target_column}"


In [38]:
finance_schema = FinanceDataSchema()

In [39]:
finance_schema.dataframe_schema

StructType(List(StructField(company_response,StringType,true),StructField(consumer_consent_provided,StringType,true),StructField(submitted_via,StringType,true),StructField(timely,StringType,true),StructField(date_sent_to_company,TimestampType,true),StructField(date_received,TimestampType,true),StructField(company,StringType,true),StructField(issue,StringType,true),StructField(product,StringType,true),StructField(state,StringType,true),StructField(zip_code,StringType,true),StructField(consumer_disputed,StringType,true)))

## Alternative SCHEMA

In [40]:
from typing import List
from pyspark.sql.types import TimestampType, StringType, FloatType, StructType, StructField
from  consumerComplaint.exception import ConsumerComplaintException
import os, sys

from pyspark.sql import DataFrame
from typing import Dict


class FinanceDataSchema:
    """Alternative schema: every column name is exposed as a ``col_*`` attribute."""

    def __init__(self):
        """Store the column names as instance attributes."""
        # Columns that appear in dataframe_schema.
        self.col_company_response = 'company_response'
        self.col_consumer_consent_provided = 'consumer_consent_provided'
        self.col_submitted_via = 'submitted_via'
        self.col_timely = 'timely'
        self.col_date_sent_to_company = "date_sent_to_company"
        self.col_date_received = "date_received"
        self.col_company = 'company'
        self.col_issue = 'issue'
        self.col_product = 'product'
        self.col_state = 'state'
        self.col_zip_code = 'zip_code'
        self.col_consumer_disputed = 'consumer_disputed'
        # Column names that dataframe_schema does not include.
        self.col_diff_in_days = 'diff_in_days'
        self.col_complaint_id = "complaint_id"
        self.col_sub_product = "sub_product"
        self.col_complaint_what_happened = "complaint_what_happened"
        self.col_company_public_response = "company_public_response"

    @property
    def dataframe_schema(self) -> StructType:
        """Spark schema of the twelve columns, in a fixed order.

        The two date columns are ``TimestampType``; every other column is ``StringType``.

        Raises:
            ConsumerComplaintException: wraps any error raised while building the schema.
        """
        try:
            # (column name, Spark type) pairs in the order the fields must appear.
            column_types = (
                (self.col_company_response, StringType),
                (self.col_consumer_consent_provided, StringType),
                (self.col_submitted_via, StringType),
                (self.col_timely, StringType),
                (self.col_date_sent_to_company, TimestampType),
                (self.col_date_received, TimestampType),
                (self.col_company, StringType),
                (self.col_issue, StringType),
                (self.col_product, StringType),
                (self.col_state, StringType),
                (self.col_zip_code, StringType),
                (self.col_consumer_disputed, StringType),
            )
            return StructType([StructField(name, spark_type()) for name, spark_type in column_types])

        except Exception as e:
            raise ConsumerComplaintException(e, sys) from e


In [41]:
d = FinanceDataSchema()
schema = d.dataframe_schema  # Correct way to access the property


In [42]:
schema

StructType(List(StructField(company_response,StringType,true),StructField(consumer_consent_provided,StringType,true),StructField(submitted_via,StringType,true),StructField(timely,StringType,true),StructField(date_sent_to_company,TimestampType,true),StructField(date_received,TimestampType,true),StructField(company,StringType,true),StructField(issue,StringType,true),StructField(product,StringType,true),StructField(state,StringType,true),StructField(zip_code,StringType,true),StructField(consumer_disputed,StringType,true)))

In [43]:
if "StructType(List(StructField(company_response,StringType,true),StructField(consumer_consent_provided,StringType,true),StructField(submitted_via,StringType,true),StructField(timely,StringType,true),StructField(date_sent_to_company,TimestampType,true),StructField(date_received,TimestampType,true),StructField(company,StringType,true),StructField(issue,StringType,true),StructField(product,StringType,true),StructField(state,StringType,true),StructField(zip_code,StringType,true),StructField(consumer_disputed,StringType,true)))" == "StructType(List(StructField(company_response,StringType,true),StructField(consumer_consent_provided,StringType,true),StructField(submitted_via,StringType,true),StructField(timely,StringType,true),StructField(date_sent_to_company,TimestampType,true),StructField(date_received,TimestampType,true),StructField(company,StringType,true),StructField(issue,StringType,true),StructField(product,StringType,true),StructField(state,StringType,true),StructField(zip_code,StringType,true),StructField(consumer_disputed,StringType,true)))":
    print("good")

good


In [44]:
dataframe: DataFrame = spark_session.read.parquet(
    "/home/suyodhan/Documents/Data-Science-Project/Consumer-Complaint-Dispute-Prediction/consumer_artifact/data_ingestion/feature_store/consumer_complaint"
).limit(10000)


In [45]:
dataframe.show()

+--------------------+-----------------------+----------------+------------+-----------------------+-------------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+-------------+----+------+--------+
|             company|company_public_response|company_response|complaint_id|complaint_what_happened|consumer_consent_provided|consumer_disputed|       date_received|date_sent_to_company|               issue|             product|state|           sub_issue|         sub_product|submitted_via|tags|timely|zip_code|
+--------------------+-----------------------+----------------+------------+-----------------------+-------------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+-------------+----+------+--------+
|TRANSUNION INTERM...|                   NULL|     In progress| 

In [50]:
from consumerComplaint.entity.schema import FinanceDataSchema

In [51]:
schema = FinanceDataSchema()

In [53]:
schema.required_columns

TypeError: can only concatenate list (not "str") to list

In [18]:
a = ['consumer_disputed'] + ['company_response', 
                'consumer_consent_provided', 
                'submitted_via'] + ["issue"] + ["date_sent_to_company", "date_received"]

In [19]:
a

['consumer_disputed',
 'company_response',
 'consumer_consent_provided',
 'submitted_via',
 'issue',
 'date_sent_to_company',
 'date_received']