In [0]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import (
    StructField
    , StringType
    , IntegerType
    , DoubleType
    , StructType
)

In [0]:
class DataPipeline:
    """Extract, reconcile and merge several bank-churn CSV exports with Spark.

    Every source file is read with an explicit schema, its columns are renamed
    to ``<target name>_<file suffix>`` (the join key becomes ``customer_id``),
    the sources are full-outer-joined on ``customer_id`` and the joined row of
    each customer is collapsed into one row of ``target_df``.

    Attributes:
        spark: Session used for every read and DataFrame creation.
        file_paths: Maps a source file name to the path it is read from.
        spark_dataframes: Maps a source file name to its DataFrame
            (``None`` until ``extract`` has run).
        joined_df: Full outer join of all sources (``None`` until ``transform``).
        target_df: Reconciled one-row-per-customer table (``None`` until
            ``transform``).
    """

    # Column every source is joined on; it is the only column that is not
    # suffixed with the file name when renaming.
    _JOIN_KEY = 'customer_id'

    # Column layouts are tuples of
    #   (column name used when reading, Spark type class, target base name).
    _ID_COLUMN = (('id', IntegerType, 'id'),)
    _ROWNUM_COLUMN = (('RowNumber', IntegerType, 'rownum'),)
    _CORE_COLUMNS = (
        ('CustomerId', IntegerType, 'customer_id')
        , ('Surname', StringType, 'surname')
        , ('CreditScore', IntegerType, 'credit_score')
        , ('Geography', StringType, 'geography')
        , ('Gender', StringType, 'gender')
        , ('Age', IntegerType, 'age')
        , ('Tenure', IntegerType, 'tenure')
        , ('Balance', DoubleType, 'balance')
        , ('NumOfProducts', IntegerType, 'product_count')
        , ('HasCrCard', IntegerType, 'has_creditcard')
        , ('IsActiveMember', IntegerType, 'active_member')
        , ('EstimatedSalary', DoubleType, 'estimated_salary')
    )
    _LABEL_COLUMN = (('Exited', IntegerType, 'churn'),)
    _EXTRA_COLUMNS = (
        ('Complain', IntegerType, 'complain')
        , ('Satisfaction Score', IntegerType, 'satisfaction_score')
        , ('Card Type', StringType, 'card_type')
        , ('Point Earned', IntegerType, 'points_earned')
    )
    # This export already uses snake_case headers and has no row number / surname.
    _BANK_COLUMNS = (
        ('customer_id', IntegerType, 'customer_id')
        , ('credit_score', IntegerType, 'credit_score')
        , ('country', StringType, 'geography')
        , ('gender', StringType, 'gender')
        , ('age', IntegerType, 'age')
        , ('tenure', IntegerType, 'tenure')
        , ('balance', DoubleType, 'balance')
        , ('products_number', IntegerType, 'product_count')
        , ('credit_card', IntegerType, 'has_creditcard')
        , ('active_member', IntegerType, 'active_member')
        , ('estimated_salary', DoubleType, 'estimated_salary')
        , ('churn', IntegerType, 'churn')
    )
    _STANDARD_LAYOUT = _ROWNUM_COLUMN + _CORE_COLUMNS + _LABEL_COLUMN

    # Insertion order matters: it is the order in which the sources are read
    # and later joined.
    _SOURCE_LAYOUTS = {
        'test.csv': _ID_COLUMN + _CORE_COLUMNS
        , 'train.csv': _ID_COLUMN + _CORE_COLUMNS + _LABEL_COLUMN
        , 'Customer_Churn_Records.csv': _STANDARD_LAYOUT + _EXTRA_COLUMNS
        , 'Bank_Customer_Churn_Prediction.csv': _BANK_COLUMNS
        , 'Churn_Modeling.csv': _STANDARD_LAYOUT
        , 'Churn_Modelling.csv': _STANDARD_LAYOUT
        , 'Churn_Modelling-1.csv': _STANDARD_LAYOUT
        , 'churn.csv': _STANDARD_LAYOUT
    }

    def __init__(
            self
            , spark:SparkSession
        ) -> None:
        """Store the session and the input locations; no data is read yet.

        Args:
            spark: Spark session used for all reads and DataFrame creation.
        """
        self.spark = spark
        self.file_paths:dict = {
            'test.csv':"/FileStore/tables/test.csv"
            , 'train.csv':"/FileStore/tables/train.csv"
            , 'Customer_Churn_Records.csv':"/FileStore/tables/Customer_Churn_Records.csv"
            , 'Bank_Customer_Churn_Prediction.csv':"/FileStore/tables/Bank_Customer_Churn_Prediction.csv"
            , 'Churn_Modeling.csv':"/FileStore/tables/Churn_Modeling.csv"
            , 'Churn_Modelling.csv':"/FileStore/tables/Churn_Modelling.csv"
            , 'Churn_Modelling-1.csv':"/FileStore/tables/Churn_Modelling-1.csv"
            , 'churn.csv':"/FileStore/tables/churn.csv"
        }
        self.spark_dataframes = None
        self.joined_df = None
        self.target_df = None

    def run(self) -> None:
        """Run the extract, transform and load stages in that order."""
        self.extract()
        self.transform()
        self.load()

    def extract(self) -> None:
        """Read every source CSV into ``spark_dataframes``."""
        self._extract()

    def transform(self) -> None:
        """Rename, clean, join and reconcile the extracted sources.

        The steps are order dependent: the target table supplies the schema
        used by the final validation step, and the join relies on the renamed,
        null-free, de-duplicated ``customer_id`` column.
        """
        self._transform_create_target_table()
        self._transform_rename_columns()
        self._transform_remove_null_id()
        self._transform_assert_unique_id()
        self._transform_full_outer_join()
        self._transform_data_validation()

    def load(self) -> None:
        """Persist the result (not implemented yet, currently a no-op).

        TODO: save to a train csv and a test csv, OR save to one csv and let
        the ML pipeline do the train-test split.
        """

    def _extract(self) -> None:
        """Read each file in ``_SOURCE_LAYOUTS`` with its explicit schema.

        Raises:
            KeyError: if ``file_paths`` has no entry for one of the sources.
        """
        self.spark_dataframes = dict()
        for file_name, layout in self._SOURCE_LAYOUTS.items():
            schema = StructType(fields=[
                StructField(source_name, spark_type(), True)
                for source_name, spark_type, _ in layout
            ])
            self.spark_dataframes[file_name] = self.spark.read.csv(
                self.file_paths[file_name]
                , header=True
                , schema=schema
            )

    def _transform_create_target_table(self) -> None:
        """Create the empty ``target_df`` that defines the output schema."""
        # Use the injected session: a bare global ``spark`` only exists when
        # the runtime happens to provide one.
        self.target_df = self.spark.createDataFrame([], schema=StructType(fields=[
            StructField('customer_id', IntegerType(), True)
            , StructField('surname', StringType(), True)
            , StructField('credit_score', IntegerType(), True)
            , StructField('geography', StringType(), True)
            , StructField('gender', StringType(), True)
            , StructField('age', IntegerType(), True)
            , StructField('tenure', IntegerType(), True)
            , StructField('balance', DoubleType(), True)
            , StructField('product_count', IntegerType(), True)
            , StructField('has_creditcard', IntegerType(), True)
            , StructField('active_member', IntegerType(), True)
            , StructField('estimated_salary', DoubleType(), True)
            , StructField('complain', IntegerType(), True)
            , StructField('satisfaction_score', IntegerType(), True)
            , StructField('card_type', StringType(), True)
            , StructField('points_earned', IntegerType(), True)
            , StructField('churn', IntegerType(), True)
        ]))

    def _transform_rename_columns(self) -> None:
        """Rename every source column to ``<target name>_<file suffix>``.

        The file suffix is the file name with ``.`` replaced by ``_``
        (``test.csv`` -> ``test_csv``). The join key is renamed to plain
        ``customer_id`` in every source so the frames can be joined on it.
        Renaming is positional (``toDF``), following the layout of each file.
        """
        for file_name, layout in self._SOURCE_LAYOUTS.items():
            suffix = file_name.replace('.', '_')
            new_column_names = [
                target_name if target_name == self._JOIN_KEY else f'{target_name}_{suffix}'
                for _, _, target_name in layout
            ]
            self.spark_dataframes[file_name] = \
                self.spark_dataframes[file_name].toDF(*new_column_names)

    def _transform_remove_null_id(self) -> None:
        """Drop the rows of every source whose ``customer_id`` is null."""
        self.spark_dataframes = \
            {k:df.filter(df.customer_id.isNotNull()) for k,df in self.spark_dataframes.items()}

    def _transform_assert_unique_id(self) -> None:
        """Keep one row per ``customer_id`` in every source and verify it.

        Raises:
            AssertionError: if a source still has a repeated ``customer_id``
                after de-duplication.
        """
        self.spark_dataframes = \
            {k:df.dropDuplicates(['customer_id']) for k,df in self.spark_dataframes.items()}
        for df in self.spark_dataframes.values():
            dupe_count = df.groupBy('customer_id').count().filter('count > 1').count()
            assert dupe_count == 0, 'duplicates in the df'

    def _transform_full_outer_join(self) -> None:
        """Full-outer-join all sources on ``customer_id`` into ``joined_df``."""
        ordered_df = list(self.spark_dataframes.values())
        self.joined_df = ordered_df[0]
        for df in ordered_df[1:]:
            self.joined_df = self.joined_df.join(df, on='customer_id', how='full')

    def _transform_data_validation(self) -> None:
        """Collapse each joined customer row into one ``target_df`` row.

        For every target column, the values of all source columns feeding it
        are compared, ignoring nulls. If the sources disagree (more than one
        distinct non-null value) the customer is dropped; otherwise the single
        agreed value, or ``None`` when every source is null, is kept.

        Source columns are matched to a target column by exact name or by the
        ``<target name>_`` prefix. A plain substring test is not enough: the
        suffix of ``churn.csv`` puts the text ``churn`` into every column of
        that file, which would make all of them count as ``churn`` labels.
        """
        joined_columns = self.joined_df.columns
        key_position = joined_columns.index(self._JOIN_KEY)

        # For each target column, the positions of the joined columns feeding it.
        source_positions = [
            [
                position for position, joined_name in enumerate(joined_columns)
                if joined_name == target_name or joined_name.startswith(target_name + '_')
            ]
            for target_name in self.target_df.columns
        ]

        # The joined frame is brought to the driver once; reconciling there
        # avoids one Spark job and one union per customer.
        first_row_by_id = dict()
        for row in self.joined_df.collect():
            first_row_by_id.setdefault(row[key_position], row)

        reconciled_rows = []
        for row in first_row_by_id.values():
            record = []
            for positions in source_positions:
                candidates = set(row[position] for position in positions)
                candidates.discard(None)
                if len(candidates) > 1:
                    # Sources contradict each other: reject this customer.
                    record = None
                    break
                record.append(candidates.pop() if candidates else None)
            if record is not None:
                reconciled_rows.append(tuple(record))

        self.target_df = self.target_df.union(self.spark.createDataFrame(
            reconciled_rows
            , schema=self.target_df.schema
        ))

In [0]:
# Scratch notes kept in a string literal rather than in comments: they sketch
# how the columns of the different source files correspond to each other and
# include a few rename/reorder snippets. The variable is not referenced by any
# other code shown in this notebook.
ignore_comment = '''
test = use for prediction (they have no labels)

rest of data merge and run 70:30 split
	train id = RowNumber (i might just drop these columns and reset them tbh)
	18col = same + 4more
	snek_case = total rename (also they only have 12 columns) (missing rownum and surname)
 
BASE 14 COLUMNS
    RowNumber (id - train.csv, not in snekcase)
    CustomerId
    Surname (not in snekcase)
    CreditScore
    Geography (country)
    Gender
    Age
    Tenure
    Balance
    NumOfProducts (products_number)
    HasCrCard (credit_card)
    IsActiveMember (active_member)
    EstimatedSalary
    Exited (churn)



df_renamed = df.withColumnRenamed("age", "user_age")
columns = ["name", "age", "city"]
df_reordered = df.select("city", "name", "age")

new_column_names = [
    "CustomerId"
    , "CreditScore"
    , "Geography"
    , "Gender"
    , "Age"
    , "Tenure"
    , "Balance"
    , "NumOfProducts"
    , "HasCrCard"
    , "IsActiveMember"
    , "EstimatedSalary"
    , "Exited"
]
df_renamed = df.toDF(*new_column_names)
'''

In [0]:
def main() -> None:
    """Obtain a Spark session, run the whole pipeline and report completion."""
    session = (
        SparkSession.builder
        .appName("Data_Pipeline")
        .getOrCreate()
    )
    DataPipeline(session).run()
    print('Script ran without crashing.')

In [0]:
# Entry point: run the pipeline only when this code executes as the main module.
if __name__ == '__main__':
    main()