# In [0]:
from log import *
from pyspark.sql.types import *
import pandas as pd


def read_csv(source_dir, file_name, delimiter, sparksession):
    """Load one delimited text file through the session's CSV reader.

    The location is ``source_dir + file_name`` (plain concatenation; no path
    separator is inserted). The reader is configured with the header option
    enabled and the given delimiter.

    Args:
        source_dir: Prefix of the file location.
        file_name: Name appended directly to ``source_dir``.
        delimiter: Value handed to the reader's ``delimiter`` option.
        sparksession: Object whose ``read`` attribute supplies the reader.

    Returns:
        Whatever the reader's ``load`` call returns. On any exception the
        error is handed to ``sysError``; if that helper returns normally the
        function falls through and yields ``None``.
    """
    try:
        csv_reader = (
            sparksession.read
            .format('com.databricks.spark.csv')
            .options(header='true', delimiter=delimiter)
        )
        loaded = csv_reader.load(source_dir + file_name)
        syslog('Successfully read data from file : {}'.format(source_dir + file_name))
    except Exception as ex:
        sysError('Failed to read from input csv file :{}'.format(source_dir + file_name), ex)
    else:
        return loaded

def read_multicsv(file_list, delimiter, sparksession):
    """Load several delimited text files through the session's CSV reader.

    Args:
        file_list: Value passed unchanged to the reader's ``load`` call
            (presumably a list of paths; verify against callers).
        delimiter: Value handed to the reader's ``delimiter`` option.
        sparksession: Object whose ``read`` attribute supplies the reader.

    Returns:
        Whatever the reader's ``load`` call returns. On any exception the
        error is handed to ``sysError``; if that helper returns normally the
        function falls through and yields ``None``.
    """
    try:
        csv_reader = (
            sparksession.read
            .format('com.databricks.spark.csv')
            .options(header='true', delimiter=delimiter)
        )
        loaded = csv_reader.load(file_list)
        syslog('Successfully read data from files : {}'.format(file_list))
    except Exception as ex:
        sysError('Failed to read from csv files :{}'.format(file_list), ex)
    else:
        return loaded

#read parquet file
def read_parquet(source_dir, sparksession):
    """Read a parquet location via ``sparksession.read.parquet``.

    Args:
        source_dir: Location passed unchanged to the parquet reader.
        sparksession: Object whose ``read`` attribute supplies the reader.

    Returns:
        The reader's result. On any exception the error is handed to
        ``sysError``; if that helper returns normally the function falls
        through and yields ``None``.
    """
    try:
        parquet_df = sparksession.read.parquet(source_dir)
        syslog('Successfully read data from parquet folder : {}'.format(source_dir))
    except Exception as ex:
        sysError('Failed to read from input parquet folder :{}'.format(source_dir), ex)
    else:
        return parquet_df

#read data from partitioned parquet file
def read_parquet_partition(source_dir, sparksession, partition):
    """Read one partition sub-folder of a parquet location.

    The location read is ``source_dir + "/" + partition``.

    Args:
        source_dir: Base parquet location.
        sparksession: Object whose ``read`` attribute supplies the reader.
        partition: Sub-path appended to ``source_dir`` after a ``/``.

    Returns:
        The reader's result. On any exception the error is handed to
        ``sysError``; if that helper returns normally the function falls
        through and yields ``None``.
    """
    # The log messages used to name only ``source_dir``, which made it
    # impossible to tell which partition was (or failed to be) read. The label
    # is rendered with format() so it is available to the failure message even
    # when the '+' concatenation below is what raised.
    partition_path = '{}/{}'.format(source_dir, partition)
    try:
        df_file_input = sparksession.read.parquet(source_dir + "/" + partition)
        syslog('Successfully read data from parquet folder : {}'.format(partition_path))
        return df_file_input
    except Exception as ex:
        sysError('Failed to read from input parquet folder :{}'.format(partition_path), ex)


#read excel data in pyspark dataframe
def read_excel_file_withoutSheetName(spark, input_bucket, file_name):
    """Read an Excel workbook with pandas and convert it via ``pandas_to_spark``.

    The workbook location is ``input_bucket + file_name`` (plain
    concatenation). No ``sheet_name`` is passed, so pandas' default sheet
    selection applies.

    Args:
        spark: Session object forwarded to ``pandas_to_spark``.
        input_bucket: Prefix of the workbook location.
        file_name: Name appended directly to ``input_bucket``.

    Returns:
        The result of ``pandas_to_spark``. On any exception the error is
        handed to ``sysError``; if that helper returns normally the function
        falls through and yields ``None``.
    """
    # Rendered up front with format(), which accepts any argument types, so
    # the handler can always name the file. The path used to be bound first
    # inside the ``try``; when the concatenation itself failed, the handler
    # raised UnboundLocalError and hid the real error.
    file_path = '{}{}'.format(input_bucket, file_name)
    try:
        pandas_df = pd.read_excel(input_bucket + file_name, engine='openpyxl')
        df_spark_df = pandas_to_spark(pandas_df, spark)
        syslog('Successfully read data from excel file : {}'.format(file_path))
        return df_spark_df
    except Exception as ex:
        sysError('Failed to read from input excel file :{}'.format(file_path), ex)

def read_excel_file_withoutSheetName_fillna(spark, input_bucket, file_name):
    """Read an Excel workbook, blank out missing cells, and convert it.

    The workbook location is ``input_bucket + file_name`` (plain
    concatenation). No ``sheet_name`` is passed, so pandas' default sheet
    selection applies. Missing values are replaced with ``''`` before the
    frame is handed to ``pandas_to_spark``.

    Args:
        spark: Session object forwarded to ``pandas_to_spark``.
        input_bucket: Prefix of the workbook location.
        file_name: Name appended directly to ``input_bucket``.

    Returns:
        The result of ``pandas_to_spark``. On any exception the error is
        handed to ``sysError``; if that helper returns normally the function
        falls through and yields ``None``.
    """
    # Rendered up front with format(), which accepts any argument types, so
    # the handler can always name the file. The path used to be bound first
    # inside the ``try``; when the concatenation itself failed, the handler
    # raised UnboundLocalError and hid the real error.
    file_path = '{}{}'.format(input_bucket, file_name)
    try:
        pandas_df = pd.read_excel(input_bucket + file_name, engine='openpyxl').fillna('')
        df_spark_df = pandas_to_spark(pandas_df, spark)
        syslog('Successfully read data from excel file : {}'.format(file_path))
        return df_spark_df
    except Exception as ex:
        sysError('Failed to read from input excel file :{}'.format(file_path), ex)

#read excel data in pyspark dataframe
def read_excel_withfillna(spark, input_bucket, file_name, sheet_name):
    """Read one sheet of an Excel workbook, blank out missing cells, convert it.

    The workbook location is ``input_bucket + file_name`` (plain
    concatenation). Missing values are replaced with ``''`` before the frame
    is handed to ``pandas_to_spark``.

    Args:
        spark: Session object forwarded to ``pandas_to_spark``.
        input_bucket: Prefix of the workbook location.
        file_name: Name appended directly to ``input_bucket``.
        sheet_name: Forwarded to ``pd.read_excel`` as ``sheet_name``.

    Returns:
        The result of ``pandas_to_spark``. On any exception the error is
        handed to ``sysError``; if that helper returns normally the function
        falls through and yields ``None``.
    """
    # Rendered up front with format(), which accepts any argument types, so
    # the handler can always name the file. The path used to be bound first
    # inside the ``try``; when the concatenation itself failed, the handler
    # raised UnboundLocalError and hid the real error.
    file_path = '{}{}'.format(input_bucket, file_name)
    try:
        pandas_df = pd.read_excel(
            input_bucket + file_name,
            sheet_name=sheet_name,
            engine='openpyxl',
        ).fillna('')
        df_spark_df = pandas_to_spark(pandas_df, spark)

        syslog('Successfully read data from excel file : {}'.format(file_path))
        return df_spark_df
    except Exception as ex:
        sysError('Failed to read from input excel file :{}'.format(file_path), ex)

#Updated version of read_excel_file method which reduces number of input parameters
def read_excel(sparksession, input_bucket, file_name, sheet_name):
    """Read one sheet of an Excel workbook and convert it via ``pandas_to_spark``.

    The workbook location is ``input_bucket + file_name`` (plain
    concatenation).

    Args:
        sparksession: Session object forwarded to ``pandas_to_spark``.
        input_bucket: Prefix of the workbook location.
        file_name: Name appended directly to ``input_bucket``.
        sheet_name: Forwarded to ``pd.read_excel`` as ``sheet_name``.

    Returns:
        The result of ``pandas_to_spark``. On any exception the error is
        handed to ``sysError``; if that helper returns normally the function
        falls through and yields ``None``.
    """
    # Rendered up front with format(), which accepts any argument types, so
    # the handler can always name the file. The path used to be bound first
    # inside the ``try``; when the concatenation itself failed, the handler
    # raised UnboundLocalError and hid the real error.
    source_dir = '{}{}'.format(input_bucket, file_name)
    try:
        pandas_df = pd.read_excel(input_bucket + file_name, sheet_name=sheet_name, engine='openpyxl')
        df_spark_df = pandas_to_spark(pandas_df, sparksession)
        syslog('Successfully read data from source folder : {}'.format(source_dir))
        return df_spark_df
    except Exception as ex:
        sysError('Failed to read from input source folder :{}'.format(source_dir), ex)

def pandas_to_spark(pandas_df, spark):
    """Convert a pandas DataFrame using a schema derived from its dtypes.

    One field is built per column by ``define_structure`` from the column
    name and its pandas dtype; the fields are wrapped in a ``StructType`` and
    passed, together with the frame, to ``spark.createDataFrame``.

    Args:
        pandas_df: Frame providing ``columns`` and ``dtypes``.
        spark: Object providing ``createDataFrame``.

    Returns:
        The result of ``spark.createDataFrame(pandas_df, schema)``.
    """
    column_names = list(pandas_df.columns)
    # NOTE(review): looks like leftover debug output - confirm before removing.
    print(column_names)
    column_dtypes = list(pandas_df.dtypes)
    fields = [
        define_structure(name, dtype)
        for name, dtype in zip(column_names, column_dtypes)
    ]
    schema = StructType(fields)
    return spark.createDataFrame(pandas_df, schema)

def define_structure(string, format_type):
    """Build the ``StructField`` describing one column.

    Args:
        string: Column name used as the field name.
        format_type: Column dtype, translated by ``equivalent_type``.

    Returns:
        ``StructField(string, <mapped type>)``. If the dtype translation
        raises, the field type falls back to ``StringType()``.
    """
    try:
        typo = equivalent_type(format_type)
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt / SystemExit
        # are no longer swallowed; ordinary mapping errors still fall back to
        # a string column.
        typo = StringType()
    return StructField(string, typo)

# Auxiliary functions
def equivalent_type(f):
    """Map a dtype to a Spark SQL type instance.

    ``f`` is compared with ``==`` against the dtype names below, in order;
    the first match wins and anything unmatched becomes ``StringType()``.

    Args:
        f: Dtype to translate (``pandas_to_spark`` supplies the entries of
            ``pandas_df.dtypes``).

    Returns:
        A new instance of the matching Spark type.
    """
    # Kept as ordered ``==`` checks rather than a dict lookup: the caller
    # passes dtype objects, which compare equal to these names but need not
    # hash like them.
    # NOTE(review): 'float64' maps to FloatType, which is Spark's
    # single-precision type; DoubleType would keep float64 precision. Left
    # unchanged because downstream schemas may rely on it - confirm.
    name_to_type = (
        ('datetime64[ns]', TimestampType),
        ('int64', LongType),
        ('int32', IntegerType),
        ('float64', FloatType),
    )
    for dtype_name, spark_type in name_to_type:
        if f == dtype_name:
            return spark_type()
    return StringType()

def read_excel_wtalrt(sparksession, input_bucket, file_name, sheet_name, alert_var):
    """Read one sheet of an Excel workbook, reporting failures with an alert.

    Works like ``read_excel`` but hands failures to ``syserror_alert``
    together with ``alert_var`` instead of calling ``sysError``.

    Args:
        sparksession: Session object forwarded to ``pandas_to_spark``.
        input_bucket: Prefix of the workbook location.
        file_name: Name appended directly to ``input_bucket``.
        sheet_name: Forwarded to ``pd.read_excel`` as ``sheet_name``.
        alert_var: Passed through unchanged to ``syserror_alert``.

    Returns:
        The result of ``pandas_to_spark``. If ``syserror_alert`` returns
        normally after a failure, the function falls through and yields
        ``None``.
    """
    try:
        workbook_path = input_bucket + file_name
        sheet_frame = pd.read_excel(workbook_path, sheet_name=sheet_name, engine='openpyxl')
        converted = pandas_to_spark(sheet_frame, sparksession)
        syslog('Successfully read data from source folder : {}'.format(workbook_path))
    except Exception as ex:
        syserror_alert("Failed to read from input source folder", ex, alert_var)
    else:
        return converted