In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session that runs locally with a single worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .getOrCreate()
)

23/05/09 15:43:35 WARN Utils: Your hostname, wedivv-H110M-S2V resolves to a loopback address: 127.0.1.1; using 192.168.1.44 instead (on interface wlp5s0)
23/05/09 15:43:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/09 15:43:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/09 15:43:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/05/09 15:43:37 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
# Naive read: the file's first line is a title rather than the column header,
# so the reader ends up with a single column named after that title.
(
    spark.read
    .csv("./data/1-Extrato.csv", header=True, sep=";")
    .show(5)
)

+------------------------+
| Extrato Conta Corrente |
+------------------------+
|                  Conta |
|                Período |
|         Data Lançamento|
|              17/03/2023|
|              13/03/2023|
+------------------------+
only showing top 5 rows



In [3]:
from pyspark.sql.functions import monotonically_increasing_id

# Load the raw export as a semicolon-separated CSV, treating the first line
# as the header.
df = (
    spark.read
    .format("csv")
    .options(header="true", sep=";")
    .load("./data/1-Extrato.csv")
)

# Attach a generated "id" column so rows can later be filtered by position.
df_with_id = df.withColumn(colName="id", col=monotonically_increasing_id())

df_with_id.show(5)

+------------------------+---+
| Extrato Conta Corrente | id|
+------------------------+---+
|                  Conta |  0|
|                Período |  1|
|         Data Lançamento|  2|
|              17/03/2023|  3|
|              13/03/2023|  4|
+------------------------+---+
only showing top 5 rows



In [4]:
# Keep only the rows whose generated id is 3 or higher, i.e. drop the three
# preamble rows that show up with ids 0-2 in the preview.
df_filtered = df_with_id.where(df_with_id["id"] >= 3)
df_filtered.show(5)

+------------------------+---+
| Extrato Conta Corrente | id|
+------------------------+---+
|              17/03/2023|  3|
|              13/03/2023|  4|
|              13/03/2023|  5|
|              07/12/2022|  6|
|              06/12/2022|  7|
+------------------------+---+
only showing top 5 rows



`monotonically_increasing_id()` does not guarantee consecutive values. <br>
Using it to filter out the header rows with a condition like `id > 2` does not ensure that the rows you want are the ones selected.

### With RDD

In [5]:
# Read the file as plain text lines so the preamble can be handled manually.
sc = spark.sparkContext
rdd = sc.textFile("./data/1-Extrato.csv")

# Peek at the first lines only: collect() would pull the entire file into the
# driver's memory and dump it in the cell output, while take(n) fetches just
# the lines that are displayed.
rdd.take(20)

[' Extrato Conta Corrente ',
 'Conta ;1000-0',
 'Período ;01/01/2022 a 27/04/2023',
 '',
 'Data Lançamento;Histórico;Descrição;Valor;Saldo',
 '17/03/2023;Pix enviado ;Growth Supplements P A E;-116,25',
 '13/03/2023;Cashback;Google Play;0,45',
 '13/03/2023;Compra de Giftcard;;-15,00',
 '07/12/2022;Estorno;Compra cartão;11,93',
 '06/12/2022;Compra no débito;Uber   *uber   *trip   Sao Paulo     Bra;-11,93',
 '28/11/2022;Compra no débito;Uber   *uber   *trip   Sao Paulo     Bra;-24,97',
 '11/11/2022;Compra no débito;Pezao Bar              Indaiatuba    Bra;-59,25',
 '19/10/2022;Compra no débito;Uber        *trip      Sao Paulo     Bra;-7,77',
 '19/10/2022;Compra no débito;Uber   *uber   *trip   Sao Paulo     Bra;-8,90',
 '11/10/2022;Pix enviado ;Fini Comercializadora Ltda;-31,78',
 '24/09/2022;Compra no débito;Uber   *uber   *trip   Sao Paulo     Bra;-11,68',
 '19/08/2022;Compra no débito;Bullguer Sao Paulo Bra;-63,00',
 '19/08/2022;Estorno;Compra cartão;60,00',
 '19/08/2022;Compra no débi

In [6]:
# Drop the first 5 lines (indexes 0-4): the title, account and period lines,
# the blank line and the column-header row, leaving only the data rows.
# NOTE: despite its name, `header` holds those data rows, not the header line.
header = (
    rdd.zipWithIndex()
    .filter(lambda line_and_index: line_and_index[1] > 4)
    .map(lambda line_and_index: line_and_index[0])
)

# Break each remaining line into its ";"-separated fields.
split_lines = header.map(lambda line: line.split(";"))

In [7]:
# Peek at the first three parsed rows to check how each line was split on ";".
split_lines.take(3)

                                                                                

[['17/03/2023', 'Pix enviado ', 'Growth Supplements P A E', '-116,25'],
 ['13/03/2023', 'Cashback', 'Google Play', '0,45'],
 ['13/03/2023', 'Compra de Giftcard', '', '-15,00']]

In [8]:
from pyspark.sql.types import StructType, StructField, StringType

# One nullable string column for each of the four fields present in the rows.
schema = StructType([
    StructField(column, StringType(), True)
    for column in ("data_lancamento", "historico", "descricao", "valor")
])

# Build the DataFrame straight from the RDD of split lines.
df = spark.createDataFrame(data=split_lines, schema=schema)

df.show(5)


+---------------+------------------+--------------------+-------+
|data_lancamento|         historico|           descricao|  valor|
+---------------+------------------+--------------------+-------+
|     17/03/2023|      Pix enviado |Growth Supplement...|-116,25|
|     13/03/2023|          Cashback|         Google Play|   0,45|
|     13/03/2023|Compra de Giftcard|                    | -15,00|
|     07/12/2022|           Estorno|       Compra cartão|  11,93|
|     06/12/2022|  Compra no débito|Uber   *uber   *t...| -11,93|
+---------------+------------------+--------------------+-------+
only showing top 5 rows



### In the real dataset, one column has no values.
So:

In [9]:
# Re-read the file as plain text lines.
sc = spark.sparkContext
rdd = sc.textFile("./data/1-Extrato.csv")

# Discard the first 5 lines (indexes 0-4), which include the column-header
# row, so that only data rows remain.
# NOTE: despite its name, `header` holds those data rows, not the header line.
header = (
    rdd
    .zipWithIndex()
    .filter(lambda indexed: indexed[1] >= 5)
    .keys()
)

# Split every data row into its ";"-separated fields.
split_lines = header.map(lambda row: row.split(";"))

In [10]:
from pyspark.sql.types import StructType, StructField, StringType

# Target schema: five nullable string columns. The rows previewed earlier in
# the notebook carry only four fields, so "saldo" has to be filled in.
schema = StructType([
    StructField("data_lancamento", StringType(), True),
    StructField("historico", StringType(), True),
    StructField("descricao", StringType(), True),
    StructField("valor", StringType(), True),
    StructField("saldo", StringType(), True)
])

# Computed once here instead of calling len(schema.fields) for every row.
n_columns = len(schema.fields)

# Right-pad short rows with None so each tuple matches the schema width.
# Rows that already have n_columns fields (or more) are left unchanged,
# because multiplying a list by a non-positive number yields an empty list.
# Kept under its own name so `df` is only ever bound to a DataFrame.
padded_rows = split_lines.map(
    lambda fields: tuple(fields + [None] * (n_columns - len(fields)))
)
df = spark.createDataFrame(padded_rows, schema)

df.show(5)


+---------------+------------------+--------------------+-------+-----+
|data_lancamento|         historico|           descricao|  valor|saldo|
+---------------+------------------+--------------------+-------+-----+
|     17/03/2023|      Pix enviado |Growth Supplement...|-116,25| null|
|     13/03/2023|          Cashback|         Google Play|   0,45| null|
|     13/03/2023|Compra de Giftcard|                    | -15,00| null|
|     07/12/2022|           Estorno|       Compra cartão|  11,93| null|
|     06/12/2022|  Compra no débito|Uber   *uber   *t...| -11,93| null|
+---------------+------------------+--------------------+-------+-----+
only showing top 5 rows



DANGER! This will read every line in order to apply the nulls.